# Convert SST-2 to fastText format and fit a fastText baseline

In [14]:
%load_ext autoreload
%autoreload 2
from transformers import BertModel, DistilBertModel
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import datasets
import numpy as np
from datasets import load_from_disk
import pickle as pkl
import pandas as pd
from tqdm import tqdm
from sklearn.linear_model import LogisticRegressionCV
from collections import defaultdict
from copy import deepcopy

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# prepare data for fasttext

In [4]:
# Load SST-2 through the `datasets` library; the bare name on the last line
# displays the loaded splits as the cell output.
dataset = datasets.load_dataset('sst2')
dataset

Using custom data configuration default
Reusing dataset sst2 (/tmp/.xdg_cache_vision/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [31]:
import os

# Export SST-2 splits as fastText supervised-training files: one example per
# line, formatted as `__label__<label> <sentence>`.
# open(..., 'w') does not create missing directories, so make sure it exists.
os.makedirs('data/sst2-fasttext', exist_ok=True)
for split in ['train', 'validation']:
    for subsample in [100, 1000, -1]:
        d = dataset[split]
        # -1 is the sentinel for "use the full split". Only a positive size may
        # be used as a slice bound: -1 is truthy, and `d[:-1]` would silently
        # drop the last training example.
        # Subsampling applies to the train split only; validation is written in
        # full under each of the three file names.
        if subsample > 0 and split == 'train':
            d = d[:subsample]
        vals = ('__label__' + pd.Series(d['label']).astype(str) + ' ' + pd.Series(d['sentence']).astype(str)).values
        s = '\n'.join(vals)
        # Write UTF-8 explicitly rather than relying on the platform default encoding.
        with open(f'data/sst2-fasttext/{split}_sst_{subsample}_sst.txt', 'w', encoding='utf-8') as f:
            f.write(s)

# Fit and evaluate the fastText model

In [34]:
import fasttext

In [79]:
# Train a supervised fastText classifier with the library's default settings
# on the train file that was exported with subsample=-1.
m = fasttext.train_supervised('data/sst2-fasttext/train_sst_-1_sst.txt')

Read 0M words
Number of words:  14817
Number of labels: 2
Progress: 100.0% words/sec/thread: 2131660 lr:  0.000000 avg.loss:  0.273194 ETA:   0h 0m 0s


In [80]:
# Sanity check only: this file was exported from the train split, so the
# score is not a held-out estimate.
# NOTE(review): the returned tuple is presumably (n_examples, precision@1,
# recall@1) — confirm against the fastText Python docs.
m.test('data/sst2-fasttext/train_sst_100_sst.txt')

(100, 0.97, 0.97)

In [81]:
def pred(x):
    """Return the model's top prediction for the text `x` as an int (0 or 1).

    The result is 1 when the first label returned by `m.predict(x)` contains
    the character '1', and 0 otherwise. Uses the module-level model `m`.
    """
    predicted_labels = m.predict(x)[0]
    top_label = predicted_labels[0]
    # Substring check on the label string (e.g. '__label__1').
    return int('1' in top_label)

In [82]:
# Predict every validation sentence, then report the fraction of predictions
# that match the gold labels (the cell's displayed value).
preds = np.array([pred(sentence) for sentence in dataset['validation']['sentence']]).astype(int)
labels = np.array(dataset['validation']['label']).astype(int)
(preds == labels).mean()

0.8211009174311926

In [83]:
# Display the raw 0/1 validation predictions for a quick visual check.
preds

array([1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,