# convert data to fasttext format

In [21]:
%load_ext autoreload
%autoreload 2
from transformers import BertModel, DistilBertModel
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import datasets
import numpy as np
from datasets import load_from_disk
import pickle as pkl
import pandas as pd
from tqdm import tqdm
from sklearn.linear_model import LogisticRegressionCV
from collections import defaultdict
import os
from os.path import join as oj
from copy import deepcopy
import fasttext
import config

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# prepare data for fasttext

In [3]:
# Export SST-2 to fastText's supervised text format: one example per line,
# written as "__label__<label> <sentence>".
dataset = datasets.load_dataset('sst2')
out_dir = 'data/sst2-fasttext'
os.makedirs(out_dir, exist_ok=True)  # open(..., 'w') does not create missing folders
for split in ['train', 'validation']:
    for subsample in [100, 1000, -1]:
        d = dataset[split]
        # -1 is the "use every example" sentinel. Only positive sizes are
        # sliced: a plain truthiness check lets -1 through, and d[:-1] would
        # silently drop the last training example.
        # The validation split is never subsampled, so its three files hold
        # the same content under different names.
        if subsample > 0 and split == 'train':
            d = d[:subsample]
        vals = (
            '__label__' + pd.Series(d['label']).astype(str)
            + ' ' + pd.Series(d['sentence']).astype(str)
        ).values
        s = '\n'.join(vals)
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        with open(f'{out_dir}/{split}_sst_{subsample}_sst.txt', 'w', encoding='utf-8') as f:
            f.write(s)

Using custom data configuration default
Reusing dataset sst2 (/tmp/.xdg_cache_vision/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

# fit fasttext model

In [None]:
def pred(x):
    """Classify one piece of text with the module-level model `m`.

    Takes element ``[0][0]`` of ``m.predict(x)`` (presumably the top-ranked
    label string, e.g. ``'__label__1'`` -- confirm against the fastText API)
    and returns 1 if it contains the character ``'1'``, otherwise 0.

    `m` is looked up as a global at call time, so it must be assigned before
    this function is called.
    """
    top_label = m.predict(x)[0][0]
    return int('1' in top_label)


# Train one fastText classifier per (training-file size, max word n-gram
# length) pair and pickle its train/validation accuracy.
results_dir = oj(config.results_dir, 'sst2')

# The evaluation data is the same for every configuration, so extract it from
# the dataset once instead of on each of the 27 iterations.
# Note: accuracy is always measured on the full train split, even when the
# model was fit on a subsampled training file.
train_sentences = dataset['train']['sentence']
train_labels = np.array(dataset['train']['label']).astype(int)
val_sentences = dataset['validation']['sentence']
val_labels = np.array(dataset['validation']['label']).astype(int)

for subsample in tqdm([100, 1000, -1]):
    for ngrams in range(1, 10):
        r = {
            'checkpoint': 'fasttext',
            'ngrams': ngrams,
            'subsample': subsample,
            'all': 'all',
        }

        # saving
        dir_name = f"ngram={ngrams}_" + 'sub=' + str(subsample) + '_' + r['checkpoint'] + '-all'
        save_dir = oj(results_dir, dir_name)
        if os.path.exists(save_dir):
            print('aready ran', save_dir)
            # Skip finished configurations; without this the message was
            # printed but the model was retrained and the results overwritten.
            continue

        # `m` has to remain a module-level name: pred() reads it as a global.
        m = fasttext.train_supervised(f'data/sst2-fasttext/train_sst_{subsample}_sst.txt',
                                      wordNgrams=ngrams)

        preds = np.array(list(map(pred, train_sentences))).astype(int)
        r['acc_train'] = np.mean(preds == train_labels)

        preds = np.array(list(map(pred, val_sentences))).astype(int)
        r['acc_val'] = np.mean(preds == val_labels)
        r['num_features'] = len(m.words)

        # The directory is only created once results exist, so its presence
        # is what marks a configuration as done for the check above.
        os.makedirs(save_dir, exist_ok=True)
        with open(oj(save_dir, 'results.pkl'), 'wb') as f:
            pkl.dump(r, f)