In [1]:
%load_ext autoreload
%autoreload 2
import os
import pickle as pkl
from collections import defaultdict
from copy import deepcopy

import datasets
import numpy as np
import pandas as pd
import sklearn
from datasets import load_from_disk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from spacy.lang.en import English
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertModel, BertConfig
from transformers import pipeline

This notebook fits bag-of-n-grams logistic-regression baselines on SST-2, loosely following [this tutorial](https://amueller.github.io/aml/05-advanced-topics/13-text-data.html).

In [2]:
dataset = datasets.load_dataset('sst2')
nlp = English()


def simple_tokenizer(text):
    """Split ``text`` into a list of token strings with the spaCy English tokenizer.

    Passed as the ``tokenizer`` of the vectorizers (for our word-finding).
    Defined as a named function rather than a lambda so that objects holding a
    reference to it can be pickled, and so the loop variable no longer shadows
    the argument.
    """
    return [str(token) for token in nlp.tokenizer(text)]

Using custom data configuration default
Reusing dataset sst2 (/tmp/.xdg_cache_vision/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def transform(vectorizer, dataset):
    """Vectorize the 'train' and 'validation' sentences with a fitted vectorizer.

    Parameters
    ----------
    vectorizer
        Object exposing ``transform(list_of_sentences)``; it is not fitted here.
    dataset
        Mapping of split name to columns; the 'sentence' column of the 'train'
        and 'validation' splits is read. The 'test' split is not transformed.

    Returns
    -------
    tuple
        ``(X_train, X_val)``, in that order.
    """
    return tuple(
        vectorizer.transform(dataset[split]['sentence'])
        for split in ('train', 'validation')
    )


def fit_and_score(X_train, X_val, dataset, r):
    """Fit a cross-validated logistic regression and record its accuracy.

    Parameters
    ----------
    X_train, X_val
        Feature matrices for the 'train' and 'validation' splits, row-aligned
        with ``dataset['train']['label']`` and ``dataset['validation']['label']``.
    dataset
        Mapping of split name to columns; only the 'label' column of the
        'train' and 'validation' splits is read.
    r
        Results accumulator, mutated in place: one entry is appended to each of
        ``r['model']``, ``r['acc_train']`` and ``r['acc_val']``.

    Returns
    -------
    None
        Results are reported by printing and by appending to ``r``.
    """
    # Read each label column once instead of once per fit/score call.
    y_train = dataset['train']['label']
    y_val = dataset['validation']['label']

    # model
    m = LogisticRegressionCV()
    m.fit(X_train, y_train)
    r['model'].append(deepcopy(m))

    # performance
    acc_train = m.score(X_train, y_train)
    acc_val = m.score(X_val, y_val)
    # Built-in round() works whether score() returns a plain Python float or a
    # NumPy scalar; the `.round` method only exists on the latter.
    print('\ttrain', round(acc_train, 3))
    print('\tval', round(acc_val, 3))
    r['acc_train'].append(acc_train)
    r['acc_val'].append(acc_val)

# Sweep bag-of-n-grams featurizers: for every maximum n-gram length, fit a
# raw-count and a TF-IDF vectorizer on the training sentences, then fit and
# score a classifier on the resulting features.
MAX_NGRAM = 5
RESULTS_DIR = 'results'
# Create the output directory up front; otherwise the first pkl.dump would fail
# only after a full vectorizer + model fit.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Order matters for the r_{i}.pkl numbering: (Count, Tfidf) for n=1, then n=2, ...
vectorizers = [
    vectorizer_cls(tokenizer=simple_tokenizer, ngram_range=(1, max_n))
    for max_n in range(1, MAX_NGRAM + 1)
    for vectorizer_cls in (CountVectorizer, TfidfVectorizer)
]

r = defaultdict(list)
for i, vectorizer in enumerate(vectorizers):
    print(vectorizer)
    vectorizer.fit(dataset['train']['sentence'])
    # transform() returns exactly two matrices (train, validation).
    X_train, X_val = transform(vectorizer, dataset)
    print('\tshapes', X_train.shape, X_val.shape)
    print('\tvocab ex.', list(vectorizer.vocabulary_.keys())[:10])
    # NOTE(review): the saved results table in this notebook labels this column
    # 'vectorizer_type' -- confirm which key name downstream code expects.
    r['model_type'].append(str(type(vectorizer)))
    r['ngram_range'].append(vectorizer.ngram_range)
    r['num_features'].append(X_train.shape[1])

    fit_and_score(X_train, X_val, dataset, r)

    # Checkpoint: each file holds the cumulative results up to and including run i.
    with open(os.path.join(RESULTS_DIR, f'r_{i}.pkl'), 'wb') as f:
        pkl.dump(r, f)

In [10]:
# One row per fitted vectorizer/model pair, ordered by n-gram range.
df = (
    pd.DataFrame.from_dict(r)
    .sort_values(by='ngram_range')
)
df

Unnamed: 0,vectorizer_type,ngram_range,num_features,model,acc_train,acc_val
0,<class 'sklearn.feature_extraction.text.CountV...,"(1, 1)",13887,LogisticRegressionCV(),0.963281,0.807339
1,<class 'sklearn.feature_extraction.text.TfidfV...,"(1, 1)",13887,LogisticRegressionCV(),0.958856,0.811927
2,<class 'sklearn.feature_extraction.text.CountV...,"(1, 2)",86388,LogisticRegressionCV(),0.991834,0.821101
3,<class 'sklearn.feature_extraction.text.TfidfV...,"(1, 2)",86388,LogisticRegressionCV(),0.992398,0.825688
4,<class 'sklearn.feature_extraction.text.CountV...,"(1, 3)",195188,LogisticRegressionCV(),0.997639,0.823394
5,<class 'sklearn.feature_extraction.text.TfidfV...,"(1, 3)",195188,LogisticRegressionCV(),0.995382,0.831422
6,<class 'sklearn.feature_extraction.text.CountV...,"(1, 4)",309212,LogisticRegressionCV(),0.998916,0.823394
7,<class 'sklearn.feature_extraction.text.TfidfV...,"(1, 4)",309212,LogisticRegressionCV(),0.996436,0.837156
8,<class 'sklearn.feature_extraction.text.CountV...,"(1, 5)",419166,LogisticRegressionCV(),0.999079,0.817661
9,<class 'sklearn.feature_extraction.text.TfidfV...,"(1, 5)",419166,LogisticRegressionCV(),0.996971,0.829128


In [14]:
# Regularization strength C picked by cross-validation for each fitted model.
df['model'].apply(lambda fitted_model: fitted_model.C_)

0    [21.54434690031882]
1    [21.54434690031882]
2    [2.782559402207126]
3    [21.54434690031882]
4    [2.782559402207126]
5    [21.54434690031882]
6    [2.782559402207126]
7    [21.54434690031882]
8    [2.782559402207126]
9    [21.54434690031882]
Name: model, dtype: object

In [13]:
# Grid of candidate C values searched by the first model in the table.
df['model'].iloc[0].Cs_

array([1.00000000e-04, 7.74263683e-04, 5.99484250e-03, 4.64158883e-02,
       3.59381366e-01, 2.78255940e+00, 2.15443469e+01, 1.66810054e+02,
       1.29154967e+03, 1.00000000e+04])