In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertModel, BertConfig, DistilBertModel
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import datasets
import numpy as np
import pandas as pd
import data
from copy import deepcopy
from spacy.lang.en import English
from collections import defaultdict

# look at some dsets

In [4]:
import imodelsx.data

# datasets overview

In [27]:
tok_simp = English().tokenizer  # init here to speedup call
simple_tokenizer = lambda x: [str(x) for x in tok_simp(x)]
ds = defaultdict(list)
# ks = sorted(['emotion', 'financial_phrasebank', 'rotten_tomatoes', 'sst2', 'tweet_eval'])
ks = ["financial_phrasebank", 'emotion']

for k in ks:
    d, dataset_key_text = imodelsx.data.load_huggingface_dataset(
        dataset_name=k
    )

    text = d["train"][dataset_key_text]
    ds["n_train"].append(len(text))

    counts = np.unique(d["train"]["label"], return_counts=True)[1]
    ds["imbalance"].append(max(counts) / sum(counts))

    ds["num_classes"].append(counts.size)

    text_val = d["validation"][dataset_key_text]
    ds["n_val"].append(len(text_val))

    v = CountVectorizer(tokenizer=simple_tokenizer)
    v.fit(text)
    ds["n_tokens"].append(len(v.vocabulary_))
    # count unseen tokens in validation set
    ds["n_tokens_unseen"].append(
        len(
            set(v.vocabulary_.keys())
            - set(
                CountVectorizer(tokenizer=simple_tokenizer)
                .fit(text_val)
                .vocabulary_.keys()
            )
        )
    )

    v = CountVectorizer(tokenizer=simple_tokenizer, ngram_range=(2, 2))
    v.fit(text)
    ds["n_bigrams"].append(len(v.vocabulary_))
    # count unseen bigrams in validation set
    ds["n_bigrams_unseen"].append(
        len(
            set(v.vocabulary_.keys())
            - set(
                CountVectorizer(tokenizer=simple_tokenizer, ngram_range=(2, 2))
                .fit(text_val)
                .vocabulary_.keys()
            )
        )
    )

    v = CountVectorizer(tokenizer=simple_tokenizer, ngram_range=(3, 3))
    v.fit(text)
    ds["n_trigrams"].append(len(v.vocabulary_))
    # count unseen trigrams in validation set
    ds["n_trigrams_unseen"].append(
        len(
            set(v.vocabulary_.keys())
            - set(
                CountVectorizer(tokenizer=simple_tokenizer, ngram_range=(3, 3))
                .fit(text_val)
                .vocabulary_.keys()
            )
        )
    )



  0%|          | 0/3 [00:00<?, ?it/s]

In [28]:
df = pd.DataFrame.from_dict(ds)
df.index = ks
df
df.to_csv('../../results/datasets_ovw.csv')

In [29]:
df = pd.read_csv('../../results/datasets_ovw.csv', index_col=0)

In [30]:
df.T.round()

Unnamed: 0,financial_phrasebank,emotion
n_train,2313.0,16000.0
imbalance,1.0,0.0
num_classes,3.0,6.0
n_val,1140.0,2000.0
n_tokens,7169.0,15165.0
n_tokens_unseen,4260.0,11384.0
n_bigrams,28481.0,106201.0
n_bigrams_unseen,22878.0,95466.0
n_trigrams,39597.0,201404.0
n_trigrams_unseen,35920.0,193555.0


In [31]:
for k in ['n_tokens', 'n_bigrams', 'n_trigrams']:
    df[k + '_tot'] = df[k] + df[k + '_unseen']

In [34]:
df.T.round(0)

Unnamed: 0,financial_phrasebank,emotion
n_train,2313.0,16000.0
imbalance,1.0,0.0
num_classes,3.0,6.0
n_val,1140.0,2000.0
n_tokens,7169.0,15165.0
n_tokens_unseen,4260.0,11384.0
n_bigrams,28481.0,106201.0
n_bigrams_unseen,22878.0,95466.0
n_trigrams,39597.0,201404.0
n_trigrams_unseen,35920.0,193555.0


In [22]:
def prep_for_printing(df):
    df = df.sort_values('n_train')
    df['num_classes'] = df.pop('num_classes') # move imbalance to end
    df['imbalance'] = df.pop('imbalance') # move imbalance to end
    df = df.infer_objects()
    df = df.rename(
        columns=data.COLUMNS_RENAME_DICT,
        index=data.DSETS_RENAME_DICT,
    ).round(2).sort_index()
    return df

pd.options.display.float_format = '{:,}'.format
print(prep_for_printing(df).to_latex())

\begin{tabular}{lrrrrrrr}
\toprule
{} &  Samples (train) &  Samples (val) &  Unigrams &  Bigrams &  Trigrams &  Classes &  Majority class fraction \\
\midrule
Emotion              &            16000 &           2000 &     15165 &   106201 &    201404 &        6 &                     0.34 \\
Financial phrasebank &             2313 &           1140 &      7169 &    28481 &     39597 &        3 &                     0.62 \\
Rotten tomatoes      &             8530 &           1066 &     16631 &    93921 &    147426 &        2 &                      0.5 \\
SST2                 &            67349 &            872 &     13887 &    72501 &    108800 &        2 &                     0.56 \\
Tweet (Hate)         &             9000 &           1000 &     18476 &   106277 &    171769 &        2 &                     0.58 \\
\bottomrule
\end{tabular}



# classification

In [75]:
# classifier = pipeline("sentiment-analysis", model='bert-base-uncased')
classifier = pipeline("sentiment-analysis") #, model='bert-base-uncased')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [78]:
N = 150
split = 'validation' # weird error -- all test labels seem to be -1
preds = classifier(dataset[split]['sentence'][:N])
M = {'POSITIVE': 1, 'NEGATIVE': 0}
preds = [M[p['label']] for p in preds]
# print(preds, dataset[split]['label'])
print('Acc', np.mean(np.array(preds) == np.array(dataset[split]['label'])[:N]).round(3))

Acc 0.907


**feature extraction**

In [122]:
feature_extractor = pipeline("feature-extraction",
                             model='distilbert-base-uncased-finetuned-sst-2-english')
feats_list  = feature_extractor(dataset[split]['sentence'][:N])
print([len(feats[x][0]) for x in range(10)])

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[12, 10, 24, 28, 13, 25, 8, 15, 26, 40, 23, 39, 25, 26, 29, 26, 21, 44, 14, 21, 41, 17, 10, 20, 18, 35, 35, 28, 19, 13, 35, 38, 10, 39, 27, 14, 31, 40, 27, 23, 25, 30, 26, 19, 21, 18, 23, 17, 14, 25]


By default, features with different length sequences have different-sized embeddings. To fix this, let's apply padding during embedding step.

In [136]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [157]:
sequences = dataset['train']['sentence'][:10]
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
print('tokens', tokens['input_ids'].shape)
output = model(**tokens) # has two keys, 'last_hidden_state', 'pooler_output'
embs = output['pooler_output'].cpu().detach().numpy()
print('embeddings', embs.shape)

tokens torch.Size([10, 29])
embeddings (10, 768)


**try another model**

In [18]:
# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# model = DistilBertModel.from_pretrained(checkpoint)
checkpoint = 'textattack/bert-base-uncased-SST-2'
model = BertModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Some weights of the model checkpoint at textattack/bert-base-uncased-SST-2 were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [19]:
sequences = dataset['train']['sentence'][:100]
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
print('tokens', tokens['input_ids'].shape)
output = model(**tokens) # has two keys, 'last_hidden_state', 'pooler_output'
# embs = output['pooler_output'].cpu().detach().numpy()
# print('embeddings', embs.shape)

tokens torch.Size([100, 51])


In [20]:
output.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [22]:
output['pooler_output'].shape

torch.Size([100, 768])

# custom feature extraction
let's map each sequence to the sum of features obtained by each token (tutorial [here](https://huggingface.co/course/chapter3/2?fw=pt)).

In [158]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [174]:
def featurize_function(example):
    tokens = tokenizer(example['sentence'], padding=True, truncation=True, return_tensors="pt")
    output = model(**tokens) # has two keys, 'last_hidden_state', 'pooler_output'
    embs = output['pooler_output'].cpu().detach().numpy()
    return {'embs': embs}

In [175]:
dset_small = dataset['validation'] #[:10]
dset_small = dset_small.filter(lambda x: len(x["sentence"]) < 20)
dset_small

Loading cached processed dataset at /tmp/.xdg_cache_vision/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5/cache-7876086904868ef1.arrow


Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 8
})

In [176]:
featurized_dataset = dset_small.map(featurize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [184]:
np.array(featurized_dataset['embs']).shape

(8, 768)