In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertModel, BertConfig, DistilBertModel
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import datasets
import numpy as np
import pandas as pd
import data
from copy import deepcopy
from spacy.lang.en import English
from collections import defaultdict

2022-07-07 12:14:08.340302: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-07 12:14:08.340347: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
  return torch._C._cuda_getDeviceCount() > 0


# look at some dsets

In [None]:
dataset = datasets.load_dataset('imdb')
dataset

In [3]:
dataset['validation'] = dataset['test']

In [25]:
# del dataset['test']
# dataset['train'] = dataset['train'].select(range(1000))

In [None]:
np.unique(dataset['test']['label']) # weird error -- all test labels seem to be -1

**emotion**

In [None]:
dataset = datasets.load_dataset('emotion')
dataset

In [None]:
dataset['train']['label'][-200:]

In [None]:
dataset['train']['text']

**rotten_tomatoes**

In [None]:
dataset = datasets.load_dataset('rotten_tomatoes')
dataset

**tweet_eval**

In [None]:
dataset = datasets.load_dataset('tweet_eval', 'hate')
dataset

In [None]:
dataset['test']['label']

In [None]:
dataset = datasets.load_dataset('financial_phrasebank', 'sentences_allagree')

# datasets overview

In [3]:
tok_simp = English().tokenizer # init here to speedup call
simple_tokenizer = lambda x: [str(x) for x in tok_simp(x)] 
ds = defaultdict(list)
class Args:
    ...
    
args = Args()
args.dataset = ''
ks = ['emotion', 'rotten_tomatoes', 'sst2', 'tweet_eval']
for k in ks:
    args.dataset = k
    d, args = data.process_data_and_args(args)
    text = d['train'][args.dataset_key_text]
    ds['n_train'].append(len(text))
    
    
    counts = np.unique(d['train']['label'], return_counts=True)[1]
    ds['imbalance'].append(max(counts) / sum(counts))
    
    ds['num_classes'].append(counts.size)
    
    text_val = d['validation'][args.dataset_key_text]
    ds['n_val'].append(len(text_val))    
    
    v = CountVectorizer(tokenizer=simple_tokenizer)
    v.fit(text)
    ds['n_tokens'].append(len(v.vocabulary_))
    
    v = CountVectorizer(tokenizer=simple_tokenizer, ngram_range=(2, 2))
    v.fit(text)
    ds['n_bigrams'].append(len(v.vocabulary_))
    
    v = CountVectorizer(tokenizer=simple_tokenizer, ngram_range=(3, 3))
    v.fit(text)
    ds['n_trigrams'].append(len(v.vocabulary_))    

Using custom data configuration default
Reusing dataset emotion (/tmp/.xdg_cache_vision/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

Using custom data configuration default
Reusing dataset rotten_tomatoes (/tmp/.xdg_cache_vision/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)


  0%|          | 0/3 [00:00<?, ?it/s]

Using custom data configuration default
Reusing dataset sst2 (/tmp/.xdg_cache_vision/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset tweet_eval (/tmp/.xdg_cache_vision/huggingface/datasets/tweet_eval/hate/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
df = pd.DataFrame.from_dict(ds)
df.index = ks
df
df.to_csv('results/datasets_ovw.csv')

In [6]:
df = pd.read_csv('results/datasets_ovw.csv', index_col=0)

In [12]:
def prep_for_printing(df):
    df = df.sort_values('n_train')
    df['num_classes'] = df.pop('num_classes') # move imbalance to end
    df['imbalance'] = df.pop('imbalance') # move imbalance to end
    df = df.infer_objects()
    return df.rename(
        columns=data.COLUMNS_RENAME_DICT,
        index=data.DSETS_RENAME_DICT,
    ).round(2)

pd.options.display.float_format = '{:,}'.format
print(prep_for_printing(df).to_latex())

\begin{tabular}{lrrrrrrr}
\toprule
{} &  Samples (train) &  Samples (val) &  Unigrams &  Bigrams &  Trigrams &  Classes &  Majority class fraction \\
\midrule
Rotten tomatoes &             8530 &           1066 &     16631 &    93921 &    147426 &        2 &                      0.5 \\
Tweet (Hate)    &             9000 &           1000 &     18476 &   106277 &    171769 &        2 &                     0.58 \\
Emotion         &            16000 &           2000 &     15165 &   106201 &    201404 &        6 &                     0.34 \\
SST2            &            67349 &            872 &     13887 &    72501 &    108800 &        2 &                     0.56 \\
\bottomrule
\end{tabular}



# classification

In [75]:
# classifier = pipeline("sentiment-analysis", model='bert-base-uncased')
classifier = pipeline("sentiment-analysis") #, model='bert-base-uncased')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [78]:
N = 150
split = 'validation' # weird error -- all test labels seem to be -1
preds = classifier(dataset[split]['sentence'][:N])
M = {'POSITIVE': 1, 'NEGATIVE': 0}
preds = [M[p['label']] for p in preds]
# print(preds, dataset[split]['label'])
print('Acc', np.mean(np.array(preds) == np.array(dataset[split]['label'])[:N]).round(3))

Acc 0.907


**feature extraction**

In [122]:
feature_extractor = pipeline("feature-extraction",
                             model='distilbert-base-uncased-finetuned-sst-2-english')
feats_list  = feature_extractor(dataset[split]['sentence'][:N])
print([len(feats[x][0]) for x in range(10)])

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[12, 10, 24, 28, 13, 25, 8, 15, 26, 40, 23, 39, 25, 26, 29, 26, 21, 44, 14, 21, 41, 17, 10, 20, 18, 35, 35, 28, 19, 13, 35, 38, 10, 39, 27, 14, 31, 40, 27, 23, 25, 30, 26, 19, 21, 18, 23, 17, 14, 25]


By default, features with different length sequences have different-sized embeddings. To fix this, let's apply padding during embedding step.

In [136]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [157]:
sequences = dataset['train']['sentence'][:10]
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
print('tokens', tokens['input_ids'].shape)
output = model(**tokens) # has two keys, 'last_hidden_state', 'pooler_output'
embs = output['pooler_output'].cpu().detach().numpy()
print('embeddings', embs.shape)

tokens torch.Size([10, 29])
embeddings (10, 768)


**try another model**

In [18]:
# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# model = DistilBertModel.from_pretrained(checkpoint)
checkpoint = 'textattack/bert-base-uncased-SST-2'
model = BertModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Some weights of the model checkpoint at textattack/bert-base-uncased-SST-2 were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [19]:
sequences = dataset['train']['sentence'][:100]
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
print('tokens', tokens['input_ids'].shape)
output = model(**tokens) # has two keys, 'last_hidden_state', 'pooler_output'
# embs = output['pooler_output'].cpu().detach().numpy()
# print('embeddings', embs.shape)

tokens torch.Size([100, 51])


In [20]:
output.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [22]:
output['pooler_output'].shape

torch.Size([100, 768])

# custom feature extraction
let's map each sequence to the sum of features obtained by each token (tutorial [here](https://huggingface.co/course/chapter3/2?fw=pt)).

In [158]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [174]:
def featurize_function(example):
    tokens = tokenizer(example['sentence'], padding=True, truncation=True, return_tensors="pt")
    output = model(**tokens) # has two keys, 'last_hidden_state', 'pooler_output'
    embs = output['pooler_output'].cpu().detach().numpy()
    return {'embs': embs}

In [175]:
dset_small = dataset['validation'] #[:10]
dset_small = dset_small.filter(lambda x: len(x["sentence"]) < 20)
dset_small

Loading cached processed dataset at /tmp/.xdg_cache_vision/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5/cache-7876086904868ef1.arrow


Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 8
})

In [176]:
featurized_dataset = dset_small.map(featurize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [184]:
np.array(featurized_dataset['embs']).shape

(8, 768)