In [24]:
%load_ext autoreload
%autoreload 2
from transformers import BertModel, DistilBertModel
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import datasets
import numpy as np
import os.path
from spacy.lang.en import English
from datasets import load_from_disk
import pickle as pkl
from sklearn.linear_model import LogisticRegressionCV
from collections import defaultdict
from copy import deepcopy
from tqdm import tqdm
import dvu
dvu.set_style()
import pandas as pd
from os.path import join as oj
import string
import data
import matplotlib.pyplot as plt
import seaborn as sns
import config
pd.set_option('display.max_rows', None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Compute embeddings for the SST-2 training vocabulary using a BERT model fine-tuned on SST-2.

In [25]:
class A:
    # Minimal stand-in for a parsed-arguments object; the cells below read its
    # attributes as `args.<name>`.
    checkpoint = 'textattack/bert-base-uncased-SST-2'  # model id passed to `from_pretrained` for tokenizer and model
    dataset = 'sst2'  # dataset name; presumably consumed by `data.process_data_and_args` -- TODO confirm
    padding = True  # forwarded as `padding=` to the tokenizer call
args = A()

# data
dataset, args = data.process_data_and_args(args)

# countvectorizer
tok_simp = English().tokenizer


def tokenizer_func(text):
    """Split `text` with the spaCy English tokenizer and return the tokens as plain strings."""
    return [str(token) for token in tok_simp(text)]


v = CountVectorizer(tokenizer=tokenizer_func)
v.fit(dataset['train']['sentence'])
words = sorted(v.vocabulary_)

# Sanity check (previously a disabled snippet inside a string literal): the
# vectorizer's column indices must follow the sorted vocabulary, because later
# cells pair `words[i]` with column `i` of the count matrix / coefficient vector.
assert all(v.vocabulary_[word] == i for i, word in enumerate(words)), \
    'CountVectorizer column order does not match the sorted vocabulary'

Using custom data configuration default
Reusing dataset sst2 (/tmp/.xdg_cache_vision/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

# compute embeddings

In [39]:
# set up model
import torch  # local import: only needed here, to switch off autograd during inference

tokenizer = AutoTokenizer.from_pretrained(args.checkpoint)  # for actually passing things to the model
model = BertModel.from_pretrained(args.checkpoint)

# Every vocabulary word is embedded on its own (one "sentence" per word).
tokens = tokenizer(words, padding=args.padding, truncation=True, return_tensors="pt")
with torch.no_grad():  # inference only: do not build an autograd graph for the whole vocabulary
    output = model(**tokens)  # this takes a while....
embs = output['pooler_output'].cpu().detach().numpy()

# Cache embeddings and the matching word list; `with` guarantees the files are flushed and closed.
with open(oj(config.misc_dir, 'word_embs_sst_train.pkl'), 'wb') as f:
    pkl.dump(embs, f)
with open(oj(config.misc_dir, 'word_list_sst_train.pkl'), 'wb') as f:
    pkl.dump(words, f)

# look at unigram embeddings

In [None]:
# Load the cached embeddings and word list written by the embedding cell.
# NOTE: pickle can execute arbitrary code on load -- only read files this notebook produced.
with open(oj(config.misc_dir, 'word_embs_sst_train.pkl'), 'rb') as f:
    embs = pkl.load(f)
with open(oj(config.misc_dir, 'word_list_sst_train.pkl'), 'rb') as f:
    words = np.array(pkl.load(f))

# countvec coefs
matrix = v.transform(dataset['train']['sentence'])
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when it exists.
feature_names = (v.get_feature_names_out() if hasattr(v, 'get_feature_names_out')
                 else v.get_feature_names())
# Column sums of the document-term matrix = total occurrences of each unigram.
tot_counts = pd.DataFrame(np.asarray(matrix.sum(axis=0)).reshape(1, -1), columns=feature_names)
m = LogisticRegressionCV()
m.fit(matrix, dataset['train']['label'])
coef = m.coef_.flatten()  # one coefficient per column of `matrix`

# `coef` and `tot_counts` follow the vectorizer's column order; the cached word
# list is only a valid label for them if it is in exactly that order.
assert list(words) == list(feature_names), 'cached word list is out of sync with the vectorizer'

# make df
df = pd.DataFrame.from_dict({
    'coef': coef,
    'tot_counts': tot_counts.values.squeeze(),
    'unigram': words,
})
# df.to_csv(oj(config.misc_dir, 'df_unigram_sst.csv'))

In [274]:
def get_idxs(unigrams, tot_counts, coef, percentile=99.5):
    """Select unigrams that are very frequent or strongly predictive, skipping punctuation/digits.

    Params
    ------
    unigrams: iterable of str
        One string per unigram.
    tot_counts: np.ndarray
        Total count of each unigram, aligned with `unigrams`.
    coef: np.ndarray
        Model coefficient of each unigram, aligned with `unigrams`.
    percentile: float
        Percentile (0-100) used as the cutoff for both counts and |coef|.

    Returns
    -------
    idxs: boolean mask, True where the unigram has a count strictly above the
        count percentile OR an absolute coefficient at/above the |coef| percentile,
        AND is not made up solely of digits / punctuation characters.
    """
    # True for tokens consisting only of digits and punctuation (e.g. '!', '123', "''")
    is_punc = np.array([
        all(c.isdigit() or c in string.punctuation for c in s)
        for s in unigrams
    ], dtype=bool)
    idxs_count_large = tot_counts > np.percentile(tot_counts, percentile)

    cs = np.abs(coef).flatten()
    idxs_pred = cs >= np.percentile(cs, percentile)

    # the original referenced an undefined name here (`is_punc` vs. `idxs_punc`)
    idxs = (idxs_pred | idxs_count_large) & ~is_punc
    return idxs


# Keep only unigrams above the 99.5th percentile in count or |coef| (punctuation excluded).
idxs = get_idxs(
    unigrams=df['unigram'],
    tot_counts=df['tot_counts'].values,
    coef=df['coef'].values,
    percentile=99.5,
)

# One column per selected unigram; `corr` then gives the pairwise correlation
# between the embedding vectors of those unigrams.
selected_unigrams = df['unigram'].values[idxs]
es = pd.DataFrame(embs[idxs].T, columns=selected_unigrams)
sims = es.corr()

In [None]:
# Clustered heatmap of the pairwise correlations between selected unigram embeddings.
cm = sns.diverging_palette(10, 240, as_cmap=True)
cg = sns.clustermap(
    sims, cmap=cm, center=0.0, dendrogram_ratio=0.01,
    figsize=(12, 12),  # clustermap creates its own figure, so the size has to be set here
    cbar_pos=(0.7, 0.7, 0.05, 0.15),
    cbar_kws={'label': 'Correlation'},
)

# The dendrograms are only used for ordering rows/columns; hide them.
cg.ax_row_dendrogram.set_visible(False)
cg.ax_col_dendrogram.set_visible(False)

# mask the upper triangle (including the diagonal): the matrix is symmetric
mask = np.triu(np.ones(sims.shape, dtype=bool))
values = cg.ax_heatmap.collections[0].get_array().reshape(sims.shape)
new_values = np.ma.array(values, mask=mask)
cg.ax_heatmap.collections[0].set_array(new_values)
cg.ax_heatmap.yaxis.set_ticks_position("left")

# make sure the output directory exists before writing the figure
os.makedirs('results', exist_ok=True)
cg.savefig('results/unigrams_sim.pdf')

# construct bigram embeddings

Here we compare the coefficient of each common bigram to the sum of the coefficients of its two unigrams.

In [None]:
# Bag-of-words vocabulary over bigrams only, using the same tokenizer as the unigram vectorizer.
train_sentences = dataset['train']['sentence']
v2 = CountVectorizer(ngram_range=(2, 2), tokenizer=tokenizer_func)
v2.fit(train_sentences)

In [None]:
# countvec coefs
matrix2 = v2.transform(dataset['train']['sentence'])
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when it exists.
bigram_names = (v2.get_feature_names_out() if hasattr(v2, 'get_feature_names_out')
                else v2.get_feature_names())
# Column sums of the document-term matrix = total occurrences of each bigram.
tot_counts2 = pd.DataFrame(np.asarray(matrix2.sum(axis=0)).reshape(1, -1), columns=bigram_names)
m2 = LogisticRegressionCV()
m2.fit(matrix2, dataset['train']['label'])
coef2 = m2.coef_.flatten()  # one coefficient per column of `matrix2`

# Label rows with the vectorizer's own column order so that each bigram is
# guaranteed to sit next to its own coefficient and count.
df2 = pd.DataFrame.from_dict({
    'coef': coef2,
    'tot_counts': tot_counts2.values.squeeze(),
    'bigram': list(bigram_names),
})
# df2.to_csv(oj(config.misc_dir, 'df_bigram_sst.csv'))

In [330]:
def find_unigram_scores(bigram, unigram_df=None):
    """Split a bigram into its two unigrams and look up each unigram's coefficient.

    Params
    ------
    bigram: str
        Two tokens separated by a single space.
    unigram_df: pd.DataFrame, optional
        Table with a 'unigram' column and a 'coef' column. Defaults to the
        notebook-level `df`, which keeps existing one-argument calls working.

    Returns
    -------
    (unigram1, unigram2, unigram1_score, unigram2_score)

    Raises
    ------
    ValueError if `bigram` does not split into exactly two tokens;
    IndexError if either unigram is missing from the table.
    """
    if unigram_df is None:
        unigram_df = df  # fall back to the unigram table built earlier in the notebook

    def _score(unigram):
        # first matching row's coefficient
        return unigram_df.loc[unigram_df['unigram'] == unigram, 'coef'].iloc[0]

    unigram1, unigram2 = bigram.split(' ')
    return unigram1, unigram2, _score(unigram1), _score(unigram2)

# Look up both unigrams for every bigram, then transpose the per-row 4-tuples
# into four parallel sequences (unigram1, unigram2, coef1, coef2).
per_bigram = df2['bigram'].map(find_unigram_scores)
out = list(zip(*per_bigram))

In [331]:
# Attach the looked-up unigrams and their coefficients to the bigram table.
new_columns = ['unigram1', 'unigram2', 'coef1', 'coef2']
for position in range(len(new_columns)):
    df2[new_columns[position]] = out[position]

In [333]:
# df2.to_csv(oj(config.misc_dir, 'df_bigram_sst.csv'))

In [332]:
# Peek at the first five rows of the bigram table.
df2.head(n=5)

Unnamed: 0,coef,tot_counts,bigram,unigram1,unigram2,coef1,coef2
0,1.791601,15,! '',!,'',1.370037,0.115941
1,-0.701988,4,! ),!,),1.370037,-0.119441
2,-0.189479,9,! ?,!,?,1.370037,-2.306672
3,-0.612194,6,# 3,#,3,-2.923652,-2.064273
4,0.6009,7,# 9,#,9,-2.923652,1.90289


In [334]:
# Interaction score: absolute gap between the bigram coefficient and the
# additive prediction from its two unigram coefficients.
additive_coef = df2['coef1'] + df2['coef2']
df2['interaction_score'] = (df2['coef'] - additive_coef).abs()

In [337]:
# Display the current column labels of the bigram table.
df2.columns

Index(['coef', 'tot_counts', 'bigram', 'unigram1', 'unigram2', 'coef1',
       'coef2', 'interaction_score'],
      dtype='object')

In [338]:
# Reorder columns so the bigram and its interaction score come first.
column_order = [
    'bigram', 'interaction_score', 'tot_counts',
    'coef', 'coef1', 'coef2',
    'unigram1', 'unigram2',
]
df2 = df2[column_order]

In [341]:
# Show the 100 bigrams with the largest interaction score, rounded to two decimals.
(
    df2
    .sort_values(by='interaction_score', ascending=False)
    .round(decimals=2)
    .head(n=100)
)

Unnamed: 0,bigram,interaction_score,tot_counts,coef,coef1,coef2,unigram1,unigram2
68919,vividly captures,14.78,2,0.01,7.8,6.99,vividly,captures
44950,not bad,13.25,7,6.44,-3.19,-3.62,not,bad
57713,spielberg calls,13.05,7,0.08,3.82,9.31,spielberg,calls
43880,n't lost,13.02,14,4.86,-3.63,-4.53,n't,lost
44408,never lacks,12.72,4,4.27,-1.1,-7.36,never,lacks
34889,incoherent mess,12.48,3,-0.56,-3.9,-9.14,incoherent,mess
14407,banal bore,12.44,4,-2.45,-4.88,-10.01,banal,bore
43527,muddled limp,12.16,3,-0.57,-8.33,-4.4,muddled,limp
44415,never loses,11.79,2,3.47,-1.1,-7.23,never,loses
54051,"sappiness ,",11.69,2,3.44,-8.43,0.18,sappiness,","


2. (later) bigrams of common unigrams not seen during training -- maybe do this by construction