In [1]:
%load_ext autoreload
%autoreload 2

import time
import json
import torch
import fasttext
import numpy as np
import pandas as pd
import tensorflow_hub as hub 
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from scipy.spatial.distance import cdist, cosine, euclidean
from scipy.stats import ttest_ind, ttest_1samp
from sklearn.decomposition import PCA, FastICA
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from gpu_utils import restrict_GPU_pytorch

In [2]:
# Select GPU 3 for this kernel (the helper reports "Using GPU:3"); presumably it limits
# PyTorch to that device — see gpu_utils to confirm.
# NOTE(review): the device id is hard-coded; change it to a free GPU on your machine.
restrict_GPU_pytorch('3')

Using GPU:3


In [3]:
# Directory holding the Reddit comment data (read below as DATA_FPATH + 'all_comments_df').
DATA_FPATH = '../data/reddit/'
# Directory for Reddit models; not referenced in this part of the notebook.
MODEL_FPATH = '../models/reddit/'
# Local copy of the Universal Sentence Encoder v4 TF-Hub module, passed to hub.load below.
USE_PATH = '../Dev/tf_hub/universal-sentence-encoder_4/'

In [4]:
# Subreddits of interest; presumably these match the values of the `subreddit` column — TODO confirm.
# NOTE(review): only 'funny' is selected (via `sub`) in this part of the notebook.
subreddits = ['askscience', 'conspiracy', 'funny', 'hillaryclinton', 'history']

## Load Data

In [5]:
# Read the combined comments table and preview the first rows.
# NOTE(review): the two 'Unnamed' columns in the preview look like leftover index columns,
# presumably from saving the frame with its index more than once — TODO confirm.
# Only `body`, `subreddit` and `moderated` are used below.
all_comments = pd.read_csv(DATA_FPATH + 'all_comments_df')
all_comments.head()

Unnamed: 0.1,Unnamed: 0,body,subreddit,moderated
0,0,You can tell this is fake because it shows a b...,funny,1
1,1,They're purple because she's dead.\n\nLol sorr...,hillaryclinton,1
2,2,The fat lady is singing.,funny,1
3,3,Good thing volks never break down. XD Shitty k...,funny,1
4,5,I'm dreaming of a bright christmas.\n,funny,1


In [6]:
# Restrict the analysis to a single subreddit: `comments` holds the comment texts and
# `labels` the matching `moderated` flags, both as NumPy arrays in the same row order.
sub = 'funny'
comments = all_comments.loc[all_comments['subreddit'] == sub, 'body'].to_numpy()
labels = all_comments.loc[all_comments['subreddit'] == sub, 'moderated'].to_numpy()

### Next steps: Annie

Example search & generation has three main steps (though they may be intertwined in some ways): 1) getting similar examples, 2) clustering or organizing them in some way, and 3) visualizing the result.  Right now we're doing 1) with Euclidean distance in the embedding space of the Universal Sentence Encoder (USE), 2) with k-means clustering, and 3) by just printing out the examples. 

We can think about ways to improve each of these parts.  To start, let's explore 1).  Right now we are getting similar examples with the USE (the model we load from TF Hub).  How does this compare to a different embedding model?  Here are some we can try: 
* BERT: https://huggingface.co/bert-base-uncased
* RoBERTa: https://huggingface.co/roberta-base
* XLNet: https://huggingface.co/xlnet-base-cased

Later, we could also think about fine-tuning some of these embeddings to be better suited to our task/data.  Sample code for loading BERT and getting embeddings for a set of sentences is in the "BERT embedding" section below.

### To do: 
* modify `getKNNFromVector` to take a particular embedding name (e.g., 'BERT') and compute distances in that embedding space. 
* for a handful of seed sentences, get the nearest neighbors and print them out as above in each of the different embedding spaces. Qualitatively note differences you notice among what is returned as similar. Do some seem better or worse?  Are there noticeable differences? 
* for an easy 2 or 3D projection, you can try loading the data into the embedding projector: https://projector.tensorflow.org/  You may want to just do a particular example and its 100 nearest neighbors or something (rather than all the data). You'll have to save the embeddings and sentences as TSV files and then load them in. 

#### BERT embedding (DistilBERT)

In [7]:
from transformers import DistilBertTokenizerFast, DistilBertModel

In [8]:
# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint.
# Note this is DistilBERT, not the full bert-base-uncased model linked in the notes above.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

  utils.DeprecatedIn35,


In [None]:
# Load the pretrained DistilBERT encoder for the same checkpoint as the tokenizer above.
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
# NOTE: running the cell above (loading DistilBertModel) causes the kernel to fail.

In [None]:
# Move the DistilBERT model's parameters to the GPU. Because this is the last expression
# in the cell, the model's repr is also displayed.
model.cuda()

In [None]:
# Tokenize every comment (padded to a common length, truncated where necessary), keep only
# the token ids, and move them to the GPU.
# NOTE(review): the attention mask returned by the tokenizer is discarded here, and the
# model is later called on the ids alone.
tokenized_comments = tokenizer(
    list(comments),
    padding=True,
    truncation=True,
    return_tensors="pt",
)['input_ids'].cuda()

In [None]:
# Sanity check: expected to be (number of comments, padded sequence length).
tokenized_comments.shape

In [None]:
# Run DistilBERT on one comment at a time and stack the token-level hidden states into a
# single array (one row per comment along axis 0).
#
# * torch.no_grad(): this is inference only, so no autograd graph is built for each
#   forward pass (saves GPU memory and time; .detach() is no longer needed).
# * attention_mask: the ids were padded to a common length, so padding positions are
#   masked out. Without a mask the model attends to the padding tokens and a comment's
#   embedding depends on how much padding it received. The mask is rebuilt from the
#   tokenizer's pad token id because only the input ids were kept.
with torch.no_grad():
    bert_output = np.concatenate(
        [
            model(
                input_ids=tokenized_comments[i:i + 1],
                attention_mask=(tokenized_comments[i:i + 1] != tokenizer.pad_token_id).long(),
            )['last_hidden_state'].cpu().numpy()
            for i in tqdm(range(len(comments)))
        ],
        axis=0,
    )

In [None]:
# Expected to be (number of comments, padded sequence length, hidden size), since the full
# last_hidden_state is kept for every token rather than one pooled vector per comment.
bert_output.shape

#### Universal sentence encoder

In [None]:
import tensorflow_hub as hub

In [None]:
# Load the Universal Sentence Encoder from the local TF-Hub directory and embed every
# comment in a single call.
# NOTE(review): embedding all comments at once may be memory-hungry; consider batching if
# this cell runs out of memory.
embed = hub.load(USE_PATH)
universal_output = embed(comments)

In [None]:
# Inspect the shape of the USE embeddings (expected: one row per comment).
universal_output.shape

#### RoBERTa

In [None]:
from transformers import RobertaTokenizer, RobertaModel

In [None]:
# Load the pretrained RoBERTa tokenizer and encoder, then move the encoder to the GPU.
# NOTE: this rebinds `tokenizer` and `model`, replacing the DistilBERT objects created
# earlier; re-run the DistilBERT cells before using them again.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
model.cuda()

In [None]:
# Tokenize every comment with the RoBERTa tokenizer (padded to a common length, truncated
# where necessary), keep only the token ids, and move them to the GPU.
# NOTE(review): the attention mask returned by the tokenizer is discarded here, and the
# model is later called on the ids alone.
encoded_input = tokenizer(
    list(comments),
    padding=True,
    truncation=True,
    return_tensors="pt",
)['input_ids'].cuda()

In [None]:
# Run RoBERTa on one comment at a time and stack the token-level hidden states into a
# single array (one row per comment along axis 0).
#
# * torch.no_grad(): this is inference only, so no autograd graph is built for each
#   forward pass (saves GPU memory and time; .detach() is no longer needed).
# * attention_mask: the ids were padded to a common length, so padding positions are
#   masked out. Without a mask the model attends to the padding tokens and a comment's
#   embedding depends on how much padding it received. The mask is rebuilt from the
#   tokenizer's pad token id because only the input ids were kept.
with torch.no_grad():
    roberta_output = np.concatenate(
        [
            model(
                input_ids=encoded_input[i:i + 1],
                attention_mask=(encoded_input[i:i + 1] != tokenizer.pad_token_id).long(),
            )['last_hidden_state'].cpu().numpy()
            for i in tqdm(range(len(comments)))
        ],
        axis=0,
    )

In [None]:
# Expected to be (number of comments, padded sequence length, hidden size), since the full
# last_hidden_state is kept for every token rather than one pooled vector per comment.
roberta_output.shape

#### XLNet

In [None]:
from transformers import XLNetTokenizer, XLNetModel

In [None]:
# Load the XLNet tokenizer. XLNet is published in cased variants only, so the checkpoint
# id must be 'xlnet-base-cased' (there is no 'xlnet-base-uncased' checkpoint to load).
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

In [None]:
# Debugging aid: show the type of the imported XLNetTokenizer name itself (not of an instance).
type(XLNetTokenizer)

In [None]:
# Load the pretrained XLNet tokenizer and encoder, then move the encoder to the GPU.
# NOTE: this rebinds `tokenizer` and `model`, replacing the objects loaded in the earlier
# model sections; re-run those cells before using them again.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')
model.cuda()

In [None]:
# Confirm which tokenizer class `tokenizer` is currently bound to (the name is reused
# across the model sections).
type(tokenizer)

In [None]:
# Tokenize every comment with the XLNet tokenizer (padded to a common length, truncated
# where necessary), keep only the token ids, and move them to the GPU.
# NOTE(review): the attention mask returned by the tokenizer is discarded here, and no
# cell in this part of the notebook runs the XLNet model on these inputs yet.
xlnet_inputs = tokenizer(
    list(comments),
    padding=True,
    truncation=True,
    return_tensors="pt",
)['input_ids'].cuda()

In [None]:
# Display the current tokenizer's repr to inspect its configuration.
tokenizer

In [None]:
# Collect the embeddings computed above, keyed by embedding name.
# Uses `bert_output` (the name the DistilBERT cell actually defines; `bert_comments_output`
# is never assigned in this notebook and raises NameError on a fresh run) and reuses
# `universal_output` instead of re-embedding every comment with the USE a second time.
outputs = {'BERT': bert_output, 'universal-sentence-encoder_4': universal_output}

## Testing

In [None]:
def _within_group_distances(vecs):
    """Return every unordered pairwise distance within ``vecs`` exactly once, without self-distances."""
    dists = cdist(vecs, vecs)
    # Strict upper triangle: drops the zero diagonal and the mirrored duplicate of each pair.
    return dists[np.triu_indices(len(vecs), k=1)]


def quantify_shift(shift_target, example_list, example_list_2, model, labels = None, labels_2 = None, list_names=["list 1", "list 2"]): 
    """Test whether two lists of examples differ with respect to a model, and print the verdict.

    Every comparison is a two-sample t-test (``scipy.stats.ttest_ind``) at the 0.05 level;
    the result is printed and nothing is returned.

    Parameters
    ----------
    shift_target : str
        ``"predictions"``    - compare the model's predictions on the two lists.
        ``"representation"`` - compare distances between the lists' sentence vectors
                               with distances inside each list.
        ``"performance"``    - compare per-example correctness (rounded prediction == label).
        Any other value prints nothing.
    example_list, example_list_2 : sequence
        The two groups of examples to compare.
    model : object
        Passed to ``getModelPredsHelper`` (defined elsewhere; presumably returns one
        P(moderated) score per example - confirm) for "predictions" and "performance";
        must provide ``get_sentence_vector(example)`` for "representation".
    labels, labels_2 : sequence, optional
        Ground-truth labels aligned with the two lists; required for "performance".
    list_names : sequence of two str
        Display names for the two lists in the printed message.

    Raises
    ------
    ValueError
        If ``shift_target == "performance"`` and either label sequence is missing.
    """
    if shift_target == "predictions":
        preds_1 = getModelPredsHelper(example_list, model)
        preds_2 = getModelPredsHelper(example_list_2, model)
        ttest_result = ttest_ind(preds_1, preds_2)
        if ttest_result.pvalue >= 0.05: 
            print("Predictions are not significantly different.")
        else: 
            operator = "higher" if ttest_result.statistic > 0 else "lower"
            print("P(moderated) for %s is *%s* than for %s (pval = %.3f)" % (list_names[0], operator, list_names[1], ttest_result.pvalue))


    elif shift_target == "representation":
        vecs_1 = [model.get_sentence_vector(ex) for ex in example_list]
        vecs_2 = [model.get_sentence_vector(ex) for ex in example_list_2]
        intergroup_diffs = cdist(vecs_1, vecs_2).flatten()
        # Only distances between *distinct* members of a group count as intra-group
        # distances. Including each example's zero distance to itself (and every pair
        # twice) drags the intra-group mean down and can make two samples from the same
        # distribution look significantly different.
        intragroup_diffs = np.concatenate((_within_group_distances(vecs_1), _within_group_distances(vecs_2)))
        # NOTE(review): pairwise distances are not independent samples, so the p-value
        # is only a rough heuristic.
        ttest_result = ttest_ind(intergroup_diffs, intragroup_diffs)
        # One-sided in effect: a shift is reported only when between-group distances are larger.
        if ttest_result.statistic > 0 and ttest_result.pvalue < 0.05: 
            print("Representations for %s are significantly different from %s (pval = %f)." % (list_names[0], list_names[1], ttest_result.pvalue))
        else: 
            print("Representations for %s and %s are not significantly different." % (list_names[0], list_names[1]))


    elif shift_target == "performance":
        # Fail fast with a clear message instead of a TypeError from indexing None below.
        if labels is None or labels_2 is None:
            raise ValueError("labels and labels_2 are required when shift_target == 'performance'")
        preds_1 = getModelPredsHelper(example_list, model)
        preds_2 = getModelPredsHelper(example_list_2, model)
        # 1 where the rounded prediction matches the label, 0 otherwise.
        perf_1 = np.array([np.round(preds_1[i]) == labels[i] for i in range(len(preds_1))]).astype(int)
        perf_2 = np.array([np.round(preds_2[i]) == labels_2[i] for i in range(len(preds_2))]).astype(int)
        ttest_result = ttest_ind(perf_1, perf_2)
        if ttest_result.pvalue >= 0.05: 
            print("Model performance on %s and %s is not significantly different." % (list_names[0], list_names[1]))
        else: 
            operator = "higher" if ttest_result.statistic > 0 else "lower"
            print("Model performance on %s is *%s* than for %s (pval = %.3f)" % (list_names[0], operator, list_names[1], ttest_result.pvalue))