In [1]:
%load_ext autoreload
%autoreload 2

import time
import json
import fasttext
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from scipy.spatial.distance import cdist, cosine, euclidean
from scipy.stats import ttest_ind, ttest_1samp
from sklearn.decomposition import PCA, FastICA
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from gpu_utils import restrict_GPU_pytorch

In [2]:
restrict_GPU_pytorch('2')

Using GPU:2


In [None]:
# Returns a list of predictions (prob of moderation) for a list of examples and a given model. 
# allExamples: list of strings 
# model: fastText model object
def getModelPredsHelper(allExamples, model):
    exampleList = [x.replace('\n', ' ') for x in allExamples]
    preds = model.predict(exampleList)
    preds_int = np.array([1 if 'positive' in p[0] else 0 for p in preds[0]])
    preds_prob = np.array([p[0] for p in preds[1]])
    probs = np.array([1 - p if preds_int[i] == 0 else p for (i,p) in enumerate(preds_prob)])
    probs = [np.round(p,3) for p in probs]
    return probs

# vec: query vector (list of floats)
# all_vecs: pool of all vectors from which neighbors are retrieved (list of list of floats)
# comments: comments corresponding to the vectors in all_vecs (list of strings)
# n: number of neighbors to return (int)
# return_idx: if true, returns indices of nearest neighbors instead of the actual comments (bool)
def getKNNFromVector(vec, all_vecs, comments, n=30, return_idx=False):
    dist_vec = cdist(vec, all_vecs, 'cosine')
    top_vec_idx = np.argsort(dist_vec[0])[1:n+1]
    if return_idx: return top_vec_idx
    top_comments = np.array(comments)[top_vec_idx]
    top_comments = [c for c in top_comments]
    return top_comments

In [3]:
DATA_FPATH = '../data/reddit/'
MODEL_FPATH = '../models/reddit/'
USE_PATH = '../Dev/tf_hub/universal-sentence-encoder_4/'

In [4]:
subreddits = ['askscience', 'conspiracy', 'funny', 'hillaryclinton', 'history']

## Load Data

In [5]:
all_comments = pd.read_csv(DATA_FPATH + 'all_comments_df')
embed = hub.load(USE_PATH);
all_comments.head()

Unnamed: 0.1,Unnamed: 0,body,subreddit,moderated
0,0,You can tell this is fake because it shows a b...,funny,1
1,1,They're purple because she's dead.\n\nLol sorr...,hillaryclinton,1
2,2,The fat lady is singing.,funny,1
3,3,Good thing volks never break down. XD Shitty k...,funny,1
4,5,I'm dreaming of a bright christmas.\n,funny,1


In [6]:
sub = 'funny'
comments = all_comments[all_comments.subreddit == sub].body.values
labels = all_comments[all_comments.subreddit == sub].moderated.values
vecs = np.array(embed(comments))
subreddit_model = fasttext.load_model(MODEL_FPATH + "%s_model.bin" % sub)



## Nearest Neighbor Search for Example Set Generation

In [None]:
seed = """
poor snowflake do you need a safe space
"""

top_vec_idx = getKNNFromVector(embed([seed]), vecs, comments, n=50, return_idx=True)
knn_vecs = vecs[top_vec_idx]
knn_comments = comments[top_vec_idx]
num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(knn_vecs)

for cluster in range(num_clusters):
    cluster_comments = knn_comments[np.where(kmeans.labels_ == cluster)]
    
    print('===================== CLUSTER %i =========================' % cluster)
    for c in cluster_comments:
        print(c, end='\n------------------------\n')
    print("\n\n")

### Next steps: Annie

Example search & generation has three main steps (though they may be intertwined in some ways): 1) getting similar examples, 2) clustering or organizing them in some way 3) visualizing the result.  Right now we're doing 1) with euclidean distance in embedding space of the USE, 2) with k-means clustering, and 3) with just printing out the examples. 

We can think about ways to improve each of these parts.  To start, let's explore 1).  Right now we are getting similar examples with the USE (the model we load from TF Hub).  How does this compare to a different embedding model?  Here are some we can try: 
* BERT: https://huggingface.co/bert-base-uncased
* RoBERTa: https://huggingface.co/roberta-base
* XLNet: https://huggingface.co/xlnet-base-cased

Later, we could also think about fine-tuning some of these embeddings to be better suited to our task/data.  Sample code for loading BERT and getting embeddings for a sample of sentences: 

### To do: 
* modify `getKNNFromVector` to take a particular embedding name (e.g., 'BERT') and compute distances in that embedding space. 
* for a handful of seed sentences, get the nearest neighbors and print them out as above in each of the different embedding spaces. Qualitatively note differences you notice among what is returned as similar. Do some seem better or worse?  Are there noticeable differences? 
* for an easy 2 or 3D projection, you can try loading the data into the embedding projector: https://projector.tensorflow.org/  You may want to just do a particular example and its 100 nearest neighbors or something (rather than all the data). You'll have to save the embeddings and sentences as TSV files and then load them in. 

In [7]:
import torch
# from transformers import BertTokenizer, BertModel
from transformers import DistilBertTokenizerFast, DistilBertModel

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# bert_model = BertModel.from_pretrained('bert-base-uncased')
# bert_model.cuda()

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

model.eval()
model.cuda()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [8]:
torch.cuda.is_available()

True

In [9]:
model.device

device(type='cuda', index=0)

In [None]:
batch_sentences = ["Hello, my dog is cute", "another sample sentence"]
tokenized_sentences = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")['input_ids']
tokenized_sentences = tokenized_sentences.cuda()

In [None]:
tokenized_sentences.shape

In [None]:
output = model(tokenized_sentences)['last_hidden_state'].cpu().detach().numpy()
output

In [None]:
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader

class CommentsDataset(Dataset):
    def __init__(self, comments, labels, tokenizerfn):
        self.comments = comments
        self.labels = labels
        self.tokenizerfn = lambda comments: tokenizerfn(comments, padding=True, truncation=True, return_tensors="pt")['input_ids']
#         self.tokenizer = tokenizerfn
#         self.tokenizerfn = lambda comments: self.tokenizer(comments, padding=True, truncation=True, return_tensors="pt")['input_ids']

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, idx):
        return self.tokenizerfn(comments[idx]) #, self.labels[idx]


In [None]:
from transformers import BertTokenizer # , BertModel
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
dataset = CommentsDataset(comments, labels, tokenizer)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [None]:
# next(iter(dataloader))
dataset[1].shape

#### BERT embedding

In [10]:
def print_gpu_obj():
    import gc
    GPU_count = 0
    Pinned_count = 0
    for tracked_object in gc.get_objects():
        if torch.is_tensor(tracked_object):
            if tracked_object.is_cuda:
                GPU_count+=1
            if tracked_object.is_pinned():
                Pinned_count+=1
            

    print("There are {} cuda objects".format(GPU_count))
    print("There are {} pinned objects".format(Pinned_count))

In [11]:
t0 = time.time()
for i in range(len(comments)):
    tokenized_comments = tokenizer(list(comments[i]), padding=True, truncation=True, return_tensors="pt")['input_ids'].cuda()
    output = model(tokenized_comments)['last_hidden_state'] #.cpu().detach().numpy()
    del tokenized_comments
    if i % 100 == 0:
        print(f'{i} of 34347')
        print_gpu_obj()
t1 = time.time()
print(f'{t1 - t0} seconds')

0 of 34347
There are 101 cuda objects
There are 0 pinned objects




100 of 34347
There are 101 cuda objects
There are 0 pinned objects
200 of 34347
There are 101 cuda objects
There are 0 pinned objects
300 of 34347
There are 101 cuda objects
There are 0 pinned objects
400 of 34347
There are 101 cuda objects
There are 0 pinned objects
500 of 34347
There are 101 cuda objects
There are 0 pinned objects
600 of 34347
There are 101 cuda objects
There are 0 pinned objects
700 of 34347
There are 101 cuda objects
There are 0 pinned objects


RuntimeError: CUDA out of memory. Tried to allocate 266.00 MiB (GPU 0; 11.91 GiB total capacity; 10.19 GiB already allocated; 117.94 MiB free; 317.25 MiB cached)

In [None]:
tokenized_comments = tokenizer(list(comments), padding=True, truncation=True, return_tensors="pt")['input_ids']
tokenized_comments = tokenized_comments.cuda()
tokenized_comments.shape

In [None]:
bert_output = model(tokenized_comments)['last_hidden_state'].cpu().detach().numpy()

In [None]:
torch.cuda.empty_cache()

In [None]:
import time
t0 = time.time()
for i in range(len(comments)):
    tokenized_comment = tokenizer(list(comments[i]), padding=True, truncation=True, return_tensors="pt")['input_ids'].cuda()
    output = bert_model(tokenized_comment)['pooler_output'].detach()
    if i % 1000 == 0:
        print(f'{i} of 34347')
t1 = time.time()
print(f'{t1 - t0} seconds')

In [None]:
bert_output = bert_model(tokenized_comment)['pooler_output'].detach()

In [None]:
bert_output.shape

In [None]:
num_iterations = 1000
size = 34347//num_iterations
for i in range(num_iterations):
    tokenized_comments = tokenizer(list(comments[size*i : min(size*(i+1), 34347)]), padding=True, truncation=True, return_tensors="pt")['input_ids'].cuda()
    bert_output = bert_model(tokenized_comments)['pooler_output'].detach().shape()
    print(bert_output.shape)

In [None]:
def comments_bert_model(comments, batch_size):
    comments_output = None
    num_iterations = 1+len(comments)//batch_size
    for i in range(num_iterations):
        small_sample_comments = list(comments[batch_size*i : min(batch_size*(i+1), len(comments))])
        tokenized_small_sample_comments = tokenizer(small_sample_comments, padding=True, truncation=True, return_tensors="pt")['input_ids']
        tokenized_small_sample_comments.cuda()
        small_sample_comments_output = model(tokenized_small_sample_comments)['pooler_output'].cpu().detach().numpy()
        print(f'{i+1}/{num_iterations} iterations complete')
        if comments_output is None:
            comments_output = small_sample_comments_output
        else:
            comments_output = np.concatenate((comments_output, small_sample_comments_output), axis=0)
    return comments_output

In [None]:
bert_comments_output = comments_bert_model(comments, 50)

In [None]:
bert_model.cuda()

In [None]:
tokenized_comments = tokenizer(list(comments), padding=True, truncation=True, return_tensors="pt")['input_ids']
output = bert_model(tokenized_comments) #['pooler_output'].detach().numpy()

output

In [None]:
def comments_bert_model_universal_sentence_encoder(comments, batch_size):
    comments_output = None
    num_iterations = 1+len(comments)//batch_size
    for i in range(num_iterations):
        small_sample_comments = list(comments[batch_size*i : min(batch_size*(i+1), len(comments))])
        tokenized_small_sample_comments = torch.from_numpy(embed(small_sample_comments).numpy()).long()
        small_sample_comments_output = bert_model(tokenized_small_sample_comments)['pooler_output'].detach().numpy()
        print(f'{i+1}/{num_iterations} iterations complete')
        if comments_output is None:
            comments_output = small_sample_comments_output
        else:
            comments_output = np.concatenate((comments_output, small_sample_comments_output), axis=0)
    return comments_output

In [None]:
universal_comments_output = comments_bert_model_universal_sentence_encoder(comments, 50)

In [None]:
outputs = {'BERT': bert_comments_output, 'universal-sentence-encoder_4': embed(comments)}

In [None]:
# seeds: query comments (list of strings)
# comments: comments to compare to (list of strings)
# embedding: string corresponding to a particular embedding name (e.g. 'BERT')
# n: number of neighbors to return (int)
# return_idx: if true, returns indices of nearest neighbors instead of the actual comments (bool)
def getKNNFromVector(seeds, comments, embedding, n=30, return_idx=False):
    comments_output = outputs[embedding]
    if embedding == 'BERT':
        tokenized_seeds = tokenizer(seeds, padding=True, truncation=True, return_tensors="pt")['input_ids']
    elif embedding == 'universal-sentence-encoder_4':
        tokenized_seeds = embed(seeds)
    else:
        return "invalid embedding provided"
    
    seeds_output = bert_model(tokenized_seeds)['pooler_output'].detach().numpy()
    dist_vec = cdist(seeds_output, comments_output, 'cosine')
    top_vec_idx = np.argsort(dist_vec[0])
    top_comments = np.array(comments[:batch_size*num_iterations])[top_vec_idx]
    top_n_comments = [c for c in top_comments[:n]]
    return top_n_comments

In [None]:
def compare_embeddings(seeds, comments):
    if not isinstance(seeds, list):
        print("Must input a list of seeds.")
        return
    else:
        if len(seeds) == 1:
            print(f'Testing on the seed "{decode(seeds[0])}"')
        else:
            s = ", ".join([f'"{decode(seed)}"' for seed in seeds])
            print(f'Testing on the seeds {s}')
    
    print('\nResults with universal_sentence_encoder_4:')
    try:
        embed_results = getKNNFromVector(seeds, comments, n=10, return_idx=False)
        for (i, x) in enumerate(embed_results):
            print(f'{i+1}. "{decode(x)}"')
    except:
        print("something failed")
    
    print('\nResults with BERT:')
    try:
        bert_results = getKNNFromVector(seeds, comments, embedding="BERT", n=10, return_idx=False)
        for (i,x) in enumerate(bert_results):
            print(f'{i+1}. "{decode(x)}"')
    except:
        print("something failed")

In [None]:
seeds = ["hello"]
getKNNFromVector(seeds, list(comments), embedding="BERT", n=10, return_idx=False)

In [None]:
seeds = ["hello"]
compare_embeddings(seeds, comments)

In [None]:
seeds = ["\npoor snowflake do you need a safe space\n"]
compare_embeddings(seeds)

In [None]:
seeds = ["Hello, my dog is cute", "another sample sentence"]
compare_embeddings(seeds)

## Testing

In [None]:
def quantify_shift(shift_target, example_list, example_list_2, model, labels = None, labels_2 = None, list_names=["list 1", "list 2"]): 
    if shift_target == "predictions":
        preds_1 = getModelPredsHelper(example_list, model)
        preds_2 = getModelPredsHelper(example_list_2, model)
        ttest_result = ttest_ind(preds_1, preds_2)
        if ttest_result.pvalue >= 0.05: 
            print("Predictions are not significantly different.")
        else: 
            operator = "higher" if ttest_result.statistic > 0 else "lower"
            print("P(moderated) for %s is *%s* than for %s (pval = %.3f)" % (list_names[0], operator, list_names[1], ttest_result.pvalue))
            
            
    elif shift_target == "representation":
        vecs_1 = [model.get_sentence_vector(ex) for ex in example_list]
        vecs_2 = [model.get_sentence_vector(ex) for ex in example_list_2]
        intergroup_diffs = cdist(vecs_1, vecs_2).flatten()
        intragroup_diffs = np.concatenate((cdist(vecs_1, vecs_1).flatten(), cdist(vecs_2, vecs_2).flatten()))
        ttest_result = ttest_ind(intergroup_diffs, intragroup_diffs)
        if ttest_result.statistic > 0 and ttest_result.pvalue < 0.05: 
            print("Representations for %s are significantly different from %s (pval = %f)." % (list_names[0], list_names[1], ttest_result.pvalue))
        else: 
            print("Representations for %s and %s are not significantly different." % (list_names[0], list_names[1]))
            
            
    elif shift_target == "performance":
        preds_1 = getModelPredsHelper(example_list, model)
        preds_2 = getModelPredsHelper(example_list_2, model)
        perf_1 = np.array([np.round(preds_1[i]) == labels[i] for i in range(len(preds_1))]).astype(int)
        perf_2 = np.array([np.round(preds_2[i]) == labels_2[i] for i in range(len(preds_2))]).astype(int)
        ttest_result = ttest_ind(perf_1, perf_2)
        if ttest_result.pvalue >= 0.05: 
            print("Model performance on %s and %s is not significantly different." % (list_names[0], list_names[1]))
        else: 
            operator = "higher" if ttest_result.statistic > 0 else "lower"
            print("Model performance on %s is *%s* than for %s (pval = %.3f)" % (list_names[0], operator, list_names[1], ttest_result.pvalue))