In [1]:
## KAGGLE ONLY
# from shutil import copyfile
# copyfile(src="../input/inputs/generate_dataloaders.py", dst="../working/generate_dataloaders.py")
# copyfile(src="../input/inputs/train_dataloader.p", dst="../working/train_dataloader.p")
# copyfile(src="../input/inputs/val_dataloader.p", dst="../working/val_dataloader.p")
# copyfile(src="../input/inputs/centroids_dataloader.p", dst="../working/ground_truth_dataloader.p")
# copyfile(src="../input/inputs/dictionary.p", dst="../working/dictionary.p")

In [23]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

#from datasets import get_mnist_dataset, get_data_loader
#from utils import *
#from models import *

import pickle as pkl
import os
import datetime as dt
import pandas as pd
import random

from generate_dataloaders import *
from tqdm import tqdm_notebook as tqdm

import evaluation
import importlib
importlib.reload(evaluation)

<module 'evaluation' from '/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/evaluation.py'>

## Get Dataloaders

In [24]:
# Fix every random number generator the notebook relies on so runs are repeatable.
seed = 1029
random.seed(seed)                 # Python's built-in RNG
np.random.seed(seed)              # NumPy's global RNG
torch.manual_seed(seed)           # PyTorch RNG
torch.cuda.manual_seed(seed)      # current CUDA device
torch.cuda.manual_seed_all(seed)  # every CUDA device (multi-GPU)

# cuDNN is switched off entirely, with autotuning disabled and deterministic
# kernels requested.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

def _init_fn(worker_id):
    """Seed NumPy's global RNG for one worker process.

    Presumably meant to be passed as a DataLoader ``worker_init_fn`` (confirm
    against the loader construction code). Each worker is seeded with
    ``seed + worker_id``: this stays reproducible across runs, but workers no
    longer all draw the identical random stream, which is what happens when
    every worker is seeded with the same value.

    Args:
        worker_id: integer id of the worker being initialised.
    """
    np.random.seed(int(seed) + worker_id)

In [25]:
# All data files are located relative to the notebook's working directory.
path = os.getcwd()
# Layout 1: the data files sit directly in the working directory.
data_dir = path + '/'
# Layout 2: the data files live under ./data/. This assignment overrides the
# one above, so comment it out when the files are in the working directory.
data_dir = path +'/data/' #Uncomment for local system

#### *Verify filenames are consistent*

In [26]:
def _load_pickle(filename):
    """Unpickle ``data_dir + filename`` and close the file handle afterwards.

    NOTE: unpickling can execute arbitrary code; only use this on files that
    were produced by this project.
    """
    with open(data_dir + filename, 'rb') as handle:
        return pkl.load(handle)


train_loader = _load_pickle('train_dataloader.p')
train_loader_labelled = _load_pickle('train_labeled_dataloader.p')
train_loader_unlabelled = _load_pickle('train_unlabeled_dataloader.p')
val_loader = _load_pickle('val_dataloader.p')

In [27]:
# Vocabulary object; later cells read its ``tokens`` and ``ids`` attributes and
# call ``get_id``. Opened via ``with`` so the file handle is closed after loading.
# NOTE: unpickling can execute arbitrary code; only load project-generated files.
with open(data_dir + 'dictionary.p', 'rb') as dictionary_file:
    review_dict = pkl.load(dictionary_file)

In [28]:
#%conda install pytorch torchvision -c pytorch
## if torch.__version__ is not 1.3.1, run this cell then restart kernel

In [29]:
print(torch.__version__)

1.3.1


## PRE TRAINED WORD EMBEDDINGS 

In [8]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Every positional argument after ``word`` is treated as one vector
    component; the components are packed into a float16 NumPy array.

    Returns:
        ``(word, vector)`` tuple.
    """
    vector = np.asarray(arr, dtype='float16')
    return word, vector

In [9]:
def load_embeddings(path, encoding='utf-8'):
    """Read a whitespace-separated embedding text file into a dict.

    Each line is split on single spaces and handed to ``get_coefs``, so the
    first field becomes the key and the remaining fields the vector.

    Args:
        path: location of the embedding text file.
        encoding: text encoding used to read the file. Given explicitly so the
            result does not depend on the platform's default encoding.

    Returns:
        dict mapping each token to its vector.
    """
    with open(path, encoding=encoding) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))

In [10]:
def build_matrix(review_dict, embedding_index ,dim = 200):
    """Assemble the embedding matrix for the vocabulary in ``review_dict``.

    Row ``i`` holds the vector of the token whose id is ``i``; tokens that
    are absent from ``embedding_index`` keep an all-zero row.

    Args:
        review_dict: object exposing ``tokens`` (sized) and ``ids`` (token -> row id).
        embedding_index: mapping from token to its vector of length ``dim``.
        dim: width of every embedding vector.

    Returns:
        ``(matrix, missing)``: the ``(len(review_dict.tokens), dim)`` array and
        the list of tokens that had no entry in ``embedding_index``.
    """
    vocab_size = len(review_dict.tokens)
    matrix = np.zeros((vocab_size, dim))
    missing = []

    for token, row in review_dict.ids.items():
        try:
            vector = embedding_index[token]
        except KeyError:
            # No pre-trained vector: leave the zero row and remember the token.
            missing.append(token)
            continue
        matrix[row] = vector

    return matrix, missing

In [11]:
#glove_twitter = '../input/glove-global-vectors-for-word-representation/glove.twitter.27B.200d.txt' #Change loc for local system
glove_twitter = data_dir + 'glove.twitter.27B.200d.txt'

In [12]:
embedding_index = load_embeddings(glove_twitter)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [13]:
# NOTE: despite its name, glove_embedding_index is the (vocab_size, dim)
# embedding matrix returned by build_matrix, not a word -> vector lookup.
# unknown_words lists the vocabulary tokens that had no pre-trained vector.
glove_embedding_index,unknown_words = build_matrix(review_dict, embedding_index)
# The raw word -> vector dictionary is dropped once the matrix is built.
del embedding_index

In [14]:
len(review_dict.tokens)

16256

In [15]:
len(unknown_words)

4428

In [16]:
# for word in unknown_words:
#     print(word)

## Neural Network Class

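NOTE: Each data loader batch is a tuple. It is documented here as:
- tuple: (tokens, flagged_index, problematic)

The code below, however, unpacks every batch as `(tokens, labels, flagged_indices)`; confirm the actual element order against `generate_dataloaders.py`.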

In [30]:
class neuralNetBow_glove(nn.Module):
    """
    Bag-of-words sentence encoder on top of a pre-trained embedding matrix.

    A sentence embedding is the average of its token embeddings, with the
    embedding of the flagged token multiplied by ``upweight`` first.
    """
    # NOTE: we can't use linear layer until we take weighted average, otherwise it will
    # remember certain positions incorrectly (ie, 4th word has bigger weights vs 7th word)
    def __init__(self, embedding_matrix, upweight=10, unfrozen=False):
        """
        Args:
            embedding_matrix: 2-D array of shape (vocab_size, embed_size) used
                to initialise the embedding layer. It is also kept on the
                instance as ``embedding_matrix``.
            upweight: factor applied to the flagged token's embedding.
            unfrozen: when True the embedding weights receive gradients;
                by default they are frozen.
        """
        super(neuralNetBow_glove, self).__init__()
        self.embedding_matrix = embedding_matrix
        self.vocab_size = embedding_matrix.shape[0]
        self.embed_size = embedding_matrix.shape[1]

        self.embed = nn.Embedding(self.vocab_size, self.embed_size, padding_idx=0)
        self.embed.weight = nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float32))
        self.embed.weight.requires_grad = unfrozen

        self.upweight = upweight

    def forward(self, tokens, flagged_index):
        """Encode a batch of token-id sequences.

        Args:
            tokens: integer tensor of shape (batch_size, num_tokens).
            flagged_index: tensor of length batch_size giving, per sentence,
                the position of the token to upweight.

        Returns:
            Tensor of shape (batch_size, embed_size).
        """
        batch_size, num_tokens = tokens.shape
        embedding = self.embed(tokens)

        # Build both index tensors on the embedding's own device so the flagged
        # positions are never copied back to the CPU during the forward pass.
        rows = torch.arange(batch_size, device=embedding.device)
        cols = flagged_index.to(device=embedding.device, dtype=torch.long)
        embedding[rows, cols, :] *= self.upweight

        # Weighted average: the flagged token counts `upweight` times, every
        # other position once. NOTE(review): num_tokens is the full sequence
        # length, so any padding positions also count toward the denominator.
        embedding_ave = embedding.sum(1) / (num_tokens + self.upweight - 1)

        return embedding_ave

### Clustering Stuff

In [31]:
class KMeansCriterion(nn.Module):
    """K-means objective: scaled sum of squared distances to the nearest centroid."""

    def __init__(self, lmbda):
        """
        Args:
            lmbda: multiplier applied to the summed squared distances.
        """
        super(KMeansCriterion, self).__init__()
        self.lmbda = lmbda

    def forward(self, embeddings, centroids):
        """Assign every embedding to its closest centroid.

        Returns:
            ``(loss, assignments)``: ``lmbda`` times the sum of each embedding's
            squared distance to its nearest centroid, and the index of that
            centroid for every embedding.
        """
        # Broadcast so every embedding is compared against every centroid.
        offsets = embeddings.unsqueeze(1) - centroids
        squared_distances = offsets.pow(2).sum(dim=2)
        nearest_distance, nearest_index = torch.min(squared_distances, dim=1)
        return self.lmbda * nearest_distance.sum(), nearest_index

In [32]:
def centroid_init(k, d, dataloader, model, current_device):
    """Initialise centroids as the per-label mean of the model's sentence embeddings.

    The labels yielded by ``dataloader`` are used directly as cluster
    assignments, which avoids a random or all-zero initialisation.

    Args:
        k: number of clusters.
        d: dimensionality of a sentence embedding.
        dataloader: iterable yielding ``(tokens, labels, flagged_indices)`` batches.
        model: encoder called as ``model(tokens, flagged_indices)``; it is left
            in eval mode.
        current_device: device on which the accumulators and batches are placed.

    Returns:
        ``(k, d)`` tensor of centroid means. A cluster that received no
        examples gets an all-zero centroid rather than NaN.
    """
    centroid_sums = torch.zeros(k, d).to(current_device)
    centroid_counts = torch.zeros(k).to(current_device)

    model.eval()
    # Initialisation is pure inference: no autograd graph should be recorded.
    with torch.no_grad():
        for (tokens, labels, flagged_indices) in dataloader:
            cluster_assignments = labels.to(current_device)
            sentence_embed = model(tokens.to(current_device),flagged_indices.to(current_device))

            update_clusters(centroid_sums, centroid_counts,
                            cluster_assignments, sentence_embed.to(current_device))

    # Clamp the counts so an empty cluster divides by 1 (giving zeros) instead of 0.
    centroid_means = centroid_sums / centroid_counts.clamp(min=1)[:, None]
    return centroid_means.clone()

def update_clusters(centroid_sums, centroid_counts,
                    cluster_assignments, embeddings):
    """Accumulate one batch into the running per-cluster sums and counts, in place.

    Args:
        centroid_sums: ``(k, d)`` tensor; row ``c`` receives the sum of the
            embeddings assigned to cluster ``c``.
        centroid_counts: length-``k`` tensor; entry ``c`` is incremented by the
            number of embeddings assigned to cluster ``c``.
        cluster_assignments: integer tensor with one cluster index per embedding.
        embeddings: ``(batch, d)`` tensor of embeddings.
    """
    k = centroid_sums.size(0)

    # The running sums are bookkeeping only; detaching keeps them from
    # accumulating autograd history when the embeddings require grad.
    centroid_sums.index_add_(0, cluster_assignments, embeddings.detach())

    # Match the dtype and device of centroid_counts itself rather than relying
    # on a notebook-global `current_device`.
    bin_counts = torch.bincount(cluster_assignments, minlength=k)
    centroid_counts.add_(bin_counts.to(centroid_counts))

## Training Function

In [25]:
def train_model(model, centroids, criterion, optimizer, train_loader, valid_loader, num_epochs=10, path_to_save=None, print_every = 1000):
    """Alternate encoder updates with k-means centroid re-estimation.

    Every epoch:
      1. each training batch is embedded and scored against the current
         centroids; when the loss is differentiable a gradient step is taken;
      2. the centroids are recomputed from the embeddings gathered during the epoch;
      3. the validation loss is computed against the new centroids;
      4. if ``path_to_save`` is given, the model, centroids, losses and model
         options are written to disk.

    Args:
        model: encoder called as ``model(tokens, flagged_indices)``.
        centroids: ``(k, d)`` tensor of initial centroids.
        criterion: callable returning ``(loss, cluster_assignments)``.
        optimizer: optimizer over the model's parameters.
        train_loader: loader yielding ``(tokens, labels, flagged_indices)``.
        valid_loader: loader with the same batch layout, used for validation.
        num_epochs: number of passes over ``train_loader``.
        path_to_save: prefix prepended to each output file name (typically a
            directory ending in '/'); None disables saving.
        print_every: print the batch loss every this many batches.

    Returns:
        ``(model, centroids, train_losses, val_losses)``
    """
    train_losses=[]
    val_losses=[]

    # Use the device the model already lives on instead of guessing from GPU
    # availability, so batches and weights can never end up on different devices.
    try:
        current_device = next(model.parameters()).device
    except StopIteration:
        current_device = centroids.device
    centroids = centroids.to(current_device)

    # Create the output directory up front so the first save cannot fail on it.
    if path_to_save is not None:
        save_dir = os.path.dirname(path_to_save)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

    for epoch in range(num_epochs):
        print('{} | Epoch {}'.format(dt.datetime.now(), epoch))
        model.train()
        k = centroids.size(0)
        centroid_sums = torch.zeros_like(centroids)
        centroid_counts = torch.zeros(k, device=current_device)
        total_epoch_loss = 0

        # Wrapping the loader (not the enumerate) lets tqdm see the number of batches.
        for i, (tokens, labels, flagged_indices) in enumerate(tqdm(train_loader)):
            tokens = tokens.to(current_device)
            flagged_indices = flagged_indices.to(current_device)

            # forward pass and compute loss against fixed centroids
            sentence_embed = model(tokens,flagged_indices)
            cluster_loss, cluster_assignments = criterion(sentence_embed, centroids.detach())

            # run update step; with fully frozen parameters the loss carries no
            # graph, so there is nothing to back-propagate and the step is skipped
            optimizer.zero_grad()
            if cluster_loss.requires_grad:
                cluster_loss.backward()
                optimizer.step()

            #Add loss to the epoch loss
            total_epoch_loss += cluster_loss.detach()

            # store centroid sums and counts in memory for later centering
            update_clusters(centroid_sums, centroid_counts,
                            cluster_assignments, sentence_embed.detach())

            if i % print_every == 0:
                losses = cluster_loss.detach()/len(tokens)
                print('Average training loss at batch ',i,': %.3f' % losses)

        total_epoch_loss /= len(train_loader.dataset)
        train_losses.append(total_epoch_loss)
        print('Average training loss after epoch ',epoch,': %.3f' % total_epoch_loss)

        # update centroids based on the assignments gathered this epoch.
        # NOTE(review): the +1 avoids dividing by zero for an empty cluster but
        # also shrinks every centroid slightly toward the origin.
        centroids = centroid_sums / (centroid_counts[:, None] + 1)

        # calculate validation loss after every epoch (inference only)
        model.eval()
        total_validation_loss = 0
        with torch.no_grad():
            for tokens, labels, flagged_indices in valid_loader:
                tokens = tokens.to(current_device)
                flagged_indices = flagged_indices.to(current_device)

                sentence_embed = model(tokens,flagged_indices)
                cluster_loss, _ = criterion(sentence_embed, centroids)

                total_validation_loss += cluster_loss.detach()

        total_validation_loss /= len(valid_loader.dataset)
        val_losses.append(total_validation_loss)
        print('Average validation loss after epoch ',epoch,': %.3f' % total_validation_loss)

        if path_to_save is not None:
            opts = {"embedding_matrix":model.embedding_matrix} #change options depending on model inputs required
            torch.save(model.state_dict(), path_to_save+'model_dict.pt')
            torch.save(centroids, path_to_save+'centroids')
            torch.save(train_losses, path_to_save+'train_losses')
            torch.save(val_losses, path_to_save+'val_losses')
            torch.save(opts, path_to_save+'opts')

    return model, centroids, train_losses, val_losses

In [17]:
# Run on the GPU whenever PyTorch can see at least one CUDA device.
num_gpus = torch.cuda.device_count()
current_device = 'cuda' if num_gpus > 0 else 'cpu'

# Encoder built with the class defaults: upweight=10 and frozen embeddings
# (unfrozen=False), then moved to the selected device.
model = neuralNetBow_glove(glove_embedding_index).to(current_device)

In [28]:
# Two clusters, initialised from the labelled examples. The centroid width is
# read from the model so it cannot drift from the embedding matrix's dimension.
centroids = centroid_init(2, model.embed_size, train_loader_labelled, model, current_device)
criterion = KMeansCriterion(1).to(current_device)
# NOTE: if the model was built with unfrozen=False, none of its parameters
# require grad, so this optimizer has nothing to update.
optimizer = torch.optim.Adam(model.parameters(), 0.01, amsgrad=True)

In [29]:
centroids

tensor([[ 7.3920e-02,  4.7916e-02, -3.4403e-03,  6.2405e-02, -1.4993e-02,
          9.7727e-02,  3.1345e-01, -2.0820e-02, -5.8466e-02, -6.3985e-02,
         -2.5678e-02, -6.1849e-02, -4.1237e-01, -5.8385e-02, -2.4222e-02,
          4.8410e-02, -1.2295e-02,  2.9833e-02, -7.6221e-02, -2.1353e-02,
         -1.3972e-02,  7.9938e-03, -3.5308e-02,  1.9144e-02, -6.5173e-02,
          4.9148e-01,  2.8338e-03,  8.5547e-02,  1.0131e-01,  3.7258e-03,
         -2.4144e-02, -7.5183e-02, -4.1977e-02, -5.0204e-03,  3.2769e-02,
          5.0823e-02,  3.5903e-02, -2.0590e-02,  2.4465e-02,  1.4467e-02,
          2.2642e-01, -1.7756e-02,  7.1540e-02,  9.6466e-03,  1.8655e-02,
         -3.1804e-02,  9.3824e-02, -1.1422e-02, -6.5178e-02,  2.9992e-02,
          2.2362e-02,  3.6858e-02, -5.0271e-02, -1.5484e-02,  5.2415e-03,
          2.4776e-02, -3.1752e-02, -1.2649e-02, -2.4775e-02, -1.0467e-02,
          1.2805e-02,  3.1579e-02, -3.3153e-02, -2.3706e-02,  5.6861e-02,
          3.1290e-02, -3.6886e-02, -2.

In [30]:
current_device

'cpu'

In [31]:
review_dict.get_id("the")

41

In [32]:
torch.tensor([41])

tensor([41])

In [33]:
model.embed(torch.tensor([41]).to(current_device))

tensor([[ 4.9341e-01,  3.5693e-01,  6.6064e-01, -3.2990e-02,  2.4988e-01,
          2.5928e-01, -2.7176e-02,  6.8420e-02, -2.9053e-01, -4.5703e-01,
         -7.7942e-02,  3.2520e-01, -1.4854e+00, -6.7444e-02, -1.7029e-01,
         -9.2926e-03,  3.4619e-01, -1.1574e-02,  3.7964e-02,  4.5605e-01,
          8.0505e-02,  1.5308e-01, -1.5308e-01, -1.8811e-01, -1.8201e-01,
          8.7256e-01,  3.9795e-01,  4.0991e-01,  4.4971e-01, -1.9646e-03,
         -4.1138e-02, -4.7882e-02, -2.4048e-01, -8.6853e-02,  1.4183e-02,
         -2.3755e-01,  2.5171e-01,  2.8540e-01,  4.4507e-01, -4.9634e-01,
         -1.2708e-01, -1.7480e-01,  8.2214e-02,  4.5410e-02,  5.1709e-01,
          3.4546e-02, -8.5815e-02, -3.4912e-01,  5.2197e-01, -3.9502e-01,
          6.4148e-02, -4.2017e-01, -1.5942e-01,  1.8286e-01, -5.7892e-02,
         -1.9180e-02, -4.4556e-01,  3.1543e-01, -1.6101e-01, -9.2163e-02,
         -2.4963e-01, -1.3895e-03, -4.2651e-01, -1.7932e-01,  8.1665e-02,
          1.8323e-01, -3.2056e-01, -1.

In [34]:
# Checkpoints for this run live under <cwd>/models/<model_folder>.
path = os.getcwd()
model_folder = 'baseline_frozen_glove/'
model_dir = '{}/models/{}'.format(path, model_folder)

In [35]:
baseline_model, baseline_centroids, baseline_train_losses, baseline_val_losses = train_model(model, centroids, criterion, optimizer, train_loader, val_loader, num_epochs=10, path_to_save=model_dir)


2019-11-21 16:12:00.616240 | Epoch 0


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.820
Average training loss at batch  1000 : 2.147
Average training loss at batch  2000 : 1.839
Average training loss at batch  3000 : 1.942

Average training loss after epoch  0 : 1.985
Average validation loss after epoch  0 : 1.642
2019-11-21 16:12:08.619744 | Epoch 1


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.572
Average training loss at batch  1000 : 1.364
Average training loss at batch  2000 : 1.769
Average training loss at batch  3000 : 1.390

Average training loss after epoch  1 : 1.649
Average validation loss after epoch  1 : 1.632
2019-11-21 16:12:14.698424 | Epoch 2


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.865
Average training loss at batch  1000 : 1.626
Average training loss at batch  2000 : 1.297
Average training loss at batch  3000 : 2.058

Average training loss after epoch  2 : 1.636
Average validation loss after epoch  2 : 1.632
2019-11-21 16:12:20.484568 | Epoch 3


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.970
Average training loss at batch  1000 : 1.627
Average training loss at batch  2000 : 1.615
Average training loss at batch  3000 : 1.881

Average training loss after epoch  3 : 1.635
Average validation loss after epoch  3 : 1.633
2019-11-21 16:12:26.715221 | Epoch 4


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.869
Average training loss at batch  1000 : 1.615
Average training loss at batch  2000 : 1.832
Average training loss at batch  3000 : 1.325

Average training loss after epoch  4 : 1.635
Average validation loss after epoch  4 : 1.633
2019-11-21 16:12:33.533144 | Epoch 5


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.296
Average training loss at batch  1000 : 1.575
Average training loss at batch  2000 : 1.594
Average training loss at batch  3000 : 1.498

Average training loss after epoch  5 : 1.635
Average validation loss after epoch  5 : 1.633
2019-11-21 16:12:39.381253 | Epoch 6


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.587
Average training loss at batch  1000 : 1.433
Average training loss at batch  2000 : 1.450
Average training loss at batch  3000 : 1.872

Average training loss after epoch  6 : 1.635
Average validation loss after epoch  6 : 1.633
2019-11-21 16:12:44.126187 | Epoch 7


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.731
Average training loss at batch  1000 : 1.393
Average training loss at batch  2000 : 1.558
Average training loss at batch  3000 : 1.585

Average training loss after epoch  7 : 1.635
Average validation loss after epoch  7 : 1.633
2019-11-21 16:12:49.228396 | Epoch 8


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.310
Average training loss at batch  1000 : 1.862
Average training loss at batch  2000 : 1.613
Average training loss at batch  3000 : 1.350

Average training loss after epoch  8 : 1.635
Average validation loss after epoch  8 : 1.633
2019-11-21 16:12:54.127615 | Epoch 9


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.503
Average training loss at batch  1000 : 1.480
Average training loss at batch  2000 : 1.730
Average training loss at batch  3000 : 1.436

Average training loss after epoch  9 : 1.635
Average validation loss after epoch  9 : 1.633


In [40]:
# #Only needed for Kaggle

# from IPython.display import FileLink, FileLinks 
# FileLinks('.') #lists all downloadable files on server

## Evaluate Model

To run this by itself, run:
- Get Dataloaders
- class definitions (model, clustering)

In [33]:
# Evaluation runs on the GPU when one is visible, otherwise on the CPU.
num_gpus = torch.cuda.device_count()
current_device = 'cuda' if num_gpus > 0 else 'cpu'

In [34]:
## This cell will change for each model
model_folder = 'baseline_frozen_glove/'

criterion = KMeansCriterion(1)
criterion = criterion.to(current_device)

path = os.getcwd()
model_dir = path + '/models/' + model_folder

opts = torch.load(model_dir+'opts')
model = neuralNetBow_glove(opts['embedding_matrix']) #change here depending on model
model.load_state_dict(torch.load(model_dir+'model_dict.pt',map_location=lambda storage, loc: storage))
model = model.to(current_device)
centroids = torch.load(model_dir+'centroids',map_location=lambda storage, loc: storage)

In [35]:
TP_cluster, FP_cluster=evaluation.main(model, centroids, val_loader, criterion, data_dir, current_device)

Total examples in val loader: 455
Assigned to cluster 1: 295
TP_rate: 0.8915254237288136
FP_rate: 0.10847457627118644
FN_rate: 0.85
TN_rate: 0.15


Accuracy: 0.5207627118644068
Precision: 0.8915254237288136
Recall: 0.5119221411192214
F1 score: 0.650386398763524


In [37]:
TP_cluster[TP_cluster.original == 0]

Unnamed: 0,review,index,flagged_word,assignment,original
5,excellent job in putting together what i envisioned .,4,together,1,0
21,this is my second order and will order again .,4,order,1,0
46,he took a vague concept and crafted a perfect logo for what we were looking for .,3,vague,1,0
59,creative fulfillment .,0,creative,1,0
82,"great designer , friendly , responsive , will order more work .",8,order,1,0
104,i shared the logo with co workers and they all loved it .,6,workers,1,0
105,dayalmodal3322 ( d ) provided the winning submission to my contest .,6,winning,1,0
154,great communication but my job could not be completed due to technical situation,11,technical,1,0
170,strong ability to execute feedback with great artistic vision .,6,great,1,0
179,very professional looking mascot and can adapt to unique ideas and requirements .,8,unique,1,0


In [38]:
FP_cluster[FP_cluster.original == 0]

Unnamed: 0,review,index,flagged_word,assignment,original
17,listened and created a great looking logo for us .,4,great,0,0
25,"i am very happy with all that transpired between the design firm and the winning artist , alicia .",11,firm,0,0
63,"even after lack of contact on my part , he was prompt in assisting me and making whatever changes i asked .",13,assisting,0,0
92,thank you very much and i look forward to working together again .,10,together,0,0
96,"really good , imaginative designer and pretty much nailed it first time",6,pretty,0,0
97,"the first submission i received was the eventual winner , with only minor changes suggested by me .",8,winner,0,0
106,"in this divided world we can only come together in our united pursuit of beauty in an artistic collaboration , and working with chris has been just that .",8,together,0,0
139,james did really great and went to great lengths to get it right for me .,7,great,0,0
143,will be working together in the future and am happy to have found them !,3,together,0,0
149,"nick is amazingly knowledgeable in seo , and helped me better understand how my competitors were acquiring links and improving their search visibility .",11,understand,0,0
