In [1]:
# ## KAGGLE ONLY
# from shutil import copyfile
# copyfile(src="../input/inputs/generate_dataloaders.py", dst="../working/generate_dataloaders.py")
# copyfile(src="../input/inputs/train_dataloader.p", dst="../working/train_dataloader.p")
# copyfile(src="../input/inputs/val_dataloader.p", dst="../working/val_dataloader.p")
# copyfile(src="../input/inputs/centroids_dataloader.p", dst="../working/ground_truth_dataloader.p")
# copyfile(src="../input/inputs/dictionary.p", dst="../working/dictionary.p")

# copyfile(src="../input/input2/train_unlabeld_dataloader.p", dst="../working/train_unlabelled_dataloader.p")
# copyfile(src="../input/input2/train_labeled_dataloader.p", dst="../working/train_labelled_dataloader.p")

In [2]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

#from datasets import get_mnist_dataset, get_data_loader
#from utils import *
#from models import *

import pickle as pkl
import os
import datetime as dt
import pandas as pd
import random

from generate_dataloaders import *
from tqdm import tqdm_notebook as tqdm

import evaluation
import importlib
importlib.reload(evaluation)

<module 'evaluation' from '/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/evaluation.py'>

## Get Dataloaders

In [3]:
seed = 1029
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
np.random.seed(seed)  # Numpy module.
random.seed(seed)  # Python random module.
torch.manual_seed(seed)
torch.backends.cudnn.enabled = False 
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

def _init_fn(worker_id):
    np.random.seed(int(seed))

In [4]:
path = os.getcwd()
data_dir = path + '/data/'

In [5]:
train_loader = pkl.load(open(data_dir + 'train_dataloader.p','rb'))
train_loader_labelled = pkl.load(open(data_dir + 'train_labeled_dataloader.p','rb'))
train_loader_unlabelled = pkl.load(open(data_dir + 'train_unlabeled_dataloader.p','rb'))
val_loader = pkl.load(open(data_dir + 'val_dataloader.p','rb'))

In [6]:
#%conda install pytorch torchvision -c pytorch
## if torch.__version__ is not 1.3.1, run this cell then restart kernel

In [7]:
print(torch.__version__)

1.3.1


## Neural Network Class

NOTE: Data loader is defined as:
- tuple: (tokens, flagged_index, problematic)

In [8]:
class neuralNetBow(nn.Module):
    """
    BagOfWords classification model
    """
    # NOTE: we can't use linear layer until we take weighted average, otherwise it will
    # remember certain positions incorrectly (ie, 4th word has bigger weights vs 7th word)
    def __init__(self, vocab_size, emb_dim, upweight=10):
        super(neuralNetBow, self).__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.upweight = upweight
    
    def forward(self, tokens, flagged_index):
        batch_size, num_tokens = tokens.shape
        embedding = self.embed(tokens)
        
        # upweight by flagged_index
        embedding[torch.LongTensor(range(batch_size)),flagged_index.type(torch.LongTensor),:] *= self.upweight
        
        # average across embeddings
        embedding_ave = embedding.sum(1) / (num_tokens + self.upweight - 1)
        
        return embedding_ave

### Clustering Stuff

In [9]:
class KMeansCriterion(nn.Module):
    
    def __init__(self, lmbda):
        super().__init__()
        self.lmbda = lmbda
    
    def forward(self, embeddings, centroids):
        distances = torch.sum((embeddings[:, None, :] - centroids)**2, 2)
        cluster_distances, cluster_assignments = distances.min(1)
        loss = self.lmbda * cluster_distances.sum()
        return loss, cluster_assignments

In [10]:
def centroid_init(k, d, dataloader, model, current_device):
    ## Here we ideally don't want to do randomized/zero initialization
    centroid_sums = torch.zeros(k, d).to(current_device)
    centroid_counts = torch.zeros(k).to(current_device)
    for (tokens, labels, flagged_indices) in dataloader:
        # cluster_assignments = torch.LongTensor(tokens.size(0)).random_(k)
        cluster_assignments = labels.to(current_device)
        
        model.eval()
        sentence_embed = model(tokens.to(current_device),flagged_indices.to(current_device))
    
        update_clusters(centroid_sums, centroid_counts,
                        cluster_assignments, sentence_embed.to(current_device))
    
    centroid_means = centroid_sums / centroid_counts[:, None].to(current_device)
    return centroid_means.clone()

def update_clusters(centroid_sums, centroid_counts,
                    cluster_assignments, embeddings):
    k = centroid_sums.size(0)

    centroid_sums.index_add_(0, cluster_assignments, embeddings)
    bin_counts = torch.bincount(cluster_assignments,minlength=k).type(torch.FloatTensor).to(current_device)
    centroid_counts.add_(bin_counts)
    
    #np_cluster_assignments = cluster_assignments.to('cpu')
    #np_counts = np.bincount(np_cluster_assignments.data.numpy(), minlength=k)
    #centroid_counts.add_(torch.FloatTensor(np_counts))

## Training Function

In [19]:
def train_model(model, centroids, criterion, optimizer, train_loader, valid_loader, num_epochs=10, path_to_save=None, print_every = 1000):

    train_losses=[]
    val_losses=[]
    num_gpus = torch.cuda.device_count()
    if num_gpus > 0:
        current_device = 'cuda'
    else:
        current_device = 'cpu'
    
    for epoch in range(num_epochs):
        print('{} | Epoch {}'.format(dt.datetime.now(), epoch))
        model.train()
        k, d = centroids.size()
        centroid_sums = torch.zeros_like(centroids).to(current_device)
        centroid_counts = torch.zeros(k).to(current_device)
        total_epoch_loss = 0

        # run one epoch of gradient descent on autoencoders wrt centroids
        for i, (tokens, labels, flagged_indices) in tqdm(enumerate(train_loader)):
            tokens = tokens.to(current_device)
            labels = labels.to(current_device)
            flagged_indices = flagged_indices.to(current_device)

            # forward pass and compute loss
            sentence_embed = model(tokens,flagged_indices)
            cluster_loss, cluster_assignments = criterion(sentence_embed, centroids.detach())

            # run update step
            optimizer.zero_grad()
            cluster_loss.backward()
            optimizer.step()
            
            #Add loss to the epoch loss
            total_epoch_loss += cluster_loss.data

            # store centroid sums and counts in memory for later centering
            update_clusters(centroid_sums, centroid_counts,
                            cluster_assignments, sentence_embed)

            if i % print_every == 0:
                losses = cluster_loss.data/len(tokens)
                print('Average training loss at batch ',i,': %.3f' % losses)
            
        total_epoch_loss /= len(train_loader.dataset)
        train_losses.append(total_epoch_loss)
        print('Average training loss after epoch ',epoch,': %.3f' % total_epoch_loss)
        
        # update centroids based on assignments from autoencoders
        centroids = centroid_sums / (centroid_counts[:, None] + 1).to(current_device)
        
        # calculate validation loss after every epoch
        total_validation_loss = 0
        for i, (tokens, labels, flagged_indices) in enumerate(valid_loader):
            model.eval()
            tokens = tokens.to(current_device)
            labels = labels.to(current_device)
            flagged_indices = flagged_indices.to(current_device)
            
            # forward pass and compute loss
            sentence_embed = model(tokens,flagged_indices)
            cluster_loss, cluster_assignments = criterion(sentence_embed, centroids)
            
            #Add loss to the validation loss
            total_validation_loss += cluster_loss.data

        total_validation_loss /= len(valid_loader.dataset)
        val_losses.append(total_validation_loss)
        print('Average validation loss after epoch ',epoch,': %.3f' % total_validation_loss)
        
        if path_to_save == None:
            pass
        else:
            torch.save(model.state_dict(), path_to_save+'model_dict.pt')
            torch.save(centroids, path_to_save+'centroids')
            torch.save(train_losses, path_to_save+'train_losses')
            torch.save(val_losses, path_to_save+'val_losses')
            torch.save(opts, path_to_save+'opts') #change options depending on model inputs required
        
    return model, centroids, train_losses, val_losses

In [12]:
opts = {
    'vocab_size': 20000,
    'emb_dim': 512
}

In [13]:
num_gpus = torch.cuda.device_count()
if num_gpus > 0:
    current_device = 'cuda'
else:
    current_device = 'cpu'

model = neuralNetBow(opts['vocab_size'], opts['emb_dim']).to(current_device)

In [14]:
centroids = centroid_init(2, opts['emb_dim'],train_loader_labelled, model, current_device)
criterion = KMeansCriterion(1).to(current_device)
optimizer = torch.optim.Adam(model.parameters(), 0.01, amsgrad=True)

In [15]:
centroids

tensor([[ 0.1682,  0.0006, -0.0156,  ...,  0.1111, -0.0701, -0.0494],
        [ 0.3193,  0.0462, -0.0552,  ..., -0.0535, -0.0213,  0.1408]],
       grad_fn=<CloneBackward>)

In [16]:
current_device

'cpu'

In [17]:
path = os.getcwd()
model_folder = 'baseline_randomized_embeddings/'
model_dir = path + '/models/' + model_folder

In [20]:
baseline_model, baseline_centroids, baseline_train_losses, baseline_val_losses = train_model(model, centroids, criterion, optimizer, train_loader, val_loader, num_epochs=10, path_to_save=model_dir)


2019-11-21 15:41:31.233774 | Epoch 0


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 30.125
Average training loss at batch  1000 : 0.925
Average training loss at batch  2000 : 0.586
Average training loss at batch  3000 : 0.572

Average training loss after epoch  0 : 1.975
Average validation loss after epoch  0 : 0.213
2019-11-21 15:47:01.985168 | Epoch 1


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.596
Average training loss at batch  1000 : 0.082
Average training loss at batch  2000 : 0.085
Average training loss at batch  3000 : 0.076

Average training loss after epoch  1 : 0.149
Average validation loss after epoch  1 : 0.098
2019-11-21 15:53:33.331563 | Epoch 2


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.038
Average training loss at batch  1000 : 0.069
Average training loss at batch  2000 : 0.125
Average training loss at batch  3000 : 0.031

Average training loss after epoch  2 : 0.072
Average validation loss after epoch  2 : 0.069
2019-11-21 15:59:58.359746 | Epoch 3


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.040
Average training loss at batch  1000 : 0.026
Average training loss at batch  2000 : 0.131
Average training loss at batch  3000 : 0.025

Average training loss after epoch  3 : 0.045
Average validation loss after epoch  3 : 0.055
2019-11-21 16:06:37.852187 | Epoch 4


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.032
Average training loss at batch  1000 : 0.029
Average training loss at batch  2000 : 0.028
Average training loss at batch  3000 : 0.038

Average training loss after epoch  4 : 0.033
Average validation loss after epoch  4 : 0.048
2019-11-21 16:14:48.120149 | Epoch 5


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.019
Average training loss at batch  1000 : 0.022
Average training loss at batch  2000 : 0.028
Average training loss at batch  3000 : 0.016

Average training loss after epoch  5 : 0.025
Average validation loss after epoch  5 : 0.043
2019-11-21 16:23:01.526678 | Epoch 6


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.015
Average training loss at batch  1000 : 0.015
Average training loss at batch  2000 : 0.021
Average training loss at batch  3000 : 0.027

Average training loss after epoch  6 : 0.021
Average validation loss after epoch  6 : 0.040
2019-11-21 16:30:53.551171 | Epoch 7


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.017
Average training loss at batch  1000 : 0.015
Average training loss at batch  2000 : 0.014
Average training loss at batch  3000 : 0.018

Average training loss after epoch  7 : 0.018
Average validation loss after epoch  7 : 0.038
2019-11-21 16:38:02.390890 | Epoch 8


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.047
Average training loss at batch  1000 : 0.013
Average training loss at batch  2000 : 0.019
Average training loss at batch  3000 : 0.016

Average training loss after epoch  8 : 0.016
Average validation loss after epoch  8 : 0.036
2019-11-21 16:44:23.735837 | Epoch 9


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.018
Average training loss at batch  1000 : 0.017
Average training loss at batch  2000 : 0.011
Average training loss at batch  3000 : 0.010

Average training loss after epoch  9 : 0.015
Average validation loss after epoch  9 : 0.035


In [21]:
model_dir

'/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/'

## Evaluation

In [22]:
## This cell will change for each model
model_folder = 'baseline_randomized_embeddings/'
criterion = KMeansCriterion(1)

model = model.to(current_device)
criterion = criterion.to(current_device)

#load model
path = os.getcwd()
model_dir = path + '/models/' + model_folder

opts = torch.load(model_dir+'opts')
model = neuralNetBow(opts['vocab_size'], opts['emb_dim']) #change according to model inputs
model.load_state_dict(torch.load(model_dir+'model_dict.pt',map_location=lambda storage, loc: storage))
centroids = torch.load(model_dir+'centroids',map_location=lambda storage, loc: storage)

In [23]:
TP_cluster, FP_cluster=evaluation.main(model, centroids, val_loader, criterion, data_dir, current_device)

Total examples in val loader: 454
Assigned to cluster 1: 209
TP_rate: 0.9521531100478469
FP_rate: 0.04784688995215311
FN_rate: 0.9521531100478469
TN_rate: 0.04784688995215311


Accuracy: 0.5
Precision: 0.9521531100478469
Recall: 0.5
F1 score: 0.6556836902800659
