In [2]:
# ## KAGGLE ONLY
# from shutil import copyfile
# copyfile(src="../input/inputs/generate_dataloaders.py", dst="../working/generate_dataloaders.py")
# copyfile(src="../input/inputs/train_dataloader.p", dst="../working/train_dataloader.p")
# copyfile(src="../input/inputs/val_dataloader.p", dst="../working/val_dataloader.p")
# copyfile(src="../input/inputs/centroids_dataloader.p", dst="../working/ground_truth_dataloader.p")
# copyfile(src="../input/inputs/dictionary.p", dst="../working/dictionary.p")

# copyfile(src="../input/input2/train_unlabeld_dataloader.p", dst="../working/train_unlabelled_dataloader.p")
# copyfile(src="../input/input2/train_labeled_dataloader.p", dst="../working/train_labelled_dataloader.p")

In [45]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

#from datasets import get_mnist_dataset, get_data_loader
#from utils import *
#from models import *

import pickle as pkl
import os
import datetime as dt
import pandas as pd
import random

from generate_dataloaders import *
from tqdm import tqdm_notebook as tqdm

import evaluation
import importlib
importlib.reload(evaluation)

<module 'evaluation' from '/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/evaluation.py'>

## Get Dataloaders

In [6]:
# Single seed shared by every random number generator used in this notebook.
seed = 1029

# Seed Python, NumPy and PyTorch (CPU and all GPUs). Re-seeding with the same
# value is idempotent, so each generator only needs to be seeded once.
random.seed(seed)  # Python random module.
np.random.seed(seed)  # Numpy module.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# cuDNN is switched off and pinned to deterministic, non-benchmarking mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False


def _init_fn(worker_id):
    """Worker initialisation hook: re-seed NumPy with the notebook-wide seed.

    `worker_id` is accepted but ignored, so every worker receives the same
    NumPy seed.
    """
    np.random.seed(int(seed))

In [7]:
# The pickled dataloaders are read from a `data/` folder inside the current
# working directory; the trailing slash lets file names be appended directly.
path = os.getcwd()
data_dir = '{}/data/'.format(path)

In [8]:
def _load_pickled(filename):
    """Unpickle `filename` from `data_dir`, closing the file handle afterwards.

    SECURITY NOTE: unpickling executes arbitrary code embedded in the file, so
    only load pickles produced by this project.
    """
    with open(data_dir + filename, 'rb') as handle:
        return pkl.load(handle)


# NOTE(review): presumably these pickles need the classes brought in by
# `from generate_dataloaders import *` to be importable -- confirm.
train_loader = _load_pickled('train_dataloader.p')
train_loader_labelled = _load_pickled('train_labeled_dataloader.p')
train_loader_unlabelled = _load_pickled('train_unlabeled_dataloader.p')
val_loader = _load_pickled('val_dataloader.p')

In [9]:
#%conda install pytorch torchvision -c pytorch
## if torch.__version__ is not 1.3.1, run this cell then restart kernel

In [10]:
# Show the installed PyTorch version (the recorded run of this cell printed 1.3.1).
print(torch.__version__)

1.3.1


## Neural Network Class

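NOTE: Each batch yielded by the data loaders is unpacked throughout this notebook as:
- tuple: (tokens, labels, flagged_indices), where `labels` is the problematic / non-problematic class used as the cluster id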

In [11]:
class neuralNetBow(nn.Module):
    """
    BagOfWords sentence encoder.

    Each token is embedded, the embedding at the flagged position of every
    sentence is multiplied by `upweight`, and the token embeddings are then
    averaged into a single vector per sentence.

    `opts` is a dict providing:
      - 'vocab_size': number of rows in the embedding table
      - 'emb_dim':    embedding dimension
      - 'upweight':   multiplier applied to the flagged token's embedding
    """
    # NOTE: we can't use linear layer until we take weighted average, otherwise it will
    # remember certain positions incorrectly (ie, 4th word has bigger weights vs 7th word)
    def __init__(self, opts):
        super(neuralNetBow, self).__init__()
        # index 0 is the padding token: its embedding stays a zero vector
        self.embed = nn.Embedding(opts['vocab_size'], opts['emb_dim'], padding_idx=0)
        self.upweight = opts['upweight']

    def forward(self, tokens, flagged_index):
        """
        Args:
            tokens: 2-D integer tensor (batch_size, num_tokens) of token ids.
            flagged_index: 1-D tensor with, per sentence, the position of the
                flagged token; any numeric dtype is accepted and cast to long.

        Returns:
            Tensor (batch_size, emb_dim) holding the weighted average embedding.
        """
        batch_size, num_tokens = tokens.shape
        embedding = self.embed(tokens)

        # upweight by flagged_index; the index tensors are created on the same
        # device as the embeddings instead of being forced onto the CPU
        rows = torch.arange(batch_size, device=embedding.device)
        cols = flagged_index.to(device=embedding.device, dtype=torch.long)
        embedding[rows, cols, :] *= self.upweight

        # average across embeddings: the flagged token counts `upweight` times,
        # every other position once. NOTE: num_tokens is the padded length, so
        # padding positions (zero vectors) are part of the denominator too.
        embedding_ave = embedding.sum(1) / (num_tokens + self.upweight - 1)

        return embedding_ave

### Clustering Stuff

In [12]:
class KMeansCriterion(nn.Module):
    """K-means objective: total squared distance of each embedding to its closest centroid."""

    def forward(self, embeddings, centroids):
        """
        Args:
            embeddings: tensor (n, d) of points.
            centroids: tensor (k, d) of cluster centres.

        Returns:
            (loss, cluster_assignments): the summed squared distance to the
            nearest centroid, and a length-n tensor with the index of that
            centroid for every point.
        """
        # (n, 1, d) - (k, d) broadcasts to (n, k, d) pairwise differences
        offsets = embeddings.unsqueeze(1) - centroids
        squared_distances = offsets.pow(2).sum(dim=2)
        nearest_distance, nearest_index = torch.min(squared_distances, dim=1)
        return nearest_distance.sum(), nearest_index

In [13]:
def centroid_init(k, d, dataloader, model, current_device):
    """Initialise the k centroids as the mean embedding of each labelled class.

    Rather than a randomized/zero initialization, every batch of `dataloader`
    is embedded with `model` and its label is used directly as the cluster id.

    Args:
        k: number of clusters (labels are expected to lie in [0, k)).
        d: embedding dimension produced by `model`.
        dataloader: iterable of (tokens, labels, flagged_indices) batches.
        model: encoder called as model(tokens, flagged_indices).
        current_device: device the accumulation runs on.

    Returns:
        Tensor (k, d) of per-class mean embeddings, detached from autograd.
        A class with no samples gets an all-zero centroid instead of NaN.

    Side effect: `model` is left in eval mode.
    """
    centroid_sums = torch.zeros(k, d).to(current_device)
    centroid_counts = torch.zeros(k).to(current_device)

    model.eval()
    # Pure inference: without no_grad the in-place accumulation would keep the
    # autograd graph of every batch alive inside centroid_sums.
    with torch.no_grad():
        for (tokens, labels, flagged_indices) in dataloader:
            cluster_assignments = labels.to(current_device)
            sentence_embed = model(tokens.to(current_device), flagged_indices.to(current_device))

            update_clusters(centroid_sums, centroid_counts,
                            cluster_assignments, sentence_embed.to(current_device))

    # clamp avoids 0/0 = NaN for a class that never appears in the dataloader
    centroid_means = centroid_sums / centroid_counts.clamp(min=1)[:, None]
    return centroid_means

def update_clusters(centroid_sums, centroid_counts,
                    cluster_assignments, embeddings):
    """Accumulate one batch into the running per-cluster sums and counts, in place.

    Args:
        centroid_sums: tensor (k, d); row c receives the sum of the embeddings
            assigned to cluster c.
        centroid_counts: tensor (k,); entry c is incremented by the number of
            embeddings assigned to cluster c.
        cluster_assignments: 1-D long tensor with the cluster id of each embedding.
        embeddings: tensor (n, d) of embeddings for this batch.

    Returns:
        None; both accumulators are modified in place.
    """
    k = centroid_sums.size(0)

    # detach: the accumulators are statistics, not part of the loss, so they
    # must not hold on to the autograd graph of every batch
    centroid_sums.index_add_(0, cluster_assignments, embeddings.detach())

    # follow the accumulator's own device/dtype rather than a notebook-level global
    bin_counts = torch.bincount(cluster_assignments, minlength=k)
    centroid_counts.add_(bin_counts.to(device=centroid_counts.device,
                                       dtype=centroid_counts.dtype))

## Training Function

In [26]:
def train_model(model, centroids, criterion, optimizer, train_loader, valid_loader, num_epochs=10, path_to_save=None, print_every = 1000, opts=None):
    """Alternate gradient steps on the encoder with k-means centroid updates.

    Every epoch:
      1. each training batch is embedded, assigned to its nearest (fixed)
         centroid, and the encoder takes one optimizer step on the k-means loss;
      2. the centroids are recomputed from the embeddings/assignments gathered
         during the epoch;
      3. the per-sample validation loss is measured against the new centroids;
      4. if `path_to_save` is given, the model, centroids, losses and options
         are written there (overwriting the previous epoch's files).

    Args:
        model: encoder called as model(tokens, flagged_indices); expected to
            already live on the device selected below.
        centroids: tensor (k, d) with the initial cluster centres.
        criterion: callable returning (loss, cluster_assignments).
        optimizer: optimizer over the model parameters.
        train_loader / valid_loader: loaders yielding
            (tokens, labels, flagged_indices) batches and exposing `.dataset`.
        num_epochs: number of epochs to run.
        path_to_save: directory prefix (with trailing separator) for the
            checkpoints, or None to skip saving.
        print_every: print the running batch loss every this many batches.
        opts: hyperparameter dict stored next to the checkpoints. When omitted,
            the notebook-level `opts` variable is used if it exists.

    Returns:
        (model, centroids, train_losses, val_losses); the loss lists hold one
        0-dim tensor (average loss per sample) per epoch.
    """
    train_losses = []
    val_losses = []
    current_device = 'cuda' if torch.cuda.device_count() > 0 else 'cpu'

    # Backward compatibility: earlier callers relied on a global `opts`.
    if opts is None:
        opts = globals().get('opts')

    # Centroids are plain statistics: never backpropagate through them.
    centroids = centroids.detach().to(current_device)

    for epoch in range(num_epochs):
        print('{} | Epoch {}'.format(dt.datetime.now(), epoch))
        model.train()
        k = centroids.size(0)
        centroid_sums = torch.zeros_like(centroids).to(current_device)
        centroid_counts = torch.zeros(k).to(current_device)
        total_epoch_loss = 0

        # run one epoch of gradient descent on the encoder wrt the centroids;
        # tqdm wraps the loader itself so the progress bar knows the total
        for i, (tokens, labels, flagged_indices) in enumerate(tqdm(train_loader)):
            tokens = tokens.to(current_device)
            flagged_indices = flagged_indices.to(current_device)

            # forward pass and compute loss
            sentence_embed = model(tokens, flagged_indices)
            cluster_loss, cluster_assignments = criterion(sentence_embed, centroids)

            # run update step
            optimizer.zero_grad()
            cluster_loss.backward()
            optimizer.step()

            # Add loss to the epoch loss
            batch_loss = cluster_loss.detach()
            total_epoch_loss += batch_loss

            # store centroid sums and counts in memory for later centering;
            # detached so the accumulators do not retain each batch's graph
            update_clusters(centroid_sums, centroid_counts,
                            cluster_assignments, sentence_embed.detach())

            if i % print_every == 0:
                losses = batch_loss/len(tokens)
                print('Average training loss at batch ',i,': %.3f' % losses)

        total_epoch_loss /= len(train_loader.dataset)
        train_losses.append(total_epoch_loss)
        print('Average training loss after epoch ',epoch,': %.3f' % total_epoch_loss)

        # update centroids based on this epoch's assignments; the +1 in the
        # denominator keeps an empty cluster from dividing by zero (and pulls
        # every centroid slightly towards the origin)
        centroids = centroid_sums / (centroid_counts[:, None] + 1).to(current_device)

        # calculate validation loss after every epoch (inference only)
        total_validation_loss = 0
        model.eval()
        with torch.no_grad():
            for tokens, labels, flagged_indices in valid_loader:
                tokens = tokens.to(current_device)
                flagged_indices = flagged_indices.to(current_device)

                # forward pass and compute loss
                sentence_embed = model(tokens, flagged_indices)
                cluster_loss, cluster_assignments = criterion(sentence_embed, centroids)

                # Add loss to the validation loss
                total_validation_loss += cluster_loss.detach()

        total_validation_loss /= len(valid_loader.dataset)
        val_losses.append(total_validation_loss)
        print('Average validation loss after epoch ',epoch,': %.3f' % total_validation_loss)

        if path_to_save is not None:
            torch.save(model.state_dict(), path_to_save+'model_dict.pt')
            torch.save(centroids, path_to_save+'centroids')
            torch.save(train_losses, path_to_save+'train_losses')
            torch.save(val_losses, path_to_save+'val_losses')
            if opts is not None:
                torch.save(opts, path_to_save+'opts') #change options depending on model inputs required

    return model, centroids, train_losses, val_losses

In [27]:
# Run on the GPU when at least one CUDA device is visible, otherwise on the CPU.
num_gpus = torch.cuda.device_count()
current_device = 'cuda' if num_gpus > 0 else 'cpu'
print("Current Device:",current_device)

Current Device: cpu


## Hyperparameter Tuning Loop

In [22]:
def get_save_directory(opts):
    """Return (and create if needed) the checkpoint folder for one hyperparameter config.

    The folder is
    `<cwd>/models/baseline_randomized_embeddings/emb_dim=<E>,upweight=<U>/`.

    Args:
        opts: dict with at least the keys 'emb_dim' and 'upweight'.

    Returns:
        The directory path as a string ending in '/', so file names can be
        appended to it directly.

    Raises:
        OSError: if the directory cannot be created for any reason other than
            already existing (e.g. permissions, or a file blocking the path).
    """
    model_folder = 'baseline_randomized_embeddings/'
    model_dir = os.getcwd() + '/models/' + model_folder

    # subfolder for each hyperparam config
    subfolder = "emb_dim=" + str(opts['emb_dim']) + ",upweight=" + str(opts['upweight']) + '/'
    save_directory = model_dir + subfolder

    # exist_ok tolerates a pre-existing folder while still surfacing real failures
    os.makedirs(save_directory, exist_ok=True)

    return save_directory

In [38]:
def train_config(opts, num_clusters=2, learning_rate=0.01, num_epochs=10):
    """Train one hyperparameter configuration end to end.

    Builds a fresh `neuralNetBow`, initialises the centroids from the labelled
    training loader, and runs `train_model` on the notebook-level
    `train_loader` / `val_loader`, saving checkpoints to the configuration's
    folder.

    Args:
        opts: dict with 'vocab_size', 'emb_dim' and 'upweight'.
        num_clusters: number of k-means centroids.
        learning_rate: Adam learning rate.
        num_epochs: number of training epochs.

    Returns:
        The tuple returned by `train_model`:
        (model, centroids, train_losses, val_losses).
    """
    model = neuralNetBow(opts).to(current_device)
    centroids = centroid_init(num_clusters, opts['emb_dim'], train_loader_labelled, model, current_device)
    criterion = KMeansCriterion().to(current_device)
    optimizer = torch.optim.Adam(model.parameters(), learning_rate, amsgrad=True)
    path_to_save = get_save_directory(opts)
    print(path_to_save)

    # hand the trained artefacts back instead of discarding them
    return train_model(model, centroids, criterion, optimizer, train_loader, val_loader,
                       num_epochs=num_epochs, path_to_save=path_to_save)

In [42]:
# Grid of configurations to train: every embedding size paired with every upweight.
#emb_dims = [128, 256, 512]
emb_dims = [128]
upweights = [1, 5, 10, 25]
## TO-DO: 512 @ 5, 10, 25

for emb_dim, upweight in [(e, u) for e in emb_dims for u in upweights]:
    # `opts` is (re)bound at notebook level on every iteration
    opts = dict(vocab_size=20000, emb_dim=emb_dim, upweight=upweight)
    train_config(opts)

/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/emb_dim=128,upweight=1/
2019-12-05 23:16:18.781605 | Epoch 0


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.427
Average training loss at batch  1000 : 0.122
Average training loss at batch  2000 : 0.082
Average training loss at batch  3000 : 0.079

Average training loss after epoch  0 : 0.183
Average validation loss after epoch  0 : 0.061
2019-12-05 23:17:34.991197 | Epoch 1


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.025
Average training loss at batch  1000 : 0.035
Average training loss at batch  2000 : 0.065
Average training loss at batch  3000 : 0.042

Average training loss after epoch  1 : 0.038
Average validation loss after epoch  1 : 0.039
2019-12-05 23:19:07.386495 | Epoch 2


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.029
Average training loss at batch  1000 : 0.033
Average training loss at batch  2000 : 0.022
Average training loss at batch  3000 : 0.012

Average training loss after epoch  2 : 0.023
Average validation loss after epoch  2 : 0.032
2019-12-05 23:20:39.834752 | Epoch 3


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.018
Average training loss at batch  1000 : 0.017
Average training loss at batch  2000 : 0.011
Average training loss at batch  3000 : 0.018

Average training loss after epoch  3 : 0.017
Average validation loss after epoch  3 : 0.027
2019-12-05 23:22:10.936813 | Epoch 4


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.016
Average training loss at batch  1000 : 0.009
Average training loss at batch  2000 : 0.020
Average training loss at batch  3000 : 0.037

Average training loss after epoch  4 : 0.014
Average validation loss after epoch  4 : 0.024
2019-12-05 23:23:42.009512 | Epoch 5


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.012
Average training loss at batch  1000 : 0.012
Average training loss at batch  2000 : 0.013
Average training loss at batch  3000 : 0.011

Average training loss after epoch  5 : 0.011
Average validation loss after epoch  5 : 0.022
2019-12-05 23:25:13.258699 | Epoch 6


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.008
Average training loss at batch  1000 : 0.011
Average training loss at batch  2000 : 0.011
Average training loss at batch  3000 : 0.012

Average training loss after epoch  6 : 0.010
Average validation loss after epoch  6 : 0.018
2019-12-05 23:26:45.595663 | Epoch 7


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.007
Average training loss at batch  1000 : 0.007
Average training loss at batch  2000 : 0.004
Average training loss at batch  3000 : 0.005

Average training loss after epoch  7 : 0.006
Average validation loss after epoch  7 : 0.016
2019-12-05 23:28:16.775837 | Epoch 8


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.007
Average training loss at batch  1000 : 0.005
Average training loss at batch  2000 : 0.005
Average training loss at batch  3000 : 0.005

Average training loss after epoch  8 : 0.005
Average validation loss after epoch  8 : 0.015
2019-12-05 23:29:49.046968 | Epoch 9


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.007
Average training loss at batch  1000 : 0.005
Average training loss at batch  2000 : 0.006
Average training loss at batch  3000 : 0.004

Average training loss after epoch  9 : 0.004
Average validation loss after epoch  9 : 0.015
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/emb_dim=128,upweight=5/
2019-12-05 23:31:24.273907 | Epoch 0


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 3.186
Average training loss at batch  1000 : 0.223
Average training loss at batch  2000 : 0.106
Average training loss at batch  3000 : 0.053

Average training loss after epoch  0 : 0.263
Average validation loss after epoch  0 : 0.055
2019-12-05 23:32:37.034933 | Epoch 1


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.040
Average training loss at batch  1000 : 0.035
Average training loss at batch  2000 : 0.052
Average training loss at batch  3000 : 0.042

Average training loss after epoch  1 : 0.029
Average validation loss after epoch  1 : 0.031
2019-12-05 23:34:06.456851 | Epoch 2


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.015
Average training loss at batch  1000 : 0.011
Average training loss at batch  2000 : 0.009
Average training loss at batch  3000 : 0.014

Average training loss after epoch  2 : 0.015
Average validation loss after epoch  2 : 0.023
2019-12-05 23:35:33.373836 | Epoch 3


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.015
Average training loss at batch  1000 : 0.011
Average training loss at batch  2000 : 0.010
Average training loss at batch  3000 : 0.008

Average training loss after epoch  3 : 0.010
Average validation loss after epoch  3 : 0.020
2019-12-05 23:37:02.706325 | Epoch 4


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.007
Average training loss at batch  1000 : 0.010
Average training loss at batch  2000 : 0.011
Average training loss at batch  3000 : 0.005

Average training loss after epoch  4 : 0.008
Average validation loss after epoch  4 : 0.018
2019-12-05 23:38:31.458399 | Epoch 5


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.007
Average training loss at batch  1000 : 0.005
Average training loss at batch  2000 : 0.006
Average training loss at batch  3000 : 0.007

Average training loss after epoch  5 : 0.006
Average validation loss after epoch  5 : 0.016
2019-12-05 23:40:02.736497 | Epoch 6


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.004
Average training loss at batch  1000 : 0.004
Average training loss at batch  2000 : 0.004
Average training loss at batch  3000 : 0.005

Average training loss after epoch  6 : 0.005
Average validation loss after epoch  6 : 0.015
2019-12-05 23:41:32.914579 | Epoch 7


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.003
Average training loss at batch  1000 : 0.003
Average training loss at batch  2000 : 0.002
Average training loss at batch  3000 : 0.004

Average training loss after epoch  7 : 0.005
Average validation loss after epoch  7 : 0.015
2019-12-05 23:43:02.190070 | Epoch 8


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.004
Average training loss at batch  1000 : 0.005
Average training loss at batch  2000 : 0.007
Average training loss at batch  3000 : 0.004

Average training loss after epoch  8 : 0.004
Average validation loss after epoch  8 : 0.014
2019-12-05 23:44:29.969430 | Epoch 9


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.004
Average training loss at batch  1000 : 0.005
Average training loss at batch  2000 : 0.003
Average training loss at batch  3000 : 0.003

Average training loss after epoch  9 : 0.004
Average validation loss after epoch  9 : 0.014
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/emb_dim=128,upweight=10/
2019-12-05 23:45:59.683326 | Epoch 0


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 7.558
Average training loss at batch  1000 : 0.336
Average training loss at batch  2000 : 0.205
Average training loss at batch  3000 : 0.048

Average training loss after epoch  0 : 0.492
Average validation loss after epoch  0 : 0.070
2019-12-05 23:47:10.701858 | Epoch 1


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.051
Average training loss at batch  1000 : 0.025
Average training loss at batch  2000 : 0.018
Average training loss at batch  3000 : 0.012

Average training loss after epoch  1 : 0.037
Average validation loss after epoch  1 : 0.035
2019-12-05 23:48:39.520133 | Epoch 2


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.014
Average training loss at batch  1000 : 0.017
Average training loss at batch  2000 : 0.016
Average training loss at batch  3000 : 0.012

Average training loss after epoch  2 : 0.017
Average validation loss after epoch  2 : 0.025
2019-12-05 23:50:09.659534 | Epoch 3


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.008
Average training loss at batch  1000 : 0.008
Average training loss at batch  2000 : 0.011
Average training loss at batch  3000 : 0.007

Average training loss after epoch  3 : 0.011
Average validation loss after epoch  3 : 0.021
2019-12-05 23:51:40.020133 | Epoch 4


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.004
Average training loss at batch  1000 : 0.010
Average training loss at batch  2000 : 0.004
Average training loss at batch  3000 : 0.006

Average training loss after epoch  4 : 0.008
Average validation loss after epoch  4 : 0.019
2019-12-05 23:53:26.789986 | Epoch 5


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.006
Average training loss at batch  1000 : 0.009
Average training loss at batch  2000 : 0.004
Average training loss at batch  3000 : 0.005

Average training loss after epoch  5 : 0.006
Average validation loss after epoch  5 : 0.017
2019-12-05 23:55:01.028511 | Epoch 6


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.004
Average training loss at batch  1000 : 0.005
Average training loss at batch  2000 : 0.004
Average training loss at batch  3000 : 0.004

Average training loss after epoch  6 : 0.005
Average validation loss after epoch  6 : 0.016
2019-12-05 23:56:31.806807 | Epoch 7


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.005
Average training loss at batch  1000 : 0.003
Average training loss at batch  2000 : 0.004
Average training loss at batch  3000 : 0.004

Average training loss after epoch  7 : 0.004
Average validation loss after epoch  7 : 0.015
2019-12-05 23:58:00.238654 | Epoch 8


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.003
Average training loss at batch  1000 : 0.003
Average training loss at batch  2000 : 0.003
Average training loss at batch  3000 : 0.003

Average training loss after epoch  8 : 0.004
Average validation loss after epoch  8 : 0.015
2019-12-05 23:59:30.458972 | Epoch 9


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.003
Average training loss at batch  1000 : 0.003
Average training loss at batch  2000 : 0.002
Average training loss at batch  3000 : 0.003

Average training loss after epoch  9 : 0.003
Average validation loss after epoch  9 : 0.014
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/emb_dim=128,upweight=25/
2019-12-06 00:01:00.472386 | Epoch 0


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 20.350
Average training loss at batch  1000 : 0.554
Average training loss at batch  2000 : 0.064
Average training loss at batch  3000 : 0.050

Average training loss after epoch  0 : 1.289
Average validation loss after epoch  0 : 0.174
2019-12-06 00:02:11.605809 | Epoch 1


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.106
Average training loss at batch  1000 : 0.036
Average training loss at batch  2000 : 0.019
Average training loss at batch  3000 : 0.016

Average training loss after epoch  1 : 0.066
Average validation loss after epoch  1 : 0.092
2019-12-06 00:03:42.154046 | Epoch 2


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.016
Average training loss at batch  1000 : 0.011
Average training loss at batch  2000 : 0.012
Average training loss at batch  3000 : 0.017

Average training loss after epoch  2 : 0.028
Average validation loss after epoch  2 : 0.069
2019-12-06 00:05:10.859954 | Epoch 3


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.009
Average training loss at batch  1000 : 0.008
Average training loss at batch  2000 : 0.009
Average training loss at batch  3000 : 0.012

Average training loss after epoch  3 : 0.016
Average validation loss after epoch  3 : 0.059
2019-12-06 00:06:39.311327 | Epoch 4


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.007
Average training loss at batch  1000 : 0.013
Average training loss at batch  2000 : 0.005
Average training loss at batch  3000 : 0.006

Average training loss after epoch  4 : 0.011
Average validation loss after epoch  4 : 0.052
2019-12-06 00:08:08.191991 | Epoch 5


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.005
Average training loss at batch  1000 : 0.005
Average training loss at batch  2000 : 0.049
Average training loss at batch  3000 : 0.003

Average training loss after epoch  5 : 0.008
Average validation loss after epoch  5 : 0.048
2019-12-06 00:09:40.164021 | Epoch 6


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.006
Average training loss at batch  1000 : 0.005
Average training loss at batch  2000 : 0.003
Average training loss at batch  3000 : 0.003

Average training loss after epoch  6 : 0.006
Average validation loss after epoch  6 : 0.046
2019-12-06 00:11:08.347090 | Epoch 7


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.002
Average training loss at batch  1000 : 0.003
Average training loss at batch  2000 : 0.003
Average training loss at batch  3000 : 0.003

Average training loss after epoch  7 : 0.005
Average validation loss after epoch  7 : 0.043
2019-12-06 00:12:38.277840 | Epoch 8


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.002
Average training loss at batch  1000 : 0.049
Average training loss at batch  2000 : 0.004
Average training loss at batch  3000 : 0.002

Average training loss after epoch  8 : 0.004
Average validation loss after epoch  8 : 0.040
2019-12-06 00:14:07.619510 | Epoch 9


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 0.002
Average training loss at batch  1000 : 0.002
Average training loss at batch  2000 : 0.003
Average training loss at batch  3000 : 0.003

Average training loss after epoch  9 : 0.003
Average validation loss after epoch  9 : 0.038


## Evaluation

In [14]:
# Run on the GPU when at least one CUDA device is visible, otherwise on the CPU.
num_gpus = torch.cuda.device_count()
current_device = 'cuda' if num_gpus > 0 else 'cpu'

In [62]:
def evaluate_config(opts,verbose=True):
    """Load the checkpoint saved for `opts` and evaluate it on the validation loader.

    Args:
        opts: dict with 'vocab_size', 'emb_dim' and 'upweight'; it selects the
            checkpoint folder and configures the model.
        verbose: forwarded unchanged to `evaluation.main`.

    Returns:
        (TP_cluster, FP_cluster, results_dict) as produced by `evaluation.main`,
        with the entries of `opts` merged into `results_dict`.
    """
    def _keep_on_cpu(storage, location):
        # map_location hook: keep every deserialized storage where it was loaded (CPU)
        return storage

    checkpoint_dir = get_save_directory(opts)
    print(checkpoint_dir)

    model = neuralNetBow(opts) #change according to model inputs
    weights = torch.load(checkpoint_dir+'model_dict.pt', map_location=_keep_on_cpu)
    model.load_state_dict(weights)
    model = model.to(current_device)

    criterion = KMeansCriterion().to(current_device)
    centroids = torch.load(checkpoint_dir+'centroids', map_location=_keep_on_cpu)

    outcome = evaluation.main(model, centroids, val_loader, criterion, data_dir, current_device, verbose)
    TP_cluster, FP_cluster, results_dict = outcome
    results_dict.update(opts)
    return TP_cluster, FP_cluster, results_dict

In [64]:
# Evaluate every trained configuration and gather the metrics in one table.
emb_dims = [128, 256, 512]
upweights = [1, 5, 10, 25]

# Collect one record per configuration and build the frame once at the end:
# DataFrame.append in a loop is quadratic and no longer exists in pandas >= 2.0.
result_rows = []
for emb_dim in emb_dims:
    for upweight in upweights:
        opts = {
            'vocab_size': 20000,
            'emb_dim': emb_dim,
            'upweight': upweight
        }
        _, _, results_dict = evaluate_config(opts,False)
        # copy so rows stay independent even if the same dict object is reused
        result_rows.append(dict(results_dict))

results_df = pd.DataFrame(result_rows)
results_df = results_df[['emb_dim','upweight','Accuracy','F1 score','Precision','Recall',
                        'TP_rate','FP_rate','FN_rate','TN_rate']]

/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/emb_dim=128,upweight=1/
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/emb_dim=128,upweight=5/
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/emb_dim=128,upweight=10/
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/emb_dim=128,upweight=25/
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/emb_dim=256,upweight=1/
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/emb_dim=256,upweight=5/
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/emb_dim=256,upweight=10/
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/emb_dim=256,upweight=25/
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_randomized_embeddings/emb_dim=512,upweight=1/
/Users

In [65]:
# Display the metrics table: one row per evaluated (emb_dim, upweight) configuration.
results_df

Unnamed: 0,emb_dim,upweight,Accuracy,F1 score,Precision,Recall,TP_rate,FP_rate,FN_rate,TN_rate
0,128.0,1.0,0.510688,0.64516,0.889655,0.50608,0.889655,0.110345,0.86828,0.13172
1,128.0,5.0,0.583729,0.700171,0.972093,0.547125,0.972093,0.027907,0.804636,0.195364
2,128.0,10.0,0.577325,0.696435,0.969697,0.543326,0.969697,0.030303,0.815047,0.184953
3,128.0,25.0,0.578963,0.696697,0.967136,0.544452,0.967136,0.032864,0.809211,0.190789
4,256.0,1.0,0.517739,0.6513,0.900763,0.510045,0.900763,0.099237,0.865285,0.134715
5,256.0,5.0,0.57623,0.695786,0.969231,0.542682,0.969231,0.030769,0.81677,0.18323
6,256.0,10.0,0.588508,0.702907,0.973568,0.550001,0.973568,0.026432,0.796552,0.203448
7,256.0,25.0,0.579171,0.697522,0.970443,0.544414,0.970443,0.029557,0.812102,0.187898
8,512.0,1.0,0.513298,0.647399,0.893617,0.507553,0.893617,0.106383,0.867021,0.132979
9,512.0,5.0,0.558636,0.685458,0.961832,0.53246,0.961832,0.038168,0.84456,0.15544


In [67]:
# Show the five configurations with the highest F1 score.
results_df.sort_values(['F1 score'],ascending=False).head(5)

Unnamed: 0,emb_dim,upweight,Accuracy,F1 score,Precision,Recall,TP_rate,FP_rate,FN_rate,TN_rate
6,256.0,10.0,0.588508,0.702907,0.973568,0.550001,0.973568,0.026432,0.796552,0.203448
1,128.0,5.0,0.583729,0.700171,0.972093,0.547125,0.972093,0.027907,0.804636,0.195364
7,256.0,25.0,0.579171,0.697522,0.970443,0.544414,0.970443,0.029557,0.812102,0.187898
3,128.0,25.0,0.578963,0.696697,0.967136,0.544452,0.967136,0.032864,0.809211,0.190789
2,128.0,10.0,0.577325,0.696435,0.969697,0.543326,0.969697,0.030303,0.815047,0.184953


# Save Embeddings for Plot

In [None]:
# `model_folder` only exists as a local variable inside get_save_directory(),
# so it is defined here as well to keep this cell from raising a NameError.
model_folder = 'baseline_randomized_embeddings/'
save_dir = path + '/umap/' + model_folder

In [None]:
# make an embedding on validation set including centroids
val_embed_labelled = []
val_labels_lst = []

for i, (tokens, labels, flagged_indices) in enumerate(val_loader):
    model.eval()
    tokens = tokens.to(current_device)
    labels = labels.to(current_device)
    flagged_indices = flagged_indices.to(current_device)

    # forward pass and compute loss
    sentence_embed = model(tokens,flagged_indices)

    val_embed_labelled+= sentence_embed.tolist()    
    val_labels_lst+=labels.tolist()
val_embed_labelled += centroids.tolist()
val_labels_lst += [0,1]

In [None]:
# make an embedding on training set
embed_labelled = []
labels_lst = []

for i, (tokens, labels, flagged_indices) in enumerate(train_loader_labelled):
    model.eval()
    tokens = tokens.to(current_device)
    labels = labels.to(current_device)
    flagged_indices = flagged_indices.to(current_device)

    # forward pass and compute loss
    sentence_embed = model(tokens,flagged_indices)

    embed_labelled+= sentence_embed.tolist()    
    labels_lst+=labels.tolist()

In [None]:
# Persist the embeddings and labels for the UMAP plot.
# The notebook imports the module as `pkl` (the name `pickle` is never bound),
# and `with` guarantees each file is closed even if dumping fails.
for filename, payload in [
    ("val_embed_labelled.pickle", val_embed_labelled),
    ("val_labels_lst.pickle", val_labels_lst),
    ("embed_labelled.pickle", embed_labelled),
    ("labels.pickle", labels_lst),
]:
    with open(save_dir + filename, "wb") as pickle_out:
        pkl.dump(payload, pickle_out)