In [1]:
# ## KAGGLE ONLY
# from shutil import copyfile
# copyfile(src="../input/inputs/generate_dataloaders.py", dst="../working/generate_dataloaders.py")
# copyfile(src="../input/inputs/train_dataloader.p", dst="../working/train_dataloader.p")
# copyfile(src="../input/inputs/val_dataloader.p", dst="../working/val_dataloader.p")
# copyfile(src="../input/inputs/centroids_dataloader.p", dst="../working/ground_truth_dataloader.p")
# copyfile(src="../input/inputs/dictionary.p", dst="../working/dictionary.p")

# copyfile(src="../input/input2/train_unlabeld_dataloader.p", dst="../working/train_unlabelled_dataloader.p")
# copyfile(src="../input/input2/train_labeled_dataloader.p", dst="../working/train_labelled_dataloader.p")

In [2]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

#from datasets import get_mnist_dataset, get_data_loader
#from utils import *
#from models import *

import pickle as pkl
import os
import datetime as dt
import pandas as pd
import random

from generate_dataloaders import *

from generate_dataloaders import *
from tqdm import tqdm_notebook as tqdm

import evaluation
import importlib
importlib.reload(evaluation)

<module 'evaluation' from '/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/evaluation.py'>

## Get Dataloaders

In [3]:
# Seed every random number generator used by this notebook exactly once so
# that a fresh "Restart & Run All" starts from the same RNG state.
seed = 1029
random.seed(seed)  # Python random module.
np.random.seed(seed)  # Numpy module.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.

# Turn cuDNN off and request deterministic, non-benchmarked kernels.
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


def _init_fn(worker_id):
    """Seed numpy with the notebook-wide ``seed``.

    The signature matches what ``DataLoader(worker_init_fn=...)`` expects, but
    nothing in this notebook passes it to a loader.

    NOTE(review): ``worker_id`` is ignored, so every worker would receive the
    same numpy seed -- confirm that is intended before using it.
    """
    np.random.seed(int(seed))

In [4]:
# The pickled dataloaders live in a "data" folder under the directory the
# notebook is started from; the trailing slash is required because file names
# are appended by plain string concatenation.
path = os.getcwd()
data_dir = '{}/data/'.format(path)

In [5]:
def _load_pickled(filename):
    """Unpickle one object stored under ``data_dir`` and close the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by this project.
    with open(data_dir + filename, 'rb') as handle:
        return pkl.load(handle)


train_loader = _load_pickled('train_dataloader.p')
train_loader_labelled = _load_pickled('train_labeled_dataloader.p')
train_loader_unlabelled = _load_pickled('train_unlabeled_dataloader.p')
val_loader = _load_pickled('val_dataloader.p')

In [6]:
#%conda install pytorch torchvision -c pytorch
## if torch.__version__ is not 1.3.1, run this cell then restart kernel

In [7]:
# Show the installed PyTorch version (see the reinstall hint in the previous cell).
print(torch.__version__)

1.3.1


## Neural Network Class

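NOTE: Every data loader yields batches that this notebook unpacks as:
- tuple: (tokens, labels, flagged_indices), where `labels` is the problematic / non-problematic class and `flagged_indices` is the position of the flagged token in each sequence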

In [8]:
class neuralNetBow(nn.Module):
    """
    BagOfWords sentence encoder.

    Embeds every token, multiplies the embedding of the flagged token by
    ``opts['upweight']`` and returns the weighted average over the sequence.

    ``opts`` must provide ``vocab_size``, ``emb_dim``, ``upweight`` and
    ``lambda_loss``. ``lambda_loss`` is only stored on the module; the
    training loop reads it back from there.
    """
    # NOTE: we can't use linear layer until we take weighted average, otherwise it will
    # remember certain positions incorrectly (ie, 4th word has bigger weights vs 7th word)
    def __init__(self, opts):
        super(neuralNetBow, self).__init__()
        self.embed = nn.Embedding(opts['vocab_size'], opts['emb_dim'], padding_idx=0)
        self.upweight = opts['upweight']
        self.lambda_loss = opts['lambda_loss']

    def forward(self, tokens, flagged_index):
        """Return one averaged embedding per row of ``tokens``.

        Args:
            tokens: 2-D tensor of token ids, (batch_size, num_tokens).
            flagged_index: per-row position of the token to upweight; it is
                cast to ``long`` before being used as an index.
        """
        batch_size, num_tokens = tokens.shape
        embedding = self.embed(tokens)

        # Build both index tensors on the embedding's own device instead of
        # forcing them onto the CPU, so the lookup needs no host round trip
        # when the model lives on a GPU.
        row_ids = torch.arange(batch_size, device=embedding.device)
        col_ids = flagged_index.to(device=embedding.device, dtype=torch.long)

        # upweight by flagged_index
        embedding[row_ids, col_ids, :] *= self.upweight

        # Weighted average: the flagged token counts `upweight` times, every
        # other position (padding included) counts once.
        embedding_ave = embedding.sum(1) / (num_tokens + self.upweight - 1)

        return embedding_ave

### Clustering Stuff

In [9]:
class KMeansCriterion(nn.Module):
    """Sum of squared distances between embeddings and their cluster centroid.

    Without labels each embedding is charged the distance to its nearest
    centroid; with ``labelled=True`` it is charged the distance to the
    centroid named by ``cluster_assignments``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, embeddings, centroids, labelled = False,  cluster_assignments = None):
        """Return ``(loss, cluster_assignments)`` for one batch."""
        # Squared euclidean distance of every embedding to every centroid.
        squared_dist = ((embeddings.unsqueeze(1) - centroids) ** 2).sum(dim=2)

        if not labelled:
            # Unsupervised case: pick the closest centroid per embedding.
            nearest_dist, cluster_assignments = squared_dist.min(1)
            return nearest_dist.sum(), cluster_assignments

        # Supervised case: look up the distance to the given cluster.
        row_ids = list(range(len(cluster_assignments)))
        assigned_dist = squared_dist[row_ids, cluster_assignments]
        return assigned_dist.sum(), cluster_assignments

In [10]:
def centroid_init(k, d, dataloader, model, current_device):
    """Initialise the ``k`` centroids as the mean embedding of each labelled class.

    Args:
        k: number of clusters.
        d: embedding dimension.
        dataloader: yields ``(tokens, labels, flagged_indices)`` batches; the
            labels are used directly as cluster ids.
        model: encoder called as ``model(tokens, flagged_indices)``. It is
            switched to eval mode and left that way.
        current_device: device the tensors are moved to.

    Returns:
        A ``(k, d)`` tensor of per-cluster mean embeddings. A cluster that
        never appears in ``dataloader`` has a zero count, so its row is the
        result of a 0/0 division.
    """
    ## Here we ideally don't want to do randomized/zero initialization
    centroid_sums = torch.zeros(k, d).to(current_device)
    centroid_counts = torch.zeros(k).to(current_device)

    model.eval()
    # This is pure bookkeeping: without no_grad the autograd graph of every
    # batch would be kept alive through the running sums.
    with torch.no_grad():
        for (tokens, labels, flagged_indices) in dataloader:
            cluster_assignments = labels.to(current_device)
            sentence_embed = model(tokens.to(current_device), flagged_indices.to(current_device))

            update_clusters(centroid_sums, centroid_counts,
                            cluster_assignments, sentence_embed.to(current_device))

    centroid_means = centroid_sums / centroid_counts[:, None].to(current_device)
    return centroid_means.clone()

def update_clusters(centroid_sums, centroid_counts,
                    cluster_assignments, embeddings):
    """Accumulate one batch into the running per-cluster sums and counts.

    Both accumulators are modified in place; nothing is returned.

    Args:
        centroid_sums: ``(k, d)`` running sum of embeddings per cluster.
        centroid_counts: ``(k,)`` running number of embeddings per cluster.
        cluster_assignments: integer cluster id for each row of ``embeddings``.
        embeddings: ``(batch, d)`` embeddings to add.
    """
    k = centroid_sums.size(0)

    # Detach so the running sums never hold on to the autograd graph of the
    # batch that produced the embeddings.
    centroid_sums.index_add_(0, cluster_assignments, embeddings.detach())

    # Take device and dtype from the accumulator itself rather than from a
    # notebook-level `current_device` global.
    bin_counts = torch.bincount(cluster_assignments, minlength=k).to(
        device=centroid_counts.device, dtype=centroid_counts.dtype)
    centroid_counts.add_(bin_counts)

## Dataloader stuff

In [11]:
def _next_batch_cycling(batch_iter, loader):
    """Return ``(batch, iterator)``, restarting over ``loader`` once ``batch_iter`` is exhausted."""
    try:
        batch = next(batch_iter)
    except StopIteration:
        # Exhausted: start a new pass over the loader and hand the fresh
        # iterator back so the caller keeps using it.
        batch_iter = iter(loader)
        batch = next(batch_iter)
    return batch, batch_iter


def loadLabelledBatch(train_loader_labelled_iter, train_loader_labelled):
    """Fetch the next labelled batch, cycling through the loader indefinitely.

    Returns ``(tokens, labels, flagged_indices, iterator)``; the caller must
    keep the returned iterator because it is replaced whenever the loader
    wraps around.
    """
    (tokens, labels, flagged_indices), train_loader_labelled_iter = _next_batch_cycling(
        train_loader_labelled_iter, train_loader_labelled)

    return tokens, labels, flagged_indices, train_loader_labelled_iter


def loadUnlabelledBatch(train_loader_unlabelled_iter, train_loader_unlabelled):
    """Fetch the next unlabelled batch, cycling through the loader indefinitely.

    Returns ``(tokens, labels, flagged_indices, iterator)``; the caller must
    keep the returned iterator because it is replaced whenever the loader
    wraps around.
    """
    (tokens, labels, flagged_indices), train_loader_unlabelled_iter = _next_batch_cycling(
        train_loader_unlabelled_iter, train_loader_unlabelled)

    return tokens, labels, flagged_indices, train_loader_unlabelled_iter

## Training Function

In [26]:
def train_model(model, centroids, criterion, optimizer, train_loader_labelled, train_loader_unlabelled, valid_loader, num_epochs=10, num_batches = 1000, path_to_save=None, print_every = 1000):
    """Train ``model`` with a semi-supervised k-means objective.

    Each step draws one labelled and one unlabelled batch and minimises
    ``unlabelled_loss + model.lambda_loss * labelled_loss``. Centroids are
    held fixed (detached) within an epoch and recomputed from the epoch's
    embeddings afterwards.

    Args:
        model: encoder called as ``model(tokens, flagged_indices)``; must
            expose ``lambda_loss``.
        centroids: ``(k, d)`` tensor of initial centroids.
        criterion: called as ``criterion(embeddings, centroids, ...)`` and
            returning ``(loss, cluster_assignments)``.
        optimizer: optimizer over the model parameters.
        train_loader_labelled / train_loader_unlabelled / valid_loader:
            loaders yielding ``(tokens, labels, flagged_indices)`` batches.
        num_epochs: number of epochs.
        num_batches: optimisation steps per epoch (the loaders are cycled).
        path_to_save: directory for the per-epoch checkpoint, or ``None`` to
            skip saving.
        print_every: report the batch loss every this many steps.

    Returns:
        ``(model, centroids, train_losses, val_losses)``; both loss lists
        hold one Python float per epoch.
    """
    train_loader_labelled_iter = iter(train_loader_labelled)
    train_loader_unlabelled_iter = iter(train_loader_unlabelled)
    lambda_loss = model.lambda_loss

    train_losses=[]
    val_losses=[]
    current_device = 'cuda' if torch.cuda.device_count() > 0 else 'cpu'

    for epoch in tqdm(range(num_epochs)):
        print('{} | Epoch {}'.format(dt.datetime.now(), epoch))
        model.train()
        k = centroids.size(0)
        centroid_sums = torch.zeros_like(centroids).to(current_device)
        centroid_counts = torch.zeros(k).to(current_device)
        total_epoch_loss = 0.0

        for i in range(num_batches):
            tokens_labelled, labels, flagged_indices_labelled, train_loader_labelled_iter = loadLabelledBatch(train_loader_labelled_iter, train_loader_labelled)
            tokens_unlabelled, _, flagged_indices_unlabelled, train_loader_unlabelled_iter = loadUnlabelledBatch(train_loader_unlabelled_iter, train_loader_unlabelled)

            tokens_labelled = tokens_labelled.to(current_device)
            labels = labels.to(current_device)
            flagged_indices_labelled = flagged_indices_labelled.to(current_device)

            tokens_unlabelled = tokens_unlabelled.to(current_device)
            flagged_indices_unlabelled = flagged_indices_unlabelled.to(current_device)

            # forward pass and compute loss
            sentence_embed_labelled = model(tokens_labelled,flagged_indices_labelled)
            sentence_embed_unlabelled = model(tokens_unlabelled,flagged_indices_unlabelled)

            # Centroids are constants for the gradient step; only the encoder is trained.
            cluster_loss_unlabelled, cluster_assignments_unlabelled = criterion(sentence_embed_unlabelled, centroids.detach())
            cluster_loss_labelled, cluster_assignments_labelled = criterion(sentence_embed_labelled, centroids.detach(), labelled = True, cluster_assignments = labels)

            total_batch_loss = cluster_loss_unlabelled + lambda_loss * cluster_loss_labelled

            # run update step
            optimizer.zero_grad()
            total_batch_loss.backward()
            optimizer.step()

            # Accumulate a plain float: summing the loss tensors themselves
            # would keep every batch's autograd graph alive for the epoch.
            batch_loss_value = total_batch_loss.item()
            total_epoch_loss += batch_loss_value

            # store centroid sums and counts in memory for later centering
            # (detached so the running statistics carry no graph)
            update_clusters(centroid_sums, centroid_counts,
                            cluster_assignments_labelled, sentence_embed_labelled.detach())

            update_clusters(centroid_sums, centroid_counts,
                            cluster_assignments_unlabelled, sentence_embed_unlabelled.detach())

            if i % print_every == 0:
                losses = batch_loss_value/(len(tokens_labelled)+ len(tokens_unlabelled))
                print('Average training loss at batch ',i,': %.3f' % losses)

        # NOTE(review): this divides by the combined dataset sizes, not by the
        # number of samples actually drawn in `num_batches` steps -- confirm
        # that is the intended normalisation.
        total_epoch_loss /= (len(train_loader_labelled.dataset)+len(train_loader_unlabelled.dataset))
        train_losses.append(total_epoch_loss)
        print('Average training loss after epoch ',epoch,': %.3f' % total_epoch_loss)

        # update centroids based on assignments from autoencoders
        # (the +1 keeps the division finite for clusters that received no points)
        centroids = centroid_sums / (centroid_counts[:, None] + 1).to(current_device)

        # calculate validation loss after every epoch
        model.eval()
        total_validation_loss = 0.0
        with torch.no_grad():  # evaluation only: no graph needed
            for tokens, labels, flagged_indices in valid_loader:
                tokens = tokens.to(current_device)
                labels = labels.to(current_device)
                flagged_indices = flagged_indices.to(current_device)

                # forward pass and compute loss
                sentence_embed = model(tokens,flagged_indices)
                cluster_loss, cluster_assignments = criterion(sentence_embed, centroids)

                #Add loss to the validation loss
                total_validation_loss += cluster_loss.item()

        total_validation_loss /= len(valid_loader.dataset)
        val_losses.append(total_validation_loss)
        print('Average validation loss after epoch ',epoch,': %.3f' % total_validation_loss)

        if path_to_save is not None:
            # Join per file instead of appending "/" to `path_to_save` each
            # epoch, which made the path grow by one slash per epoch.
            torch.save(model.state_dict(), os.path.join(path_to_save, 'model_dict.pt'))
            torch.save(centroids, os.path.join(path_to_save, 'centroids'))
            torch.save(train_losses, os.path.join(path_to_save, 'train_losses'))
            torch.save(val_losses, os.path.join(path_to_save, 'val_losses'))
            # NOTE(review): `opts` is not a parameter; this reads the
            # notebook-level global set by the hyperparameter loop.
            torch.save(opts, os.path.join(path_to_save, 'opts')) #change options depending on model inputs required

    return model, centroids, train_losses, val_losses

In [27]:
# Use the GPU whenever PyTorch reports at least one CUDA device, else the CPU.
num_gpus = torch.cuda.device_count()
current_device = 'cuda' if num_gpus > 0 else 'cpu'
print("Current Device:", current_device)

Current Device: cpu


## Hyperparameter Tuning Loop

In [28]:
def get_save_directory(opts):
    """Return (and create if needed) the checkpoint folder for one hyperparameter config.

    The folder is
    ``<cwd>/models/baseline_semisupervised_randomized_embeddings/emb_dim=..,upweight=..,lambda=../``
    built from ``opts['emb_dim']``, ``opts['upweight']`` and
    ``opts['lambda_loss']``. The returned string ends with a slash so file
    names can be appended directly.

    Raises:
        OSError: if the folder cannot be created (for example a path
            component exists but is not a directory).
    """
    path = os.getcwd()
    model_folder = 'baseline_semisupervised_randomized_embeddings/'
    model_dir = path + '/models/' + model_folder

    # subfolder for each hyperparam config
    emb_dim = opts['emb_dim']
    upweight = opts['upweight']
    lambda_loss = opts['lambda_loss']
    subfolder = "emb_dim="+str(emb_dim) + ",upweight="+str(upweight) + ",lambda="+str(lambda_loss) + '/'

    save_dir = model_dir + subfolder

    # exist_ok tolerates an already existing folder while still surfacing
    # real failures instead of swallowing every exception.
    os.makedirs(save_dir, exist_ok=True)

    return save_dir

In [29]:
def train_config(opts):
    """Train one model for the hyperparameters in ``opts`` and checkpoint it.

    Builds a fresh encoder, initialises two centroids from the labelled
    loader, and runs ``train_model`` for 10 epochs, saving into the folder
    returned by ``get_save_directory(opts)``. Uses the notebook-level
    loaders and ``current_device``.
    """
    net = neuralNetBow(opts).to(current_device)
    initial_centroids = centroid_init(2, opts['emb_dim'], train_loader_labelled, net, current_device)
    kmeans_loss = KMeansCriterion().to(current_device)
    adam = torch.optim.Adam(net.parameters(), 0.01, amsgrad=True)

    save_dir = get_save_directory(opts)
    print(save_dir)

    train_model(
        net, initial_centroids, kmeans_loss, adam,
        train_loader_labelled, train_loader_unlabelled, val_loader,
        num_epochs=10, path_to_save=save_dir,
    )

In [39]:
# Grid of hyperparameters to sweep: embedding size, flagged-token upweight and
# the weight of the labelled loss term.
emb_dims = [128, 256, 512]
upweights = [1, 5, 10, 25]
lambda_losses = [.1, .5, 1, 5, 10, 25]

# Train one model per combination.
for emb_dim in emb_dims:
    for upweight in upweights:
        for lambda_loss in lambda_losses:
            # NOTE: `opts` must stay a notebook-level name -- train_model
            # saves it by reading the global rather than a parameter.
            opts = {
                'vocab_size': 20000,
                'emb_dim': emb_dim,
                'upweight': upweight,
                'lambda_loss': lambda_loss
            }
            train_config(opts)

/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_semisupervised_randomized_embeddings/emb_dim=256,upweight=10,lambda=0.1/


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

2019-12-07 21:05:50.196100 | Epoch 0
Average training loss at batch  0 : 8.240
Average training loss after epoch  0 : 0.815
Average validation loss after epoch  0 : 0.653
2019-12-07 21:06:41.095764 | Epoch 1
Average training loss at batch  0 : 0.370
Average training loss after epoch  1 : 0.111
Average validation loss after epoch  1 : 0.255
2019-12-07 21:07:36.504812 | Epoch 2
Average training loss at batch  0 : 0.128
Average training loss after epoch  2 : 0.054
Average validation loss after epoch  2 : 0.156
2019-12-07 21:08:39.336000 | Epoch 3
Average training loss at batch  0 : 0.044
Average training loss after epoch  3 : 0.031
Average validation loss after epoch  3 : 0.124
2019-12-07 21:09:46.757005 | Epoch 4
Average training loss at batch  0 : 0.026
Average training loss after epoch  4 : 0.022
Average validation loss after epoch  4 : 0.104
2019-12-07 21:10:55.624334 | Epoch 5
Average training loss at batch  0 : 0.035
Average training loss after epoch  5 : 0.017
Average validation lo

## Evaluation

In [40]:
# Re-derive the device so the evaluation section can run on its own.
num_gpus = torch.cuda.device_count()
current_device = 'cuda' if num_gpus > 0 else 'cpu'

In [41]:
def evaluate_config(opts,verbose=True):
    """Load the checkpoint saved for ``opts`` and evaluate it on ``val_loader``.

    Returns ``(TP_cluster, FP_cluster, results_dict)`` exactly as produced by
    ``evaluation.main``, with the entries of ``opts`` merged into
    ``results_dict``.
    """
    def _keep_storage(storage, loc):
        # map_location hook: hand back the storage as deserialised.
        return storage

    save_dir = get_save_directory(opts)
    print(save_dir)

    net = neuralNetBow(opts) #change according to model inputs
    net.load_state_dict(torch.load(save_dir+'model_dict.pt',map_location=_keep_storage))
    net = net.to(current_device)
    kmeans_loss = KMeansCriterion().to(current_device)
    # NOTE(review): unlike the model, the centroids are not moved to
    # `current_device` here -- confirm evaluation.main handles that.
    saved_centroids = torch.load(save_dir+'centroids',map_location=_keep_storage)

    TP_cluster, FP_cluster, results_dict = evaluation.main(
        net, saved_centroids, val_loader, kmeans_loss, data_dir, current_device, verbose)
    results_dict.update(opts)
    return TP_cluster, FP_cluster, results_dict

In [43]:
# Evaluate the checkpoints of the configurations listed here.
emb_dims = [256]
upweights = [10]
lambda_losses = [.1, .5, 1, 5, 10, 25]

# Collect one record per configuration and build the frame once:
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas 2.x.
result_rows = []
for emb_dim in emb_dims:
    for upweight in upweights:
        for lambda_loss in lambda_losses:
            opts = {
                'vocab_size': 20000,
                'emb_dim': emb_dim,
                'upweight': upweight,
                'lambda_loss': lambda_loss
            }
            _, _, results_dict = evaluate_config(opts,False)
            result_rows.append(dict(results_dict))

# Keep only the hyperparameters and the reported metrics, in display order.
results_df = pd.DataFrame(result_rows)[['emb_dim','upweight','lambda_loss','Accuracy','F1 score','Precision','Recall',
                                        'TP_rate','FP_rate','FN_rate','TN_rate']]

/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_semisupervised_randomized_embeddings/emb_dim=256,upweight=10,lambda=0.1/
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_semisupervised_randomized_embeddings/emb_dim=256,upweight=10,lambda=0.5/
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_semisupervised_randomized_embeddings/emb_dim=256,upweight=10,lambda=1/
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_semisupervised_randomized_embeddings/emb_dim=256,upweight=10,lambda=5/
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_semisupervised_randomized_embeddings/emb_dim=256,upweight=10,lambda=10/
/Users/elliotsilva/Desktop/DS-GA-1006/FairFrame/models/baseline_semisupervised_randomized_embeddings/emb_dim=256,upweight=10,lambda=25/


In [44]:
# Display the per-configuration validation metrics gathered above.
results_df

Unnamed: 0,emb_dim,upweight,lambda_loss,Accuracy,F1 score,Precision,Recall,TP_rate,FP_rate,FN_rate,TN_rate
0,256.0,10.0,0.1,0.57629,0.691055,0.947761,0.543771,0.947761,0.052239,0.795181,0.204819
1,256.0,10.0,0.5,0.725815,0.778135,0.961631,0.653446,0.961631,0.038369,0.51,0.49
2,256.0,10.0,1.0,0.6761,0.745447,0.948529,0.613991,0.948529,0.051471,0.59633,0.40367
3,256.0,10.0,5.0,0.634167,0.71877,0.935,0.583767,0.935,0.065,0.666667,0.333333
4,256.0,10.0,10.0,0.621915,0.710529,0.92804,0.575618,0.92804,0.07196,0.684211,0.315789
5,256.0,10.0,25.0,0.626044,0.713291,0.930348,0.578356,0.930348,0.069652,0.678261,0.321739
