In [124]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

#from datasets import get_mnist_dataset, get_data_loader
#from utils import *
#from models import *

import pickle as pkl
import os
import datetime as dt

from generate_dataloaders import *

from tqdm import tqdm_notebook as tqdm





55it [00:23, 11.39it/s][A[A[A[A

## Get Dataloaders

In [89]:
def get_dataloaders(train_filename,val_filename):
    path = os.getcwd()
    data_dir = path + '/data/'
    train_dataloader = pkl.load(open(data_dir + train_filename,'rb'))
    val_dataloader = pkl.load(open(data_dir + val_filename,'rb'))
    return train_dataloader,val_dataloader

In [90]:
path = os.getcwd()
data_dir = path + '/data/'

In [91]:
train_loader,val_loader = get_dataloaders('train_dataloader.p','val_dataloader.p')

In [92]:
ground_truth_dataloader = pkl.load(open(data_dir + 'ground_truth_dataloader.p','rb'))

In [93]:
print(torch.__version__)

1.0.0


## Scratchwork (IGNORE)

In [None]:
for i,x in enumerate(train_loader):
    print(len(x[0]))
    break

32


In [None]:
minibatch = torch.tensor([
                            [[1,2,3,4,5],[3,3,3,3,3],[1,1,1,1,1],[2,1,2,1,2]],
                            [[0,1,0,1,0],[1,1,1,1,1],[2,0,0,0,0],[0,0,0,0,2]]
                         ], dtype=torch.float32)

flagged_indices = torch.tensor([1,2])

upweight_value = 10

print(minibatch.shape)
print(minibatch)

print(flagged_indices.shape)
print(flagged_indices)

torch.Size([2, 4, 5])
tensor([[[1., 2., 3., 4., 5.],
         [3., 3., 3., 3., 3.],
         [1., 1., 1., 1., 1.],
         [2., 1., 2., 1., 2.]],

        [[0., 1., 0., 1., 0.],
         [1., 1., 1., 1., 1.],
         [2., 0., 0., 0., 0.],
         [0., 0., 0., 0., 2.]]])
torch.Size([2])
tensor([1, 2])


In [None]:
batch_size, num_tokens, emb_dim = minibatch.shape
print(type(minibatch))
minibatch[range(batch_size),flagged_indices,:] *= upweight_value
print(batch_size, num_tokens, emb_dim)
minibatch

<class 'torch.Tensor'>
2 4 5


tensor([[[ 1.,  2.,  3.,  4.,  5.],
         [30., 30., 30., 30., 30.],
         [ 1.,  1.,  1.,  1.,  1.],
         [ 2.,  1.,  2.,  1.,  2.]],

        [[ 0.,  1.,  0.,  1.,  0.],
         [ 1.,  1.,  1.,  1.,  1.],
         [20.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  2.]]])

In [None]:
minibatch.sum(1) / (num_tokens + upweight_value - 1)

tensor([[2.6154, 2.6154, 2.7692, 2.7692, 2.9231],
        [1.6154, 0.1538, 0.0769, 0.1538, 0.2308]])

In [None]:
print(type(minibatch))

<class 'torch.Tensor'>


In [None]:
embed = torch.tensor(np.array([[2,4,5,6],[1,3,45,7],[3,4,5,6]]))

In [None]:
centers = torch.tensor(np.array(([2,3,4,5],[1,2,4,5])))

In [None]:
torch.sum((embed[:,None,:]-centers)**2,2)

tensor([[   3,    7],
        [1686, 1686],
        [   4,   10]])

In [None]:
cluster_distances, cluster_assignments = torch.sum((embed[:,None,:]-centers)**2, 2).min(1)
cluster_assignments

tensor([0, 1, 0])

In [None]:
for i, (tokens, labels, flagged_indices) in enumerate(train_loader):
    #print(tokens, labels, flagged_indices)
    break

In [56]:
cluster_assts = torch.LongTensor([1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
        0, 0, 0, 1, 0, 0, 0, 1])
k = 2
bin_counts = torch.bincount(cluster_assts,minlength=k)

In [72]:
bin_counts = bin_counts.type(torch.FloatTensor).to(current_device)
bin_counts

tensor([16., 16.])

## Neural Network Class

NOTE: Data loader is defined as:
- tuple: (tokens, flagged_index, problematic)

In [94]:
class neuralNetBow(nn.Module):
    """
    BagOfWords classification model
    """
    # NOTE: we can't use linear layer until we take weighted average, otherwise it will
    # remember certain positions incorrectly (ie, 4th word has bigger weights vs 7th word)
    def __init__(self, vocab_size, emb_dim, upweight=10):
        super(neuralNetBow, self).__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=2)
        self.upweight = upweight
    
    def forward(self, tokens, flagged_index):
        batch_size, num_tokens = tokens.shape
        embedding = self.embed(tokens)
#         print(embedding.shape) # below assumes "batch_size x num_tokens x Emb_dim" (VERIFY)
        
        # upweight by flagged_index
#         print(type(embedding))
        embedding[torch.LongTensor(range(batch_size)),flagged_index.type(torch.LongTensor),:] *= self.upweight
        
        # average across embeddings
        embedding_ave = embedding.sum(1) / (num_tokens + self.upweight - 1)
        
        return embedding_ave

### Clustering Stuff (un-tailored)

In [95]:
class KMeansCriterion(nn.Module):
    
    def __init__(self, lmbda):
        super().__init__()
        self.lmbda = lmbda
    
    def forward(self, embeddings, centroids):
        distances = torch.sum((embeddings[:, None, :] - centroids)**2, 2)
        cluster_distances, cluster_assignments = distances.min(1)
        loss = self.lmbda * cluster_distances.sum()
        return loss, cluster_assignments

In [96]:
def centroid_init(k, d, dataloader, model, current_device):
    ## Here we ideally don't want to do randomized/zero initialization
    centroid_sums = torch.zeros(k, d).to(current_device)
    centroid_counts = torch.zeros(k).to(current_device)
    for (tokens, labels, flagged_indices) in dataloader:
        # cluster_assignments = torch.LongTensor(tokens.size(0)).random_(k)
        cluster_assignments = labels.to(current_device)
        
        model.eval()
        sentence_embed = model(tokens.to(current_device),flagged_indices.to(current_device))
    
        update_clusters(centroid_sums, centroid_counts,
                        cluster_assignments, sentence_embed.to(current_device))
    
    centroid_means = centroid_sums / centroid_counts[:, None].to(current_device)
    return centroid_means.clone()

def update_clusters(centroid_sums, centroid_counts,
                    cluster_assignments, embeddings):
    k = centroid_sums.size(0)

    centroid_sums.index_add_(0, cluster_assignments, embeddings)
    bin_counts = torch.bincount(cluster_assignments,minlength=k).type(torch.FloatTensor).to(current_device)
    centroid_counts.add_(bin_counts)
    
    #np_cluster_assignments = cluster_assignments.to('cpu')
    #np_counts = np.bincount(np_cluster_assignments.data.numpy(), minlength=k)
    #centroid_counts.add_(torch.FloatTensor(np_counts))

### Training Function (un-tailored, needs alterations)

In [132]:
def train_model(model, centroids, criterion, optimizer, train_loader, valid_loader, num_epochs=10, path_to_save=None, print_every = 1000):

    train_losses=[]
    val_losses=[]
    num_gpus = torch.cuda.device_count()
    if num_gpus > 0:
        current_device = 'cuda'
    else:
        current_device = 'cpu'
    
    for epoch in range(num_epochs):
        print('{} | Epoch {}'.format(dt.datetime.now(), epoch))
        model.train()
        k, d = centroids.size()
        centroid_sums = torch.zeros_like(centroids).to(current_device)
        centroid_counts = torch.zeros(k).to(current_device)
        total_epoch_loss = 0

        # run one epoch of gradient descent on autoencoders wrt centroids
        for i, (tokens, labels, flagged_indices) in tqdm(enumerate(train_loader)):
            tokens = tokens.to(current_device)
            labels = labels.to(current_device)
            flagged_indices = flagged_indices.to(current_device)

            # forward pass and compute loss
            sentence_embed = model(tokens,flagged_indices)
            cluster_loss, cluster_assignments = criterion(sentence_embed, centroids.detach())

            # run update step
            optimizer.zero_grad()
            cluster_loss.backward()
            optimizer.step()
            
            #Add loss to the epoch loss
            total_epoch_loss += cluster_loss.data

            # store centroid sums and counts in memory for later centering
            update_clusters(centroid_sums, centroid_counts,
                            cluster_assignments, sentence_embed)

            if i % print_every == 0:
                losses = cluster_loss.data/len(tokens)
                print('Average training loss at batch ',i,': %.3f' % losses)
            
        total_epoch_loss /= len(train_loader.dataset)
        train_losses.append(total_epoch_loss)
        print('Average training loss after epoch ',epoch,': %.3f' % total_epoch_loss)
        
        # update centroids based on assignments from autoencoders
        centroids = centroid_sums / (centroid_counts[:, None] + 1).to(current_device)
        
        # calculate validation loss after every epoch
        total_validation_loss = 0
        for i, (tokens, labels, flagged_indices) in enumerate(valid_loader):
            model.eval()
            tokens = tokens.to(current_device)
            labels = labels.to(current_device)
            flagged_indices = flagged_indices.to(current_device)
            
            # forward pass and compute loss
            sentence_embed = model(tokens,flagged_indices)
            cluster_loss, cluster_assignments = criterion(sentence_embed, centroids)
            
            #Add loss to the validation loss
            total_validation_loss += cluster_loss.data

        total_validation_loss /= len(valid_loader.dataset)
        val_losses.append(total_validation_loss)
        print('Average validation loss after epoch ',epoch,': %.3f' % total_validation_loss)
        
        if path_to_save == None:
            pass
        else:
            torch.save(model.state_dict(), path_to_save+'_dict_epoch'+str(epoch)+'.pt')
            torch.save(centroids, path_to_save+'_centroids_epoch'+str(epoch))
            torch.save(train_losses, path_to_save+'_train_losses')
            torch.save(val_losses, path_to_save+'_val_losses')
        
    return model, centroids, train_losses, val_losses

In [133]:
opts = {
    'vocab_size': 20000,
    'emb_dim': 512
}

In [134]:
num_gpus = torch.cuda.device_count()
if num_gpus > 0:
    current_device = 'cuda'
else:
    current_device = 'cpu'

model = neuralNetBow(opts['vocab_size'], opts['emb_dim']).to(current_device)

In [135]:
# model = neuralNetBow(opts['vocab_size'], opts['emb_dim'])
centroids = centroid_init(2, opts['emb_dim'],ground_truth_dataloader, model, current_device)
criterion = KMeansCriterion(1).to(current_device)
optimizer = torch.optim.Adam(model.parameters(), 0.01, amsgrad=True)

In [136]:
centroids

tensor([[-0.5393, -0.5985,  0.1128,  ..., -1.5948, -0.8000, -0.5975],
        [-0.5791, -0.5462,  0.1542,  ..., -1.4489, -0.7182, -0.5470]],
       grad_fn=<CloneBackward>)

In [137]:
current_device

'cpu'

In [138]:
train_model(model, centroids, criterion, optimizer, train_loader, val_loader, num_epochs=5, path_to_save="baseline_model")

2019-11-13 18:15:20.566644 | Epoch 0


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 47.673


KeyboardInterrupt: 