In [5]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

#from datasets import get_mnist_dataset, get_data_loader
#from utils import *
#from models import *

import pickle as pkl
import os

from generate_dataloaders import *

## Get Dataloaders

In [6]:
def get_dataloaders(train_filename,val_filename):
    path = os.getcwd()
    data_dir = path + '/data/'
    train_dataloader = pkl.load(open(data_dir + train_filename,'rb'))
    val_dataloader = pkl.load(open(data_dir + val_filename,'rb'))
    return train_dataloader,val_dataloader

In [7]:
train_loader,val_loader = get_dataloaders('train_dataloader.p','val_dataloader.p')

## Scratchwork (IGNORE)

In [38]:
# for i, x in enumerate(train_loader):
#     print(x.shape)
#     print(x)
#     break

In [85]:
minibatch = torch.tensor([
                            [[1,2,3,4,5],[3,3,3,3,3],[1,1,1,1,1],[2,1,2,1,2]],
                            [[0,1,0,1,0],[1,1,1,1,1],[2,0,0,0,0],[0,0,0,0,2]]
                         ], dtype=torch.float32)

flagged_indices = torch.tensor([1,2])

upweight_value = 10

print(minibatch.shape)
print(minibatch)

print(flagged_indices.shape)
print(flagged_indices)

<class 'torch.Tensor'>
torch.Size([2, 4, 5])
tensor([[[1., 2., 3., 4., 5.],
         [3., 3., 3., 3., 3.],
         [1., 1., 1., 1., 1.],
         [2., 1., 2., 1., 2.]],

        [[0., 1., 0., 1., 0.],
         [1., 1., 1., 1., 1.],
         [2., 0., 0., 0., 0.],
         [0., 0., 0., 0., 2.]]])
torch.Size([2])
tensor([1, 2])


In [95]:
batch_size, num_tokens, emb_dim = minibatch.shape
print(type(minibatch))
minibatch[range(batch_size),flagged_indices,:] *= upweight_value
print(batch_size, num_tokens, emb_dim)
minibatch

<class 'torch.Tensor'>
2 4 5


tensor([[[1.0000e+00, 2.0000e+00, 3.0000e+00, 4.0000e+00, 5.0000e+00],
         [3.0000e+06, 3.0000e+06, 3.0000e+06, 3.0000e+06, 3.0000e+06],
         [1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00],
         [2.0000e+00, 1.0000e+00, 2.0000e+00, 1.0000e+00, 2.0000e+00]],

        [[0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00],
         [1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00],
         [2.0000e+06, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 2.0000e+00]]])

In [65]:
minibatch.sum(1) / (num_tokens + upweight_value - 1)

tensor([[2.6154, 2.6154, 2.7692, 2.7692, 2.9231],
        [1.6154, 0.1538, 0.0769, 0.1538, 0.2308]])

In [92]:
print(type(minibatch))

<class 'torch.Tensor'>


## Neural Network Class

NOTE: Data loader is defined as:
- tuple: (tokens, flagged_index, problematic)

In [73]:
class neuralNetBow(nn.Module):
    """
    BagOfWords classification model
    """
    # NOTE: we can't use linear layer until we take weighted average, otherwise it will
    # remember certain positions incorrectly (ie, 4th word has bigger weights vs 7th word)
    def __init__(self, vocab_size, emb_dim, upweight=10):
        super(neuralNetBow, self).__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=2)
        self.upweight = upweight
    
    def forward(self, tokens, flagged_index):
        batch_size, num_tokens = tokens.shape
        embedding = self.embed(tokens)
        # print(embedding.shape) # below assumes "batch_size x num_tokens" (VERIFY)
        
        # upweight by flagged_index
        print(type(embedding))
        embedding[torch.LongTensor(range(batch_size)),flagged_index.type(torch.LongTensor),:] *= self.upweight
        
        # average across embeddings
        embedding_ave = embedding.sum(1) / (num_tokens + self.upweight - 1)
        
        return embedding_ave

### Clustering Stuff (un-tailored)

In [74]:
class KMeansCriterion(nn.Module):
    
    def __init__(self, lmbda):
        super().__init__()
        self.lmbda = lmbda
    
    def forward(self, embeddings, centroids):
        distances = torch.sum((embeddings[:, None, :] - centroids)**2, 2)
        cluster_distances, cluster_assignments = distances.max(1)
        loss = self.lmbda * cluster_distances.sum()
        return loss, cluster_assignments

In [75]:
def centroid_init(k, d):
    ## Here we ideally don't want to do randomized/zero initialization
    centroid_sums = Variable(torch.zeros(k, d))
    centroid_counts = Variable(torch.zeros(k))
    for X in trainloader:
        X_var = Variable(X)
        cluster_assignments = Variable(torch.LongT ensor(X.size(0)).random_(k))
        embeddings = encoder(X_var)
        update_clusters(centroid_sums, centroid_counts,
                        cluster_assignments, embeddings)
    
    centroid_means = centroid_sums / centroid_counts[:, None]
    return centroid_means.clone()

def update_clusters(centroid_sums, centroid_counts,
                    cluster_assignments, embeddings):
    k = centroid_sums.size(0)
    centroid_sums.index_add_(0, cluster_assignments, embeddings)
    np_counts = np.bincount(cluster_assignments.data.numpy(), minlength=k)
    centroid_counts.add_(Variable(torch.FloatTensor(np_counts)))

SyntaxError: invalid syntax (<ipython-input-75-19b691d91e82>, line 7)

### Training Function (un-tailored, needs alterations)

In [76]:
#def train_model(model, centroids, criterion, optimizer, train_loader, valid_loader, num_epochs=10, path_to_save=path+'baseline_model.pt')
def train_model(model, train_loader, valid_loader, num_epochs=10):

#     k, d = centroids.size()
#     centroid_sums = torch.zeros_like(centroids)
#     centroid_counts = Variable(torch.zeros(k))
    
    # run one epoch of gradient descent on autoencoders wrt centroids
    for i, (tokens, flagged_indices, problematics) in enumerate(train_loader):
        
        print(tokens.shape)
        print(flagged_indices.shape)
        print(problematics.shape)
        
        sentence_embed = model(tokens,flagged_indices)
        print(sentence_embed)
        break
        
        # forward pass and compute loss
        X_var = Variable(X)
        embeddings = encoder(X_var)
        X_hat = decoder(embeddings)
        recon_loss = F.mse_loss(X_hat, X_var)
        cluster_loss, cluster_assignments = criterion(embeddings, centroids)
        loss = recon_loss + cluster_loss
        
        # run update step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        # store centroid sums and counts in memory for later centering
        update_clusters(centroid_sums, centroid_counts,
                        cluster_assignments, embeddings)
        
        if verbose and i % print_every == 0:
            batch_hat = autoencoder(Variable(batch))
            plot_batch(batch_hat.data)
            losses = (loss.data[0], recon_loss.data[0], cluster_loss.data[0])
            print('Trn Loss: %.3f [Recon Loss %.3f, Cluster Loss %.3f]' % losses)
        
    # update centroids based on assignments from autoencoders
#     centroid_means = centroid_sums / (centroid_counts[:, None] + 1)
#     return centroid_means, centroid_counts

In [77]:
opts = {
    'vocab_size': 20000,
    'emb_dim': 512
}

In [78]:
model = neuralNetBow(opts['vocab_size'], opts['emb_dim'])
#centroids = centroid_init(2, opts['emb_dim'])
criterion = KMeansCriterion(1)
#optimizer = torch.optim.Adam(chat_model.parameters(), 0.01, amsgrad=True)

In [79]:
train_model(model, train_loader, val_loader)

torch.Size([32, 30])
torch.Size([32])
torch.Size([32])
<class 'torch.Tensor'>
tensor([[ 0.0876, -0.1145, -0.0112,  ..., -0.2560, -0.0780,  0.6067],
        [ 0.1431, -0.5604,  0.0333,  ...,  0.7312, -0.4666,  0.3559],
        [ 0.0825, -0.4727,  0.1372,  ...,  0.4152, -0.3197,  0.3115],
        ...,
        [-0.1941,  0.2782, -0.0242,  ...,  0.2799, -0.0813, -0.2174],
        [ 0.2576, -0.3327,  0.1966,  ...,  0.2630, -0.6485,  0.3384],
        [-0.6450, -0.3033,  0.0216,  ..., -0.0927,  0.1854, -0.1138]],
       grad_fn=<DivBackward0>)


In [84]:
x = torch.tensor([2])
print(type(x))

<class 'torch.Tensor'>


In [83]:
x = torch.Tensor([2])
print(type(x))

<class 'torch.Tensor'>
