In [1]:
from shutil import copyfile
copyfile(src="../input/inputs/generate_dataloaders.py", dst="../working/generate_dataloaders.py")
copyfile(src="../input/inputs/train_dataloader.p", dst="../working/train_dataloader.p")
copyfile(src="../input/inputs/val_dataloader.p", dst="../working/val_dataloader.p")
copyfile(src="../input/inputs/centroids_dataloader.p", dst="../working/ground_truth_dataloader.p")
copyfile(src="../input/inputs/dictionary.p", dst="../working/dictionary.p")

'../working/dictionary.p'

In [2]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

#from datasets import get_mnist_dataset, get_data_loader
#from utils import *
#from models import *

import pickle as pkl
import os
import datetime as dt
import pandas as pd
import random

from generate_dataloaders import *

from tqdm import tqdm_notebook as tqdm

## Get Dataloaders

In [3]:
def get_dataloaders(train_filename,val_filename):
    path = os.getcwd()
    ## data_dir = path + '/data/' #Uncomment for local system
    data_dir = path + '/'
    train_dataloader = pkl.load(open(data_dir + train_filename,'rb'))
    val_dataloader = pkl.load(open(data_dir + val_filename,'rb'))
    return train_dataloader,val_dataloader

In [4]:
seed = 1029
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
np.random.seed(seed)  # Numpy module.
random.seed(seed)  # Python random module.
torch.manual_seed(seed)
torch.backends.cudnn.enabled = False 
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

def _init_fn(worker_id):
    np.random.seed(int(seed))

In [5]:
path = os.getcwd()
data_dir = path + '/'
# data_dir = path +'/data/' #Uncomment for local system

In [6]:
train_loader,val_loader = get_dataloaders('train_dataloader.p','val_dataloader.p')

In [7]:
ground_truth_dataloader = pkl.load(open(data_dir + 'ground_truth_dataloader.p','rb'))

In [8]:
review_dict = pkl.load(open(data_dir + 'dictionary.p','rb'))

In [9]:
#%conda install pytorch torchvision -c pytorch
## if torch.__version__ is not 1.3.1, run this cell then restart kernel

In [10]:
print(torch.__version__)

1.3.0


## Scratchwork (IGNORE)

In [11]:
for i,x in enumerate(train_loader):
    print(len(x[0]))
    break

32


In [12]:
minibatch = torch.tensor([
                            [[1,2,3,4,5],[3,3,3,3,3],[1,1,1,1,1],[2,1,2,1,2]],
                            [[0,1,0,1,0],[1,1,1,1,1],[2,0,0,0,0],[0,0,0,0,2]]
                         ], dtype=torch.float32)

flagged_indices = torch.tensor([1,2])

upweight_value = 10

print(minibatch.shape)
print(minibatch)

print(flagged_indices.shape)
print(flagged_indices)

torch.Size([2, 4, 5])
tensor([[[1., 2., 3., 4., 5.],
         [3., 3., 3., 3., 3.],
         [1., 1., 1., 1., 1.],
         [2., 1., 2., 1., 2.]],

        [[0., 1., 0., 1., 0.],
         [1., 1., 1., 1., 1.],
         [2., 0., 0., 0., 0.],
         [0., 0., 0., 0., 2.]]])
torch.Size([2])
tensor([1, 2])


In [13]:
batch_size, num_tokens, emb_dim = minibatch.shape
print(type(minibatch))
minibatch[range(batch_size),flagged_indices,:] *= upweight_value
print(batch_size, num_tokens, emb_dim)
minibatch

<class 'torch.Tensor'>
2 4 5


tensor([[[ 1.,  2.,  3.,  4.,  5.],
         [30., 30., 30., 30., 30.],
         [ 1.,  1.,  1.,  1.,  1.],
         [ 2.,  1.,  2.,  1.,  2.]],

        [[ 0.,  1.,  0.,  1.,  0.],
         [ 1.,  1.,  1.,  1.,  1.],
         [20.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  2.]]])

In [14]:
minibatch.sum(1) / (num_tokens + upweight_value - 1)

tensor([[2.6154, 2.6154, 2.7692, 2.7692, 2.9231],
        [1.6154, 0.1538, 0.0769, 0.1538, 0.2308]])

In [15]:
print(type(minibatch))

<class 'torch.Tensor'>


In [16]:
embed = torch.tensor(np.array([[2,4,5,6],[1,3,45,7],[3,4,5,6]]))

In [17]:
centers = torch.tensor(np.array(([2,3,4,5],[1,2,4,5])))

In [18]:
torch.sum((embed[:,None,:]-centers)**2,2)

tensor([[   3,    7],
        [1686, 1686],
        [   4,   10]])

In [19]:
cluster_distances, cluster_assignments = torch.sum((embed[:,None,:]-centers)**2, 2).min(1)
cluster_assignments

tensor([0, 1, 0])

In [20]:
for i, (tokens, labels, flagged_indices) in enumerate(train_loader):
    #print(tokens, labels, flagged_indices)
    break

In [21]:
cluster_assts = torch.LongTensor([1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
        0, 0, 0, 1, 0, 0, 0, 1])
k = 2
bin_counts = torch.bincount(cluster_assts,minlength=k)

In [22]:
bin_counts = bin_counts.type(torch.FloatTensor).to(current_device)
bin_counts

NameError: name 'current_device' is not defined

## PRE TRAINED WORD EMBEDDINGS 

In [23]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float16')

In [24]:
def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))

In [25]:
def build_matrix(review_dict, embedding_index ,dim = 200):
#     embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(review_dict.tokens), dim))
    unknown_words = []
    
    for word, i in review_dict.ids.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words

In [26]:
glove_twitter = '../input/glove-global-vectors-for-word-representation/glove.twitter.27B.200d.txt' #Change loc for local system

In [27]:
embedding_index = load_embeddings(glove_twitter)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [28]:
glove_embedding_index,unknown_words = build_matrix(review_dict, embedding_index)
del embedding_index

In [29]:
len(review_dict.tokens)

16256

In [31]:
len(unknown_words)

4428

In [30]:
for word in unknown_words:
    print(word)

<pad>
<unk>
job-
seller..always
...
7
1-2
a++
3am3i
qaiser
raven09
liudmila
ksenka
venanzio
3d
dialux
asael
galland21
spoonlancer
timescales
subsiststudios
anoob
1st
ana-maria
2
3
yiga
chriistalrock
4
jeegy
100
5*
99designs
bossall691
charni
dark-o
ocelittle
99design
.extremely
micromanagement
firebase
redlogo
6th
work..
betiobca
qaulity
time-crunch
datafox
saimun
zunair
vtiger
sign.yra
laurra
anjul
bananodromo
katie_kat
a+++
umme
karlatut
susual
you..
after..
designs-
dmytro
hand-drawn/sketched/painted
re-designing
hourlies
2nd
bmdseoprovider
donmare
wintrygrey
a+++++
feedback/advice
prismonline
micro-alloying
time.y
lemoor
luthfun
eyecatching
reesee
vinith
ps_design
accomodating
creativeinfoway
fritzr
rosko
olgo
ginder
07
2018
02
andrewzz
exceded
gourav
experioence
dependably
maeastro
torvs
dreammaster
..
5
20139gph
thanks..
propositive
severak
expreience
brthr-ed
lucadia
krlegend
professionist
4th
-yes
sadafs
22
ateendras
standad
rziko1
re-hiring
igreg
12
shafayat
esolz
zenefasions


milikat
bossall
zhu2hui
topdraw
skeuomorphic
.not
shepper
r28
šaška
job/s
creavtivity
peopleperhour.com
ruxi
zoomix
photographer..
budget..
future..
firoza
devloped
..he
rodaina
extreemly
scriptlancer
❤️❤️❤️
fromnpankajj
mituka
mac..thanks
take-offs
whitehorsetechn
anyone.thank
satisfied..great
pritesh
heartdesire
www.standardstalk.com
mikeslusive
co-active
dezven
communicaton
jaswinder
venupriya
arvice
diffidently
holiday26
accomodated
awayws
aymdesigns
artisitc
o/p
transparrent
overexpected
lemonetea
hetal
freelancer.successful
shareens
futire
follows-up
vincentag
whiteknight
a++++++++++++
rbm777
ghafoor
quick/prompt
-d
perfrectly
konny
nishu
wildchords
arquimedia
zabir
impinge
dragona99
services..check
www.tribalmuse.net
shoiab
lalith
corrections/changes
concept/message
okdesignstudio
morshed
veyko
assitance
communictaion
comforatable
mmarif1982
initivie
natcym
creativity..
zornista
him..he
calafia
-and
manner-
kavish
esp8266
hrvoje
chebanu
6-9
'smart
argim
uneekin
.again
pleased-
t

shwete
expereince
illlustrator
done..
again1
change/tweak
behond
-good
hesistate
panameralab
re-engage
cohesiveness
rekommanderet
do..would
abderrafi
deliverred
calm-
definiely
brinkal
microcontrollers
comunicating
deivery
2012
geoff..
year..
concept/design
suggestions..
sulinder
karlatuts
plus-
'just
usual1
bharagav
repectful
sibayan
described..
proactiveness
deilivered
noorul
wrealy
notice..
electronical
proffessionaly
phuoc
mariellik
yuryk
yashpal
swapnil
deigns
ploose
iva_
quivk
quickly..
designer..amazing
do.and
madeureka
shopiin
gcart
bodhan
designs/presence
fabulous.a
industriousness
abebooks
'cut
daelinzan
warnaihari
janeal
asadollah
well-thought-out
com-up
spiritual/holistic
next..
clarifcations
5v
12v
piggyand
gergana
ariadni
accomidate
++++brthr-ed++++
selller
meryofttheangels77
..we
stefaan
great-unique
10pm
-just
vitalina
suvadip
huffingtonpost.de
samiku
absulotly
'added
issues-related
agniezka
freeelancer
overall-
borislav
rapidity
transcriber
..will
n/a
coltp
liaise
perf

## Neural Network Class

NOTE: Data loader is defined as:
- tuple: (tokens, flagged_index, problematic)

In [85]:
class neuralNetBow_glove(nn.Module):
    """
    BagOfWords classification model
    """
    # NOTE: we can't use linear layer until we take weighted average, otherwise it will
    # remember certain positions incorrectly (ie, 4th word has bigger weights vs 7th word)
    def __init__(self, embedding_matrix, upweight=10):
        super(neuralNetBow_glove, self).__init__()
        vocab_size = embedding_matrix.shape[0]
        embed_size = embedding_matrix.shape[1]
        
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=2)
        
        self.embed.weight = nn.Parameter(torch.tensor(embedding_matrix,
                                                          dtype=torch.float32))
        self.embed.weight.requires_grad = False
        
        
        self.upweight = upweight
    
    def forward(self, tokens, flagged_index):
        batch_size, num_tokens = tokens.shape
        embedding = self.embed(tokens)
#         print(embedding.shape) # below assumes "batch_size x num_tokens x Emb_dim" (VERIFY)
        
        # upweight by flagged_index
#         print(type(embedding))
        embedding[torch.LongTensor(range(batch_size)),flagged_index.type(torch.LongTensor),:] *= self.upweight
        
        # average across embeddings
        embedding_ave = embedding.sum(1) / (num_tokens + self.upweight - 1)
        
        return embedding_ave

### Clustering Stuff (un-tailored)

In [86]:
class KMeansCriterion(nn.Module):
    
    def __init__(self, lmbda):
        super().__init__()
        self.lmbda = lmbda
    
    def forward(self, embeddings, centroids):
        distances = torch.sum((embeddings[:, None, :] - centroids)**2, 2)
        cluster_distances, cluster_assignments = distances.min(1)
        loss = self.lmbda * cluster_distances.sum()
        return loss, cluster_assignments

In [87]:
def centroid_init(k, d, dataloader, model, current_device):
    ## Here we ideally don't want to do randomized/zero initialization
    centroid_sums = torch.zeros(k, d).to(current_device)
    centroid_counts = torch.zeros(k).to(current_device)
    for (tokens, labels, flagged_indices) in dataloader:
        # cluster_assignments = torch.LongTensor(tokens.size(0)).random_(k)
        cluster_assignments = labels.to(current_device)
        
        model.eval()
        sentence_embed = model(tokens.to(current_device),flagged_indices.to(current_device))
    
        update_clusters(centroid_sums, centroid_counts,
                        cluster_assignments, sentence_embed.to(current_device))
    
    centroid_means = centroid_sums / centroid_counts[:, None].to(current_device)
    return centroid_means.clone()

def update_clusters(centroid_sums, centroid_counts,
                    cluster_assignments, embeddings):
    k = centroid_sums.size(0)

    centroid_sums.index_add_(0, cluster_assignments, embeddings)
    bin_counts = torch.bincount(cluster_assignments,minlength=k).type(torch.FloatTensor).to(current_device)
    centroid_counts.add_(bin_counts)
    
    #np_cluster_assignments = cluster_assignments.to('cpu')
    #np_counts = np.bincount(np_cluster_assignments.data.numpy(), minlength=k)
    #centroid_counts.add_(torch.FloatTensor(np_counts))

### Training Function (un-tailored, needs alterations)

In [88]:
def train_model(model, centroids, criterion, optimizer, train_loader, valid_loader, num_epochs=10, path_to_save=None, print_every = 1000):

    train_losses=[]
    val_losses=[]
    num_gpus = torch.cuda.device_count()
    if num_gpus > 0:
        current_device = 'cuda'
    else:
        current_device = 'cpu'
    
    for epoch in range(num_epochs):
        print('{} | Epoch {}'.format(dt.datetime.now(), epoch))
        model.train()
        k, d = centroids.size()
        centroid_sums = torch.zeros_like(centroids).to(current_device)
        centroid_counts = torch.zeros(k).to(current_device)
        total_epoch_loss = 0

        # run one epoch of gradient descent on autoencoders wrt centroids
        for i, (tokens, labels, flagged_indices) in tqdm(enumerate(train_loader)):
            tokens = tokens.to(current_device)
            labels = labels.to(current_device)
            flagged_indices = flagged_indices.to(current_device)

            # forward pass and compute loss
            sentence_embed = model(tokens,flagged_indices)
            cluster_loss, cluster_assignments = criterion(sentence_embed, centroids.detach())

            # run update step
            optimizer.zero_grad()
#             cluster_loss.backward()
            optimizer.step()
            
            #Add loss to the epoch loss
            total_epoch_loss += cluster_loss.data

            # store centroid sums and counts in memory for later centering
            update_clusters(centroid_sums, centroid_counts,
                            cluster_assignments, sentence_embed)

            if i % print_every == 0:
                losses = cluster_loss.data/len(tokens)
                print('Average training loss at batch ',i,': %.3f' % losses)
            
        total_epoch_loss /= len(train_loader.dataset)
        train_losses.append(total_epoch_loss)
        print('Average training loss after epoch ',epoch,': %.3f' % total_epoch_loss)
        
        # update centroids based on assignments from autoencoders
        centroids = centroid_sums / (centroid_counts[:, None] + 1).to(current_device)
        
        # calculate validation loss after every epoch
        total_validation_loss = 0
        for i, (tokens, labels, flagged_indices) in enumerate(valid_loader):
            model.eval()
            tokens = tokens.to(current_device)
            labels = labels.to(current_device)
            flagged_indices = flagged_indices.to(current_device)
            
            # forward pass and compute loss
            sentence_embed = model(tokens,flagged_indices)
            cluster_loss, cluster_assignments = criterion(sentence_embed, centroids)
            
            #Add loss to the validation loss
            total_validation_loss += cluster_loss.data

        total_validation_loss /= len(valid_loader.dataset)
        val_losses.append(total_validation_loss)
        print('Average validation loss after epoch ',epoch,': %.3f' % total_validation_loss)
        
        if path_to_save == None:
            pass
        else:
            opts = {"vocab_size":model.vocab_size, "embed_size": model.embed_size}
            torch.save(model.state_dict(), path_to_save+'model_dict.pt')
            torch.save(centroids, path_to_save+'centroids')
            torch.save(train_losses, path_to_save+'train_losses')
            torch.save(val_losses, path_to_save+'val_losses')
            torch.save(opts, path_to_save+'opts')
            
        
    return model, centroids, train_losses, val_losses

In [89]:
num_gpus = torch.cuda.device_count()
if num_gpus > 0:
    current_device = 'cuda'
else:
    current_device = 'cpu'

model = neuralNetBow_glove(glove_embedding_index).to(current_device)

In [90]:
# model = neuralNetBow(opts['vocab_size'], opts['emb_dim'])
centroids = centroid_init(2, 200,ground_truth_dataloader, model, current_device)
criterion = KMeansCriterion(1).to(current_device)
optimizer = torch.optim.Adam(model.parameters(), 0.01, amsgrad=True)

In [91]:
centroids

tensor([[ 6.8292e-02,  1.0521e-02, -1.1470e-02,  7.2093e-02, -1.8306e-02,
          1.0145e-01,  2.8836e-01,  9.0186e-03, -8.0530e-02, -1.0073e-01,
         -8.2945e-02, -8.3735e-02, -3.7348e-01, -6.0185e-02,  5.7898e-04,
          6.9175e-02,  1.6187e-02,  7.5986e-03, -6.3130e-02, -2.7320e-02,
         -5.9389e-02,  8.5293e-03, -4.1944e-02,  4.8727e-02, -7.6512e-02,
          4.8027e-01, -3.7244e-02,  1.0092e-01,  1.3747e-01,  1.0604e-02,
         -2.9348e-02, -9.7453e-02, -4.3732e-02,  1.7378e-02,  1.1165e-01,
          9.2251e-02,  2.6641e-02, -3.0175e-02,  7.8488e-03, -1.6710e-02,
          2.3404e-01, -1.3387e-02,  4.8180e-02,  4.4078e-02, -1.7893e-02,
         -2.1284e-03,  7.7306e-02,  2.4613e-02, -6.8325e-02,  9.8534e-03,
          3.3565e-03,  1.9454e-02, -1.2342e-01, -2.6541e-03,  9.3597e-02,
          6.1619e-03,  9.5398e-03, -4.9843e-02, -1.7416e-02, -2.6598e-02,
         -1.4866e-02,  4.4059e-03, -9.9971e-02, -4.5498e-02,  5.4118e-02,
          7.1149e-02, -8.5065e-02, -3.

In [92]:
current_device

'cuda'

In [93]:
review_dict.get_id("the")

41

In [94]:
torch.tensor([41])

tensor([41])

In [95]:
model.embed(torch.tensor([41]).to(current_device))

tensor([[ 4.9341e-01,  3.5693e-01,  6.6064e-01, -3.2990e-02,  2.4988e-01,
          2.5928e-01, -2.7176e-02,  6.8420e-02, -2.9053e-01, -4.5703e-01,
         -7.7942e-02,  3.2520e-01, -1.4854e+00, -6.7444e-02, -1.7029e-01,
         -9.2926e-03,  3.4619e-01, -1.1574e-02,  3.7964e-02,  4.5605e-01,
          8.0505e-02,  1.5308e-01, -1.5308e-01, -1.8811e-01, -1.8201e-01,
          8.7256e-01,  3.9795e-01,  4.0991e-01,  4.4971e-01, -1.9646e-03,
         -4.1138e-02, -4.7882e-02, -2.4048e-01, -8.6853e-02,  1.4183e-02,
         -2.3755e-01,  2.5171e-01,  2.8540e-01,  4.4507e-01, -4.9634e-01,
         -1.2708e-01, -1.7480e-01,  8.2214e-02,  4.5410e-02,  5.1709e-01,
          3.4546e-02, -8.5815e-02, -3.4912e-01,  5.2197e-01, -3.9502e-01,
          6.4148e-02, -4.2017e-01, -1.5942e-01,  1.8286e-01, -5.7892e-02,
         -1.9180e-02, -4.4556e-01,  3.1543e-01, -1.6101e-01, -9.2163e-02,
         -2.4963e-01, -1.3895e-03, -4.2651e-01, -1.7932e-01,  8.1665e-02,
          1.8323e-01, -3.2056e-01, -1.

In [96]:
path = os.getcwd()
# model_dir = path + '/models/baseline_frozen_glove/' #Uncomment for local system
model_dir = ''

In [97]:
baseline_model, baseline_centroids, baseline_train_losses, baseline_val_losses = train_model(model, centroids, criterion, optimizer, train_loader, val_loader, num_epochs=5, path_to_save=model_dir + )baseline_model, baseline_centroids, baseline_train_losses, baseline_val_losses = train_model(model, centroids, criterion, optimizer, train_loader, val_loader, num_epochs=5, path_to_save=model_dir)

2019-11-17 02:41:09.734046 | Epoch 0


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.556
Average training loss at batch  1000 : 1.828
Average training loss at batch  2000 : 2.484
Average training loss at batch  3000 : 1.976

Average training loss after epoch  0 : 1.996
Average validation loss after epoch  0 : 1.599
2019-11-17 02:41:15.019407 | Epoch 1


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.650
Average training loss at batch  1000 : 1.876
Average training loss at batch  2000 : 1.458
Average training loss at batch  3000 : 1.909

Average training loss after epoch  1 : 1.753
Average validation loss after epoch  1 : 1.587
2019-11-17 02:41:20.297777 | Epoch 2


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.571
Average training loss at batch  1000 : 1.797
Average training loss at batch  2000 : 1.985
Average training loss at batch  3000 : 1.637

Average training loss after epoch  2 : 1.679
Average validation loss after epoch  2 : 1.583
2019-11-17 02:41:25.561449 | Epoch 3


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.343
Average training loss at batch  1000 : 1.567
Average training loss at batch  2000 : 1.575
Average training loss at batch  3000 : 1.651

Average training loss after epoch  3 : 1.653
Average validation loss after epoch  3 : 1.576
2019-11-17 02:41:30.911311 | Epoch 4


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Average training loss at batch  0 : 1.771
Average training loss at batch  1000 : 1.944
Average training loss at batch  2000 : 1.553
Average training loss at batch  3000 : 1.768

Average training loss after epoch  4 : 1.643
Average validation loss after epoch  4 : 1.571


In [98]:
#Only needed for Kaggle

from IPython.display import FileLink, FileLinks 
FileLinks('.') #lists all downloadable files on server