In [1]:
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
import datetime
import spacy
import sklearn

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

In [2]:
# f = open('../../data/sentiment/positive')
# pos = f.read()
# f.close()

# f = open('../../data/sentiment/negative')
# neg = f.read()
# f.close()

In [2]:
# Load Spacy word embeddings
word_embeddings = spacy.load('en', vectors='glove.6B.300d.txt')

In [4]:
# Create a function to get vector format data for a sequence
def sequence_to_data(seq, max_len=None):
    seq = unicode(seq)
    data = [word_embeddings(ix).vector for ix in seq.split()]
    
    if max_len is None:
        max_len = len(data)
    
    data_mat = np.zeros((1, max_len, 300))
    
    for ix in range(min(len(data), max_len)):
        data_mat[:, ix, :] = data[ix]
    
    return data_mat

def seq_data_matrix(seq_data, max_len=None):
    n_seq = len(seq_data)
    data = np.concatenate([sequence_to_data(ix, max_len) for ix in seq_data], axis=0)
    
    return data
    
q = sequence_to_data(u'hello! what is the date today?', 100)
print q.shape

(1, 100, 300)


In [5]:
# df = pd.DataFrame([], columns=['text', 'score'])
# for ix in pos.split('\n'):
#     text = ix.strip().lower()
#     if len(text) > 1:
#         df = df.append({'text': text, 'score': 1}, ignore_index=True)
#     # print sequence_to_data(ix.strip().lower()).shape

# for ix in neg.split('\n'):
#     text = ix.strip().lower()
#     if len(text) > 1:
#         df = df.append({'text': text, 'score': 0}, ignore_index=True)

In [5]:
# df = sklearn.utils.shuffle(df).reset_index(drop=True)
df = pd.read_csv('../../data/sentiment/dataset.csv', sep='|', index_col=0)

In [6]:
df.head()

Unnamed: 0,text,score
0,a gushy episode of m a s h only this time...,0.0
1,the storys pathetic and the gags are puerile,0.0
2,not only a comingofage story and cautionary pa...,1.0
3,beyond a handful of mildly amusing lines th...,0.0
4,a complex psychological drama about a father w...,1.0


In [10]:
a = pd.DataFrame([], columns=['x'])

for ix in range(100):
    a = a.append({'x': ix}, ignore_index=True)

In [12]:
a['x_sq'] = a.x.apply(lambda x: x**2)
a.head()

Unnamed: 0,x,x_sq
0,0.0,0.0
1,1.0,1.0
2,2.0,4.0
3,3.0,9.0
4,4.0,16.0


In [7]:
df['len'] = df['text'].str.split().apply(lambda x: len(x))
# df = df.sort_index(ascending=False).reset_index(drop=True)

In [8]:
df.head()

Unnamed: 0,text,score,len
0,a gushy episode of m a s h only this time...,0.0,15
1,the storys pathetic and the gags are puerile,0.0,8
2,not only a comingofage story and cautionary pa...,1.0,15
3,beyond a handful of mildly amusing lines th...,0.0,14
4,a complex psychological drama about a father w...,1.0,16


In [10]:
# df.to_csv('../../data/sentiment/dataset.csv', sep='|')

In [13]:
bucket_sizes = [[0, 10], [10, 15], [15, 20], [20, 25], [25, 45]]

def assign_bucket(x):
    for bucket in bucket_sizes:
        if x > bucket[0] and x <= bucket[1]:
            return bucket_sizes.index(bucket)
    return len(bucket_sizes)-1

In [14]:
df['bucket'] = df.len.apply(assign_bucket)
df.head()

Unnamed: 0,text,score,len,bucket
0,a gushy episode of m a s h only this time...,0.0,15,1
1,the storys pathetic and the gags are puerile,0.0,8,0
2,not only a comingofage story and cautionary pa...,1.0,15,1
3,beyond a handful of mildly amusing lines th...,0.0,14,1
4,a complex psychological drama about a father w...,1.0,16,2


In [16]:
df = df.sort(columns=['bucket'])
df.head()

  if __name__ == '__main__':


Unnamed: 0,text,score,len,bucket
9136,a brisk reverent and subtly different sequel,1.0,7,0
1777,too silly to take seriously,0.0,5,0
6700,the date movie that franz kafka would have made,1.0,9,0
3929,an inexperienced director mehta has much to l...,0.0,8,0
8836,an uneven mix of dark satire and childhood awa...,0.0,9,0


In [22]:
df[(df.bucket == 0)]

Unnamed: 0,text,score,len,bucket
9136,a brisk reverent and subtly different sequel,1.0,7,0
1777,too silly to take seriously,0.0,5,0
6700,the date movie that franz kafka would have made,1.0,9,0
3929,an inexperienced director mehta has much to l...,0.0,8,0
8836,an uneven mix of dark satire and childhood awa...,0.0,9,0
1789,smothered by its own solemnity,0.0,5,0
8832,an absorbing documentary,1.0,3,0
3921,it took 19 predecessors to get this,1.0,7,0
8828,reggios continual visual barrage is absorbing ...,1.0,10,0
8825,a stylistic romp thats always fun to watch,1.0,8,0


In [23]:
def make_batch(data, batch_size=10, gpu=True):
    for bx in range(len(bucket_sizes)):
        bucket_data = df[(df.bucket == bx)].reset_index(drop=True)
        # print bx, bucket_sizes[bx][1], bucket_data.shape
        
        start = 0
        stop = start + batch_size
        
        while start < bucket_data.shape[0]:
            seq_length = bucket_sizes[bx][1]
            section = bucket_data[start:stop]
            X_data = seq_data_matrix(section.text, max_len=seq_length)
            y_data = section.score
            
            if gpu:
                yield Variable(torch.FloatTensor(X_data).cuda(), requires_grad=True), Variable(torch.LongTensor(y_data)).cuda()
            else:
                yield Variable(torch.FloatTensor(X_data), requires_grad=True), Variable(torch.LongTensor(y_data))
            
            start = stop
            stop = start + batch_size

In [24]:
for ix, iy in make_batch(df, batch_size=1000):
    print ix.shape, iy.shape

torch.Size([1000, 10, 300]) torch.Size([1000])
torch.Size([1000, 10, 300]) torch.Size([1000])
torch.Size([49, 10, 300]) torch.Size([49])
torch.Size([1000, 15, 300]) torch.Size([1000])
torch.Size([1000, 15, 300]) torch.Size([1000])
torch.Size([81, 15, 300]) torch.Size([81])
torch.Size([1000, 20, 300]) torch.Size([1000])
torch.Size([1000, 20, 300]) torch.Size([1000])
torch.Size([333, 20, 300]) torch.Size([333])
torch.Size([1000, 25, 300]) torch.Size([1000])
torch.Size([968, 25, 300]) torch.Size([968])
torch.Size([1000, 45, 300]) torch.Size([1000])
torch.Size([1000, 45, 300]) torch.Size([1000])
torch.Size([231, 45, 300]) torch.Size([231])


In [16]:
# df.head(10)
# Printing colored text (Useful later)
# print colored("hello red world", 'blue')# print 'a'

In [26]:
class SeqModel(nn.Module):
    def __init__(self, in_shape=None, out_shape=None, hidden_shape=None):
        super(SeqModel, self).__init__()
        self.in_shape = in_shape
        self.out_shape = out_shape
        self.hidden_shape = hidden_shape
        self.n_layers = 1
        
        self.rnn = nn.LSTM(
            input_size=self.in_shape,
            hidden_size=self.hidden_shape,
            num_layers=self.n_layers,
            batch_first=True
        )
        self.lin = nn.Linear(self.hidden_shape, 64)
        self.dropout = nn.Dropout(0.42)
        self.out = nn.Linear(64, self.out_shape)
    
    def forward(self, x, h):
        r_out, h_state = self.rnn(x, h)
        last_out = r_out[:, -1, :]
        y = F.tanh(self.lin(last_out))
        y = self.dropout(y)
        y = F.softmax(self.out(y))
        return y
    
    def predict(self, x):
        h_state = self.init_hidden(1, gpu=False)
        
        x = sequence_to_data(x)
        pred = self.forward(torch.FloatTensor(x), h_state)
        
        return pred
    
    def get_embedding(self, x):
        h_state = self.init_hidden(1, gpu=False)
        
        x = sequence_to_data(x)
        r_out, h = self.rnn(torch.FloatTensor(x), h_state)
        last_out = r_out[:, -1, :]
        
        return last_out.data.numpy()
            
    def init_hidden(self, batch_size, gpu=True):
        if gpu:
            return (Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape).cuda()),
                    Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape)).cuda())
        return (Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape)),
                Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape)))

In [48]:
model = SeqModel(in_shape=300, hidden_shape=256, out_shape=2)

print model
model.cuda()

SeqModel(
  (rnn): LSTM(300, 256, batch_first=True)
  (lin): Linear(in_features=256, out_features=64, bias=True)
  (dropout): Dropout(p=0.42)
  (out): Linear(in_features=64, out_features=2, bias=True)
)


SeqModel(
  (rnn): LSTM(300, 256, batch_first=True)
  (lin): Linear(in_features=256, out_features=64, bias=True)
  (dropout): Dropout(p=0.42)
  (out): Linear(in_features=64, out_features=2, bias=True)
)

In [49]:
# model.predict('hello bad world')

# Load the model
model.load_state_dict(torch.load('/home/shubham/all_projects/CB/Summer_2018/data/checkpoints/seq_lstm_bucket/model_256h_epoch_700.ckpt'))

In [34]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
criterion = nn.CrossEntropyLoss()

In [35]:
# Set to train mode
# model.cuda()
model.train()

for epoch in range(50):
    total_loss = 0
    N = 0
    for step, (b_x, b_y) in enumerate(make_batch(df, batch_size=200)):
        # print step, b_x.shape, b_y.shape
        bsize = b_x.size(0)
        
        h_state = model.init_hidden(bsize, gpu=True)

        pred = model(b_x, h_state)
        loss = criterion(pred, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss
        N += 1.0
        if step%20 == 0:
            print 'Loss: {} at Epoch: {} | Step: {}'.format(loss, epoch, step)
        
    print "Overall Average Loss: {} at Epoch: {}".format(total_loss / float(N), epoch)
    
    # Save model checkpoints
    if epoch % 10 == 0:
        torch.save(model.state_dict(), "/home/shubham/all_projects/CB/Summer_2018/data/checkpoints/seq_lstm_bucket/model_256h_epoch_{}.ckpt".format(epoch))



Loss: 0.69742333889 at Epoch: 0 | Step: 0
Loss: 0.69163531065 at Epoch: 0 | Step: 20
Loss: 0.692053377628 at Epoch: 0 | Step: 40
Overall Average Loss: 0.694422006607 at Epoch: 0
Loss: 0.692090451717 at Epoch: 1 | Step: 0
Loss: 0.687447428703 at Epoch: 1 | Step: 20
Loss: 0.689199209213 at Epoch: 1 | Step: 40
Overall Average Loss: 0.689712703228 at Epoch: 1
Loss: 0.681926727295 at Epoch: 2 | Step: 0
Loss: 0.668727934361 at Epoch: 2 | Step: 20
Loss: 0.647350132465 at Epoch: 2 | Step: 40
Overall Average Loss: 0.674889683723 at Epoch: 2
Loss: 0.63601154089 at Epoch: 3 | Step: 0
Loss: 0.608234405518 at Epoch: 3 | Step: 20
Loss: 0.593071639538 at Epoch: 3 | Step: 40
Overall Average Loss: 0.638657689095 at Epoch: 3
Loss: 0.577009916306 at Epoch: 4 | Step: 0
Loss: 0.578892111778 at Epoch: 4 | Step: 20
Loss: 0.581489622593 at Epoch: 4 | Step: 40
Overall Average Loss: 0.612688839436 at Epoch: 4
Loss: 0.549013614655 at Epoch: 5 | Step: 0
Loss: 0.570038259029 at Epoch: 5 | Step: 20
Loss: 0.57555544

In [50]:
# Make ppredictions
model.eval()
model.cpu()

SeqModel(
  (rnn): LSTM(300, 256, batch_first=True)
  (lin): Linear(in_features=256, out_features=64, bias=True)
  (dropout): Dropout(p=0.42)
  (out): Linear(in_features=64, out_features=2, bias=True)
)

In [51]:
model.predict('I am going to some place to take play a game')



tensor([[ 1.0000e+00,  1.3468e-15]])

In [29]:
import sklearn.metrics

In [30]:
model.cpu()

v1 = model.get_embedding('I am going to a place')
v2 = model.get_embedding('I am not going')
print v1.shape, v2.shape

(1, 256) (1, 256)


In [31]:
sklearn.metrics.pairwise.cosine_distances(v1, v2)

array([[0.7183737]], dtype=float32)