In [9]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
from collections import Counter
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc="Progress")

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

In [4]:
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])
# train_df.question_text = train_df.question_text.progress_apply(lambda x: x.strip())

In [5]:
words = Counter()
for sent in tqdm(train_df.question_text.values):
    words.update(w.text.lower() for w in nlp(sent))

100%|██████████| 1306122/1306122 [01:52<00:00, 11635.04it/s]


In [6]:
words = sorted(words, key=words.get, reverse=True)
words = ['_PAD', '_UNK'] + words

word2idx = {o:i for i, o in enumerate(words)}
idx2word = {i:o for i,o in enumerate(words)}

def indexer(s):
    return [word2idx[w.text.lower()] for w in nlp(s)]

# train_df['question_idx'] = train_df.question_text.progress_apply(indexer)
# train_df['lengths'] = train_df.question_idx.progress_apply(len)

# fig = plt.figure(figsize=(8,5))
# ax = sns.distplot(train_df.lengths.values, kde=False);
# ax.set(xlabel='Question length', ylabel='Frequency')

In [7]:
train_df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [8]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# class VectorizeData(Dataset):
#     def __init__(self, df_path):
#         self.df = pd.read_csv(df_path, error_bad_lines=False)
#         self.df['question_text'] = self.df.question_text.progress_apply(lambda x: x.strip())
#         self.df['question_idx'] = self.df.question_text.progress_apply(indexer)
    
#     def __len__(self):
#         return self.df.shape[0]
    
#     def __getitem__(self, idx):
#         X = self.df.question_idx[idx]
#         y = self.df.target[idx]
#         return X, y
    

# ds = VectorizeData('../input/train.csv')
# print(ds[:4])

In [9]:
d1 = DataLoader(dataset=ds, batch_size=3)
print('Total samples', len(d1))

Total samples 435374


In [10]:
it = iter(d1)
xs, ys = next(it)
print('length of smallest question', len(xs))
print(type(xs))
print(xs)

length of smallest question 12
<class 'list'>
[tensor([11, 12, 19]), tensor([55, 17, 28]), tensor([6650,   27, 2016]), tensor([7135,   35,  385]), tensor([ 167, 3872,   81]), tensor([ 65, 481,   2]), tensor([6106,   15,   28]), tensor([  42,   11, 2016]), tensor([  6,  40, 385]), tensor([1156,   17,  461]), tensor([   8, 3665, 5524]), tensor([ 3, 44,  2])]


In [10]:
class VectorizeData(Dataset):
    def __init__(self, df_path, maxlen=100):
        self.maxlen = maxlen
        self.df = pd.read_csv(df_path, error_bad_lines = False)
        self.df['question_text'] = self.df.question_text.progress_apply(lambda x: x.strip())
        print('Indexing...')
        self.df['question_idx'] = self.df.question_text.progress_apply(indexer)
        print('Calculating lengths')
        self.df['lengths'] = self.df.question_idx.progress_apply(lambda x: self.maxlen if len(x)>self.maxlen else len(x))
        print('Padding')
        self.df['question_padded'] = self.df.question_idx.progress_apply(self.pad_data)
    
    def pad_data(self, s):
        padded = np.zeros((self.maxlen,), dtype=np.int64)
        if len(s) > self.maxlen: padded[:]=s[:self.maxlen]
        else: padded[:len(s)] = s
        return padded
    
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        X = self.df.question_padded[idx]
        lens = self.df.lengths[idx]
        y = self.df.target[idx]
        return X, y, lens
    
ds = VectorizeData('../input/train.csv')
print(ds[:5])

Progress: 100%|██████████| 1306122/1306122 [00:01<00:00, 915833.86it/s]
Progress:   0%|          | 46/1306122 [00:00<47:19, 459.97it/s]

Indexing...


Progress: 100%|██████████| 1306122/1306122 [01:44<00:00, 12497.33it/s]
Progress:   5%|▌         | 65861/1306122 [00:00<00:01, 658609.06it/s]

Calculating lengths


Progress: 100%|██████████| 1306122/1306122 [00:01<00:00, 914422.88it/s] 
Progress:   1%|          | 13854/1306122 [00:00<00:09, 138536.83it/s]

Padding


Progress: 100%|██████████| 1306122/1306122 [00:05<00:00, 243123.36it/s]

(0    [11, 55, 6650, 7135, 167, 65, 6106, 42, 6, 115...
1    [12, 17, 27, 35, 3872, 481, 15, 11, 40, 17, 36...
2    [19, 28, 2016, 385, 81, 2, 28, 2016, 385, 461,...
3    [11, 55, 12687, 8277, 51252, 130, 3, 39440, 27...
4    [18, 10, 1134, 42451, 96201, 1368, 7, 6, 3137,...
Name: question_padded, dtype: object, 0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64, 0    14
1    18
2    12
3    10
4    16
Name: lengths, dtype: int64)





In [9]:
d1 = DataLoader(dataset=ds, batch_size=3)
print(len(d1))

435374


In [11]:
it = iter(d1)
xs, ys, lens = next(it)
print(type(xs))
print(xs)

<class 'torch.Tensor'>
tensor([[  11,   55, 6650, 7135,  167,   65, 6106,   42,    6, 1156,    8,    3,
         8597,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [  12,   17,   27,   35, 3872,  481,   15,   11,   40,   17, 3665,   44,
            7, 3102,   13,   51, 1858,    2,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,  

In [13]:
vocab_size = len(words)
embedding_dim = 4
n_hidden = 5
n_out = 2

class SimpleGru(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out):
        super().__init__()
        self.vocab_size, self.embedding_dim, self.n_hidden, self.n_out = vocab_size, embedding_dim, n_hidden, n_out
        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden)
        self.out = nn.Linear(self.n_hidden, self.n_out)
    
    def forward(self, seq, lengths):
        bs = seq.size(1)
#         print('batch size', bs)
        self.h = self.init_hidden(bs) #initialize hidden state of GRU
#         print('Initial hidden state shape: ', self.h.shape)
        embs = self.emb(seq)
        embs = pack_padded_sequence(embs, lengths) #unpad
        gru_out, self.h = self.gru(embs, self.h) #gru returns hidden state of all timesteps and hidden state at last timestep
        gru_out, lengths = pad_packed_sequence(gru_out) #pad the sequence to the  max length in the batch
#         print('GRU output(all timesteps): ', gru_out.shape)
#         print(gru_out)
#         print('GRU last timestep output: ')
#         print(gru_out[-1])
#         print('Last hidden state: ', self.h)
        outp = self.out(self.h[-1])
        return F.log_softmax(outp, dim=-1)
        
    def init_hidden(self, batch_size):
        return Variable(torch.zeros((1, batch_size, self.n_hidden)).cuda())


In [49]:
model = SimpleGru(vocab_size, embedding_dim, n_hidden, n_out)
print(model)

SimpleGru(
  (emb): Embedding(219238, 4)
  (gru): GRU(4, 5)
  (out): Linear(in_features=5, out_features=2, bias=True)
)


In [14]:
def sort_batch(X, y, lengths):
    lengths, idx = lengths.sort(dim=0, descending=True)
    X= X[idx]
    y = y[idx]
    return X.transpose(0,1), y, lengths #transport (batch x seq_length) to (seq_length*batch)

# d1 = DataLoader(ds, batch_size=3)
# it = iter(d1)
# xs, ys, lens = next(it)
# xs, ys, lens = sort_batch(xs,ys,lens)
# outp = model(xs, lens.cpu().numpy())

In [16]:
outp

tensor([[-0.7900, -0.6049],
        [-0.7982, -0.5981],
        [-0.7954, -0.6004]], grad_fn=<LogSoftmaxBackward>)

In [17]:
torch.max(outp, dim=1)

(tensor([-0.6049, -0.5981, -0.6004], grad_fn=<MaxBackward0>),
 tensor([1, 1, 1]))

In [18]:
F.nll_loss(outp, Variable(ys))

tensor(0.7945, grad_fn=<NllLossBackward>)

In [15]:
def fit(model, train_dl, val_dl, loss_fn, opt, epochs=3):
    num_batch = len(train_dl)
    for epoch in tnrange(epochs):
        y_true_train = list()
        y_pred_train = list()
        total_loss_train = 0
        
        if val_dl:
            y_true_val = list()
            y_pred_val = list()
            total_loss_val = 0
            
        t = tqdm_notebook(iter(train_dl), leave=False, total=num_batch)
        for X, y, lengths in t:
            t.set_description(f'Epoch {epoch}')
            X, y, lengths = sort_batch(X, y, lengths)
            X = Variable(X.cuda())
            y = Variable(y.cuda())
            lengths = lengths.numpy()
            
            opt.zero_grad()
            pred = model(X, lengths).cuda()
            loss = loss_fn(pred, y)
            loss.backward()
            opt.step()
            
            t.set_postfix(loss=loss.data)
            pred_idx = torch.max(pred, dim=1)[1]
            
            y_true_train += list(y.cpu().data.numpy())
            y_pred_train += list(pred_idx.cpu().data.numpy())
            total_loss_train += loss.data
            
        train_acc = accuracy_score(y_true_train, y_pred_train)
        train_loss = total_loss_train/len(train_dl)
        print(f'Epoch {epoch}: Train loss: {train_loss} acc: {train_acc}')
        
        if val_dl:
            for X, y, lengths in tqdm_notebook(val_dl, leave=False):
                X, y , lenghts = sort_batch(X, y, lengths)
                X = Variable(X.cuda())
                y = Variable(y.cuda())
                pred = model(X, lengths.numpy())
                loss = loss_fn(pred, y)
                pred_idx = torch.max(pred, 1)[1]
                y_true_val += list(y.cpu().data.numpy())
                y_pred_val += list(pred_idx.cpu().data.numpy())
                total_loss_val += loss.data[0]
            valacc = accuracy_score(y_true_val, y_pred_val)
            valloss = total_loss_val/len(val_dl)
            print(f'Val loss: {valloss} acc: {valacc}')
            

In [52]:
train_dl = DataLoader(ds, batch_size=512)
model = SimpleGru(vocab_size, embedding_dim, n_hidden, n_out).cuda()
opt = optim.Adam(model.parameters(), 1e-2)

In [53]:
fit(model=model, train_dl=train_dl, val_dl=None, loss_fn=F.nll_loss, opt=opt, epochs=4)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2552), HTML(value='')))

Epoch 0: Train loss: 0.12927229702472687 acc: 0.9515627177246843


HBox(children=(IntProgress(value=0, max=2552), HTML(value='')))

Epoch 1: Train loss: 0.10469355434179306 acc: 0.9575453135312015


HBox(children=(IntProgress(value=0, max=2552), HTML(value='')))

Epoch 2: Train loss: 0.09778326004743576 acc: 0.9600427831397067


HBox(children=(IntProgress(value=0, max=2552), HTML(value='')))

Epoch 3: Train loss: 0.0924949049949646 acc: 0.9621375338597773


In [60]:
torch.save(model.state_dict(), 'simple_gru.pt')

In [16]:
class ConcatPoolingGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out):
        super().__init__()
        self.vocab_size, self.embedding_dim, self.n_hidden, self.n_out = vocab_size, embedding_dim, n_hidden, n_out
        self.emb=nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden)
        self.out = nn.Linear(self.n_hidden*3, self.n_out)
        
    def forward(self, seq, lengths):
        self.h = self.init_hidden(seq.size(1), gpu)
        embs = self.emb(seq)
        embs = pack_padded_sequence(embs, lengths)
        gru_out, self.h = self.gru(embs, self.h)
        gru_out, lengths = pad_packed_sequence(gru_out)
        avg_pool = F.adaptive_avg_pool1d(gru_out.permute(1,2,0),1).view(seq.size(1), -1)
        print('adaptive avg pooling', avg_pool)
        avg_pool_byhand = torch.sum(gru.out, dim=0)/Variable(torch.FloatTensor(lengths).view(-1, 1))
        print('By Hand adaptive avg pooling', avg_pool_byhand)
        max_pool = F.adaptive_max_pool1d(gru_out.permute(1,2,0),1).view(sequ.size(1), -1)
        print('adative max pooling', max_pool)
        max_pool_byhand = torch.cat([torch.max(i[:l], dim=0)[0].view(1,-1) for i, l in zip(gru_out.permute(1,0,2), lengths)], dim=0)
        print('by hand adaptive max pooling', max_pool_byhand)
        outp = self.out(torch.cat([self.h[-1], avg_pool_byhand, max_pool_byhand], dim=1))
        return F.log_softmax(outp, dim=-1)
    
    def init_hidden(self, batch_size, gpu):
        Variable(torch.zeros((1, batch_size, self.n_hidden)).cuda())
        

In [55]:
model = ConcatPoolingGRU(vocab_size, embedding_dim, n_hidden, n_out)
print(model)

ConcatPoolingGRU(
  (emb): Embedding(219238, 4)
  (gru): GRU(4, 5)
  (out): Linear(in_features=15, out_features=2, bias=True)
)


In [57]:
dl = DataLoader(ds, batch_size=3)
it = iter(dl)
xs, ys, lens = next(it)

In [59]:
xs, ys,lens = sort_batch(xs, ys, lens)
outp = model(xs, lens.cpu().numpy())

NameError: name 'gpu' is not defined

In [33]:
class ConcatPoolingGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out):
        super().__init__()
        self.vocab_size, self.embedding_dim, self.n_hidden, self.n_out = vocab_size, embedding_dim, n_hidden, n_out
        self.emb=nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden)
        self.out = nn.Linear(self.n_hidden*3, self.n_out)
        
    def forward(self, seq, lengths):
        self.h = self.init_hidden(seq.size(1))
        embs = self.emb(seq)
        embs = pack_padded_sequence(embs, lengths)
        gru_out, self.h = self.gru(embs, self.h)
        gru_out, lengths = pad_packed_sequence(gru_out)
        avg_pool = F.adaptive_avg_pool1d(gru_out.permute(1,2,0),1).view(seq.size(1), -1)
#         print('adaptive avg pooling', avg_pool)
#         avg_pool_byhand = torch.FloatTensor(torch.sum(gru_out, dim=0))/Variable(torch.FloatTensor(lengths).view(-1, 1))
#         print('By Hand adaptive avg pooling', avg_pool_byhand)
        max_pool = F.adaptive_max_pool1d(gru_out.permute(1,2,0),1).view(seq.size(1), -1)
#         print('adative max pooling', max_pool)
#         max_pool_byhand = torch.cat([torch.max(i[:l], dim=0)[0].view(1,-1) for i, l in zip(gru_out.permute(1,0,2), lengths)], dim=0)
#         print('by hand adaptive max pooling', max_pool_byhand)
        outp = self.out(torch.cat([self.h[-1], avg_pool, max_pool], dim=1))
        return F.log_softmax(outp, dim=-1)
    
    def init_hidden(self, batch_size):
        Variable(torch.zeros((1, batch_size, self.n_hidden)).cuda())

In [34]:
train_dl = DataLoader(ds, batch_size=512)
model = ConcatPoolingGRU(vocab_size, embedding_dim, n_hidden, n_out).cuda()
opt = optim.Adam(model.parameters(), 1e-2)
fit(model=model, train_dl = train_dl, val_dl =None, loss_fn =F.nll_loss, opt=opt, epochs=4)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2552), HTML(value='')))

Epoch 0: Train loss: 0.1321483850479126 acc: 0.9498331702551522


HBox(children=(IntProgress(value=0, max=2552), HTML(value='')))

Epoch 1: Train loss: 0.10505302995443344 acc: 0.9573163915775096


HBox(children=(IntProgress(value=0, max=2552), HTML(value='')))

Epoch 2: Train loss: 0.09773273020982742 acc: 0.959960861236546


HBox(children=(IntProgress(value=0, max=2552), HTML(value='')))

Epoch 3: Train loss: 0.09225675463676453 acc: 0.9622064401334638
