In [2]:
%matplotlib inline
import os, sys
import re
import string
import pathlib
import random
from collections import Counter, OrderedDict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torchtext
from torchtext import data
from torchtext import vocab

from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.metrics import accuracy_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

import warnings
warnings.filterwarnings('ignore')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
print('Python version:',sys.version)
print('Pandas version:',pd.__version__)
print('Pytorch version:', torch.__version__)
print('Torch Text version:', torchtext.__version__)
print('Spacy version:', spacy.__version__)

Python version: 3.7.1 (default, Dec 14 2018, 19:28:38) 
[GCC 7.3.0]
Pandas version: 0.23.4
Pytorch version: 1.0.1.post2
Torch Text version: 0.3.1
Spacy version: 2.0.18


In [4]:
df = pd.read_csv('../input/train.csv', error_bad_lines=False)
df.shape
df.head()

(1306122, 3)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [5]:
def split_train_test(df, test_size =0.2):
    train, val = train_test_split(df, test_size=test_size, random_state=42)
    return train.reset_index(drop=True), val.reset_index(drop=True)

In [6]:
df_train, df_val = split_train_test(df, test_size=0.2)

In [8]:
print(df_train.shape)
print(df_train.head())
print(df_train.target.value_counts())

(1044897, 3)
                    qid                                      question_text  \
0  3a820a95342d28ad402f  How is strategic positioning is different from...   
1  9fca299caa0cf8f12eac  What is the best way for promote Facebook mark...   
2  de23f10ad011a6fb13c7  How much energized proton radiation does the I...   
3  d2eef16340896e963a63  Would any Indian men want to marry a women tha...   
4  27d584db9bd46b6ab44e  Which is the best business for startups in Ind...   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  
0    979943
1     64954
Name: target, dtype: int64


In [9]:
print(df_val.shape)
print(df_val.head())
print(df_val.target.value_counts())

(261225, 3)
                    qid                                      question_text  \
0  56d324bb1e2c29f43b12  What is the most effective classroom managemen...   
1  b9ad893dc78c577f8a63  Can I study abroad after 10th class from Bangl...   
2  6689ebaeeb65b209a412        How can I make friends as a college junior?   
3  ba1e2c4a0fef09671516  How do I download free APK Minecraft: Pocket E...   
4  c9ea2b69bf0d74626f46  Like Kuvera, is "Groww" also a free online inv...   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  
0    245369
1     15856
Name: target, dtype: int64


In [10]:
df_train.to_csv('../input/df_train.csv', index=False)
df_val.to_csv('../input/df_val.csv', index=False)

In [31]:
df_train.head()

Unnamed: 0,qid,question_text,target
0,3a820a95342d28ad402f,How is strategic positioning is different from...,0
1,9fca299caa0cf8f12eac,What is the best way for promote Facebook mark...,0
2,de23f10ad011a6fb13c7,How much energized proton radiation does the I...,0
3,d2eef16340896e963a63,Would any Indian men want to marry a women tha...,0
4,27d584db9bd46b6ab44e,Which is the best business for startups in Ind...,0


In [36]:
type(nlp)

spacy.lang.en.English

In [11]:
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])
def tokenizer(s): return [w.text.lower() for w in nlp(question_clean(s))]

In [41]:
from pdb import set_trace
def question_clean(text):
    text = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", 
        str(text))
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
    text = re.sub(r'https?:/\/\S+', ' ', text)
#     set_trace()
    return text.strip()

In [44]:
txt_field = data.Field(sequential=True, tokenize=tokenizer, include_lengths=True, use_vocab=True)
label_field = data.Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None)

train_val_fields = [
    ('qid', None), 
    ('question_text', txt_field), 
    ('label', label_field)
]

In [45]:
%%time
ds_train, ds_val = data.TabularDataset.splits(path='../input', format='csv', train='df_train.csv',
                                             validation='df_val.csv', fields = train_val_fields, skip_header=True)


CPU times: user 1min 51s, sys: 1.47 s, total: 1min 53s
Wall time: 1min 53s


In [15]:
type(ds_train)

torchtext.data.dataset.TabularDataset

In [17]:
print(len(ds_train))
print(len(ds_val))

1044897
261225


In [43]:
ex = ds_train[0]
print(type(ex))
print(ds_train.fields.items())
print(ex.target)
print(ex.question_text)

<class 'torchtext.data.example.Example'>
dict_items([('qid', None), ('target', <torchtext.data.field.Field object at 0x7f4246ec2828>), ('question_text', <torchtext.data.field.Field object at 0x7f4246ec2048>)])
['how', 'is', 'strategic', 'positioning', 'is', 'different', 'from', 'marketing', 'positioning']
0


In [30]:
ex.question_text

['0']

In [20]:
vec = vocab.Vectors('glove.840B.300d.txt', '../input/glove.840B.300d/')

100%|█████████▉| 2195556/2196017 [03:35<00:00, 11708.86it/s]

In [46]:
txt_field.build_vocab(ds_train, ds_val, max_size=100000, vectors=vec)
label_field.build_vocab(ds_train)

In [47]:
txt_field.vocab.vectors.shape

torch.Size([100002, 300])

In [56]:
dl_train, dl_val = data.BucketIterator.splits(datasets=(ds_train, ds_val), 
                                            batch_sizes=(3,3), 
                                            sort_key=lambda x: len(x.question_text), 
                                            device=device, 
                                            sort_within_batch=True, 
                                            repeat=False)

In [49]:
len(dl_train)

348299

In [50]:
len(dl_val)

87075

In [57]:
batch = next(iter(dl_train))
type(batch)

torchtext.data.batch.Batch

In [58]:
batch.label

tensor([1, 0, 0], device='cuda:0')

In [59]:
batch.question_text #returns word indices and lengths

(tensor([[    4,     3,    27],
         [   18,     4,   716],
         [  207,     2,    68],
         [    2,  2147,  8673],
         [ 2130,   541,   268],
         [   43,     8,   367],
         [16289,   327,    54],
         [ 4460,     7,   448],
         [   93,  1234,   435],
         [14790,   263,     1]], device='cuda:0'),
 tensor([10, 10,  9], device='cuda:0'))

In [60]:
batch.dataset.fields

{'qid': None,
 'question_text': <torchtext.data.field.Field at 0x7f41281e46d8>,
 'label': <torchtext.data.field.Field at 0x7f41281e45f8>}

In [61]:
txt_field.vocab.itos[1]

'<pad>'

In [62]:
def idxtosent(batch, idx):
    return ' '.join([txt_field.vocab.itos[i] for i in batch.question_text[0][:,idx].cpu().data.numpy()])

In [63]:
idxtosent(batch, 0)

'is it true the russians who liberated berlin were beasts'

In [64]:
idxtosent(batch, 1)

'what is the traveling speed of earth in solar family'

In [65]:
idxtosent(batch,2)

'does eating more spicy food cause any health problem <pad>'

In [95]:
class BatchGenerator:
    def __init__(self, dl, x_field, y_field):
        self.dl, self.x_field, self.y_field = dl, x_field, y_field
        
    def __len__(self):
        return len(self.dl)
    
    def __iter__(self):
        for batch in self.dl:
            X = getattr(batch, self.x_field)
            XX = list(X)
            lengths = XX[1]
            if 0 in lengths:
                continue
            else:
                y = getattr(batch, self.y_field)
                yield (X,y)

In [96]:
train_batch_it = BatchGenerator(dl_train, 'question_text', 'label')
next(iter(train_batch_it))

((tensor([[   3,  397,   10],
          [   4,   40,   16],
          [ 320,    2,    5],
          [  38,  864,  408],
          [ 171,   28,  214],
          [1047, 3842,  869],
          [   8, 5825,   95],
          [ 316, 3012, 1043]], device='cuda:0'),
  tensor([8, 8, 8], device='cuda:0')),
 tensor([0, 0, 1], device='cuda:0'))

In [70]:
vocab_size = len(txt_field.vocab)
embedding_dim = 300
n_hidden = 64
n_out = 2

In [71]:

class SimpleGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out, pretrained_vec, bidirectional=True):
        super().__init__()
        self.vocab_size,self.embedding_dim,self.n_hidden,self.n_out,self.bidirectional = vocab_size, embedding_dim, n_hidden, n_out, bidirectional
        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.emb.weight.data.copy_(pretrained_vec)
        self.emb.weight.requires_grad = False
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden, bidirectional=bidirectional)
        self.out = nn.Linear(self.n_hidden, self.n_out)
        
    def forward(self, seq, lengths):
        bs = seq.size(1) # batch size
        seq = seq.transpose(0,1)
        self.h = self.init_hidden(bs) # initialize hidden state of GRU
        embs = self.emb(seq)
        embs = embs.transpose(0,1)
        embs = pack_padded_sequence(embs, lengths) # unpad
        gru_out, self.h = self.gru(embs, self.h) # gru returns hidden state of all timesteps as well as hidden state at last timestep
        gru_out, lengths = pad_packed_sequence(gru_out) # pad the sequence to the max length in the batch
        # since it is as classification problem, we will grab the last hidden state
        outp = self.out(self.h[-1]) # self.h[-1] contains hidden state of last timestep
#         return F.log_softmax(outp, dim=-1)
        return F.log_softmax(outp)
    
    def init_hidden(self, batch_size): 
        if self.bidirectional:
            return torch.zeros((2,batch_size,self.n_hidden)).to(device)
        else:
            return torch.zeros((1,batch_size,self.n_hidden)).to(device)

In [None]:
def fit(model, train_dl, val_dl, loss_fn, opt, epochs=3):
    num_batch = len(train_dl)
    for epoch in tnrange(epochs):      
        y_true_train = list()
        y_pred_train = list()
        total_loss_train = 0          
        
        t = tqdm_notebook(iter(train_dl), leave=False, total=num_batch)
        for (X,lengths),y in t:
            t.set_description(f'Epoch {epoch}')
            lengths = lengths.cpu().numpy()
            
            opt.zero_grad()
            #try:
            pred = model(X, lengths)
#             except:
#                 set_trace()
            loss = loss_fn(pred, y)
            loss.backward()
            opt.step()
            
            t.set_postfix(loss=loss.item())
            pred_idx = torch.max(pred, dim=1)[1]
            
            y_true_train += list(y.cpu().data.numpy())
            y_pred_train += list(pred_idx.cpu().data.numpy())
            total_loss_train += loss.item()
            
        train_acc = accuracy_score(y_true_train, y_pred_train)
        train_loss = total_loss_train/len(train_dl)
        
        if val_dl:
            y_true_val = list()
            y_pred_val = list()
            total_loss_val = 0
            for (X,lengths),y in tqdm_notebook(val_dl, leave=False):
                pred = model(X, lengths.cpu().numpy())
                loss = loss_fn(pred, y)
                pred_idx = torch.max(pred, 1)[1]
                y_true_val += list(y.cpu().data.numpy())
                y_pred_val += list(pred_idx.cpu().data.numpy())
                total_loss_val += loss.item()
            valacc = accuracy_score(y_true_val, y_pred_val)
            valloss = total_loss_val/len(valdl)
            print(f'Epoch {epoch}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {valloss:.4f} val_acc: {valacc:.4f}')
        else:
            print(f'Epoch {epoch}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f}')

In [97]:
traindl, valdl = data.BucketIterator.splits(datasets=(ds_train, ds_val), batch_sizes=(512,1024), sort_key=lambda x: len(x.question_text), device=device, sort_within_batch=True, repeat=False)
train_batch_it = BatchGenerator(traindl, 'question_text', 'label')
val_batch_it = BatchGenerator(valdl, 'question_text', 'label')

In [98]:
m = SimpleGRU(vocab_size, embedding_dim, n_hidden, n_out, ds_train.fields['question_text'].vocab.vectors).to(device)
opt = optim.Adam(filter(lambda p: p.requires_grad, m.parameters()), 1e-3)
# set_trace()
fit(model=m, train_dl=train_batch_it, val_dl=val_batch_it, loss_fn=F.nll_loss, opt=opt, epochs=5)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2041), HTML(value='')))

HBox(children=(IntProgress(value=0, max=256), HTML(value='')))

Epoch 0: train_loss: 0.1214 train_acc: 0.9521 | val_loss: 0.1073 val_acc: 0.9582


HBox(children=(IntProgress(value=0, max=2041), HTML(value='')))

HBox(children=(IntProgress(value=0, max=256), HTML(value='')))

Epoch 1: train_loss: 0.1050 train_acc: 0.9583 | val_loss: 0.1029 val_acc: 0.9597


HBox(children=(IntProgress(value=0, max=2041), HTML(value='')))

HBox(children=(IntProgress(value=0, max=256), HTML(value='')))

Epoch 2: train_loss: 0.1002 train_acc: 0.9598 | val_loss: 0.1016 val_acc: 0.9594


HBox(children=(IntProgress(value=0, max=2041), HTML(value='')))

HBox(children=(IntProgress(value=0, max=256), HTML(value='')))

Epoch 3: train_loss: 0.0966 train_acc: 0.9612 | val_loss: 0.0999 val_acc: 0.9602


HBox(children=(IntProgress(value=0, max=2041), HTML(value='')))

HBox(children=(IntProgress(value=0, max=256), HTML(value='')))

Epoch 4: train_loss: 0.0934 train_acc: 0.9624 | val_loss: 0.1020 val_acc: 0.9593


In [104]:
df_test.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [None]:
df_test = pd.read_csv('../input/test.csv').reset_index(drop=True)
df_test.to_csv('../input/df_test.csv', index=False)


test_fields = [('qid', None), ('question_text', txt_field)]
ds_test= data.TabularDataset(path='../input/df_test.csv', format='csv', train='test.csv',
                                              fields = test_fields, skip_header=True)


In [107]:
test_iter = data.Iterator(ds_test, batch_size=64, device=-1, sort=False, sort_within_batch=False, repeat=False)

AttributeError: 'tuple' object has no attribute 'sort_key'

In [103]:
ex = ds_test[0]
print(type(ex))
print(ds_test.fields.items())
print(ex.target)
print(ex.question_text)

<class 'torchtext.data.dataset.TabularDataset'>


AttributeError: 'tuple' object has no attribute 'fields'