In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os, re, random

import warnings
warnings.filterwarnings('ignore') # to suppress some matplotlib deprecation warnings

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torch
import torchtext
from torchtext import data, vocab
from tqdm import tnrange, tqdm_notebook
from tqdm import tqdm as tbar
import spacy # lib with tokenizer
# from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import f1_score

In [None]:
# Root folder of the input data; the pretrained embeddings sit in a subfolder of it.
path = "../input"
emb_path = f"{path}/embeddings"

Reworked from this [article](https://medium.com/@sonicboom8/sentiment-analysis-torchtext-55fb57b1fab8).

In [None]:
# Read just the first few rows to check the column names and their formats.
train_preview_path = os.path.join(path, 'train.csv')
train_df = pd.read_csv(train_preview_path, nrows=10)
train_df.head()

## Create tokenizer

In [None]:
# Shared tweet-style tokenizer: strips @handles and shortens over-long repeated characters.
tknzr = TweetTokenizer(reduce_len=True, strip_handles=True)

In [None]:
def tokenizer(s):
    """Tokenize the string ``s`` with the shared ``tknzr`` tokenizer and return its tokens."""
    tokens = tknzr.tokenize(s)
    return tokens

## Load the data and split it into training and validation sets

In [None]:
# Describe how each CSV column is parsed.
# Question text: tokenized, mapped through a vocabulary, and batched together with its lengths.
txt_field = data.Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenizer,
    include_lengths=True,
)
# Target: a single value per row that is used without a vocabulary lookup.
label_field = data.Field(sequential=False, use_vocab=False, is_target=True)

# One (column name, field) pair per CSV column, in file order; None means "skip this column".
train_fields = [
    ('qid', None),
    ('question_text', txt_field),
    ('target', label_field),
]
df_file = data.TabularDataset(
    path=os.path.join(path, 'train.csv'),
    format='csv',
    fields=train_fields,
    skip_header=True,
)

In [None]:
# Seed Python's RNG by *calling* random.seed. Assigning to it (`random.seed = 2018`)
# only replaces the function with an int and leaves the generator unseeded,
# so the split below would differ on every run.
random.seed(2018)
# Stratified 80/20 split on the label, driven by the freshly seeded RNG state.
train, valid = df_file.split(
    split_ratio=0.8,
    stratified=True,
    strata_field='target',
    random_state=random.getstate(),
)

In [None]:
# Sanity-check the split: sizes, registered fields, and one tokenized example.
n_train, n_valid = len(train), len(valid)
print('Train size: {}, validation size: {}'.format(n_train, n_valid))
print('Our fields in data: \n', train.fields.items())
example = train[0]
print('First question: \n', example.question_text)

In [None]:
# List the contents of the embeddings directory.
!ls ../input/embeddings/

Create a directory for the embedding cache.

In [None]:
# Create the cache directory used for the pretrained vectors.
# exist_ok=True keeps the cell re-runnable: a plain `mkdir` fails if the folder already exists.
os.makedirs('cache', exist_ok=True)

In [None]:
# Point at the locally stored GloVe vectors and load them, using cache/ as the cache directory.
glove_file = os.path.join(emb_path, 'glove.840B.300d/glove.840B.300d.txt')
vec = vocab.Vectors(glove_file, cache='cache/')

# Build the text vocabulary from both the training and validation splits and attach the vectors.
txt_field.build_vocab(train, valid, max_size=300000, vectors=vec)
# Build the vocabulary for the labels.
label_field.build_vocab(train)

print(txt_field.vocab.vectors.shape)
# torch.Size([274124, 300])

In [None]:
# First 10 components of the pretrained vector stored for the '?' token.
question_mark_idx = txt_field.vocab.stoi['?']
txt_field.vocab.vectors[question_mark_idx][:10]

In [None]:
# Iterator
class BatchGenerator:
    """Wrap a batch iterator and yield plain ``(X, y)`` pairs.

    ``dl`` is any sized iterable of batch objects. ``x_field`` and ``y_field``
    are the attribute names read from each batch for the inputs and targets.
    """

    def __init__(self, dl, x_field, y_field):
        self.dl = dl
        self.x_field = x_field
        self.y_field = y_field

    def __len__(self):
        """Number of batches, as reported by the wrapped iterator."""
        return len(self.dl)

    def __iter__(self):
        """Yield ``(inputs, targets)`` per batch, with the targets cast to float."""
        for batch in self.dl:
            inputs = getattr(batch, self.x_field)
            targets = getattr(batch, self.y_field).float()
            yield (inputs, targets)

In [None]:
# Model hyper-parameters.
vocab_size = len(txt_field.vocab)  # number of rows in the embedding matrix
embedding_dim = 300                # width of the pretrained vectors loaded above
n_hidden = 256                     # GRU hidden size per direction
n_out = 1                          # a single probability per question
# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# https://github.com/hpanwar08/sentiment-analysis-torchtext
class ConcatPoolingGRUAdaptive(nn.Module):
    """GRU classifier over frozen pretrained embeddings with concat pooling.

    The GRU outputs are average-pooled and max-pooled over the time axis, the
    two pooled vectors are concatenated, and a linear layer followed by a
    sigmoid maps them to ``n_out`` values in ``[0, 1]``.

    Args:
        vocab_size: number of rows of the embedding matrix.
        embedding_dim: width of each embedding vector.
        n_hidden: GRU hidden size per direction.
        n_out: number of output units.
        pretrained_vec: tensor of shape ``(vocab_size, embedding_dim)`` copied
            into the embedding layer, which is then frozen.
        bidirectional: whether the GRU runs in both directions.
    """

    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out, pretrained_vec, bidirectional=True):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_hidden = n_hidden
        self.n_out = n_out
        self.bidirectional = bidirectional

        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.emb.weight.data.copy_(pretrained_vec)  # load pretrained vectors
        self.emb.weight.requires_grad = False  # keep the embedding frozen
        # No `dropout` argument: nn.GRU applies it only between stacked layers, so on
        # this single-layer GRU it had no effect and merely triggered a warning.
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden, bidirectional=bidirectional)
        n_directions = 2 if bidirectional else 1
        # Input width: (avg pool + max pool) x hidden size x number of directions.
        self.out = nn.Linear(self.n_hidden * n_directions * 2, self.n_out)

    def forward(self, seq, lengths):
        """Return probabilities of shape ``(batch, n_out)``.

        Args:
            seq: token ids of shape ``(seq_len, batch)``.
            lengths: true sequence lengths, sorted in decreasing order as
                ``pack_padded_sequence`` expects by default.
        """
        bs = seq.size(1)
        # Allocate the initial state on the input's device rather than a notebook global.
        self.h = self.init_hidden(bs, device=seq.device)
        # An embedding lookup is element-wise, so it keeps the (seq_len, batch) layout.
        embs = self.emb(seq)
        embs = pack_padded_sequence(embs, lengths)
        gru_out, self.h = self.gru(embs, self.h)
        gru_out, lengths = pad_packed_sequence(gru_out)

        # (seq_len, batch, features) -> (batch, features, seq_len) for the 1-d pooling ops.
        # NOTE(review): both pools run over the padded length, so the zero padding of
        # shorter sequences takes part in the average and the maximum.
        feats = gru_out.permute(1, 2, 0)
        avg_pool = F.adaptive_avg_pool1d(feats, 1).view(bs, -1)
        max_pool = F.adaptive_max_pool1d(feats, 1).view(bs, -1)
        outp = self.out(torch.cat([avg_pool, max_pool], dim=1))
        return torch.sigmoid(outp)  # F.sigmoid is deprecated

    def init_hidden(self, batch_size, device=None):
        """Return a zero initial hidden state of shape ``(directions, batch_size, n_hidden)``.

        Args:
            batch_size: number of sequences in the batch.
            device: where to allocate the state; defaults to the device that
                holds the embedding weights.
        """
        if device is None:
            device = self.emb.weight.device
        n_directions = 2 if self.bidirectional else 1
        return torch.zeros((n_directions, batch_size, self.n_hidden), device=device)


In [None]:
# Bucketed iterators over the training and validation datasets.
traindl, valdl = data.BucketIterator.splits(
    datasets=(train, valid),
    batch_sizes=(1024, 1024),  # (train, validation) batch sizes
    # attribute used to group examples of similar length
    sort_key=lambda x: len(x.question_text),
    device=device,  # same device as the model, instead of a second hard-coded 'cuda'
    # sort each batch by length: the model packs its input with pack_padded_sequence
    sort_within_batch=True,
    repeat=False,
)

# Wrap the iterators so that they yield ((tokens, lengths), target) tuples.
train_dl = BatchGenerator(traindl, 'question_text', 'target')
val_dl = BatchGenerator(valdl, 'question_text', 'target')

model = ConcatPoolingGRUAdaptive(vocab_size, embedding_dim, n_hidden, n_out,
                                 train.fields['question_text'].vocab.vectors).to(device)

# Optimize only the trainable parameters (the embedding layer is frozen).
opt = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 1e-3)
loss_fn = F.binary_cross_entropy

In [None]:
from time import sleep
num_batch = len(train_dl)
epochs = 4
for epoch in range(epochs):
    # ---- training pass ----
    train_true_chunks, train_pred_chunks = [], []
    total_loss_train = 0
    model.train()
    for (X, lengths), y in train_dl:
        lengths = lengths.cpu().numpy()
        opt.zero_grad()
        # The model returns (batch, 1); drop the trailing axis so that the prediction
        # matches the (batch,) target expected by binary_cross_entropy.
        pred = model(X, lengths).squeeze(1)
        loss = loss_fn(pred, y)
        loss.backward()
        opt.step()

        # Collect per-batch arrays and concatenate once per epoch
        # (concatenating inside the loop is quadratic in the number of batches).
        train_true_chunks.append(y.detach().cpu().numpy())
        train_pred_chunks.append(pred.detach().cpu().numpy())
        total_loss_train += loss.item()

    y_true_train = np.concatenate([np.empty(0)] + train_true_chunks, axis=0)
    y_pred_train = np.concatenate([np.empty(0)] + train_pred_chunks, axis=0)
    tacc = f1_score(y_true_train, y_pred_train > 0.5)  # F1 at a 0.5 threshold
    tloss = total_loss_train / num_batch

    if val_dl:
        # ---- validation pass ----
        model.eval()
        val_true_chunks, val_pred_chunks = [], []
        total_loss_val = 0
        # No gradients are needed for evaluation; this saves memory and time.
        with torch.no_grad():
            for (X, lengths), y in val_dl:
                pred = model(X, lengths.cpu().numpy()).squeeze(1)
                loss = loss_fn(pred, y)
                val_true_chunks.append(y.cpu().numpy())
                val_pred_chunks.append(pred.cpu().numpy())
                total_loss_val += loss.item()
        y_true_val = np.concatenate([np.empty(0)] + val_true_chunks, axis=0)
        y_pred_val = np.concatenate([np.empty(0)] + val_pred_chunks, axis=0)
        vacc = f1_score(y_true_val, y_pred_val > 0.5)
        vloss = total_loss_val / len(val_dl)
        print(f'Epoch {epoch}: Train loss: {tloss:.4f} F1: {tacc:.4f} | Validation loss: {vloss:.4f} F1: {vacc:.4f}')
    else:
        # The original referenced an undefined `train_acc` here, which raised NameError.
        print(f'Epoch {epoch}: Train loss: {tloss:.4f} F1: {tacc:.4f}')

In [None]:
# Field for the question id column of the test set; it is created with RawField's defaults.
qid_field = data.RawField()

In [None]:
# Test CSV columns in file order: the id uses qid_field, the text reuses the training text field.
test_fields = [('qid', qid_field), ('question_text', txt_field)]

In [None]:
# Parse the test CSV with the test field definitions (the header row is skipped).
test_csv_path = os.path.join(path, 'test.csv')
test_df = data.TabularDataset(
    path=test_csv_path,
    format='csv',
    fields=test_fields,
    skip_header=True,
)

In [None]:
# Explicitly mark the qid field as a non-target field.
# NOTE(review): presumably a workaround for torchtext versions where RawField has no
# `is_target` attribute but batch construction reads it — confirm against the installed version.
test_df.fields['qid'].is_target = False

In [None]:
# Bucketed iterator over the test set. Batches are sorted by length internally
# because the model packs its input with pack_padded_sequence.
test_ld = data.BucketIterator(
    test_df,
    batch_size=512,
    device=device,  # follow the shared device setting instead of a hard-coded 'cuda'
    sort_key=lambda x: len(x.question_text),
    sort_within_batch=True,
    repeat=False,
)

In [None]:
# Iterator
class BatchGeneratorTest:
    """Wrap a batch iterator and yield ``(X, qid)`` pairs for inference.

    ``dl`` is any sized iterable of batch objects. ``x_field`` and ``qid_field``
    are the attribute names read from each batch for the inputs and the ids.
    """

    def __init__(self, dl, x_field, qid_field):
        self.dl = dl
        self.x_field = x_field
        self.qid_field = qid_field

    def __len__(self):
        """Number of batches, as reported by the wrapped iterator."""
        return len(self.dl)

    def __iter__(self):
        """Yield ``(inputs, ids)`` for every batch of the wrapped iterator."""
        for batch in self.dl:
            inputs = getattr(batch, self.x_field)
            ids = getattr(batch, self.qid_field)
            yield (inputs, ids)

In [None]:
# Wrap the test iterator so that each step yields (question_text, qid) for a batch.
test_loader = BatchGeneratorTest(test_ld, 'question_text', 'qid')

In [None]:
# Run the trained model over the test set, keeping the ids aligned with the predictions.
model.eval()
test_pred_chunks = []
qids = []
# Inference only: skip gradient tracking to save memory and time.
with torch.no_grad():
    for (tokens, lengths), batch_qids in test_loader:
        pred = model(tokens, lengths.cpu().numpy())
        qids.append(batch_qids)
        test_pred_chunks.append(pred.cpu().numpy())
# Concatenate once at the end (per-batch concatenation is quadratic); the empty
# (0, 1) seed keeps the float64 dtype and the empty-loader behaviour of the original.
preds = np.concatenate([np.empty((0, 1))] + test_pred_chunks, axis=0)
preds = preds.flatten()

In [None]:
# Flatten the per-batch id lists into one flat list aligned with `preds`.
# Entries that are already plain strings are kept whole, so re-running this cell is
# harmless; the naive double loop would split every id into characters on a second run.
qids = [qid for entry in qids for qid in ([entry] if isinstance(entry, str) else entry)]

In [None]:
!rm -r cache

In [None]:
# Threshold the probabilities at 0.5 and write one 0/1 prediction per question id.
labels = (preds > 0.5).astype(np.int8)
submission = pd.DataFrame({'prediction': labels}, index=pd.Index(qids, name='qid'))
submission.to_csv('submission.csv')