### Inspired by:
* https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings
* https://www.kaggle.com/shujian/single-rnn-with-4-folds-v1-9
* http://mlexplained.com/2018/01/13/weight-normalization-and-layer-normalization-explained-normalization-in-deep-learning-part-2/
* https://arxiv.org/abs/1607.06450
* https://github.com/keras-team/keras/issues/3878
* https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings
* https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout
* https://www.kaggle.com/aquatic/entity-embedding-neural-net
* https://www.kaggle.com/hireme/fun-api-keras-f1-metric-cyclical-learning-rate
* https://ai.google/research/pubs/pub46697
* https://blog.openai.com/quantifying-generalization-in-reinforcement-learning/
* https://www.kaggle.com/rasvob/let-s-try-clr-v3
* https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/3%20-%20Faster%20Sentiment%20Analysis.ipynb
* https://www.kaggle.com/ziliwang/pytorch-text-cnn
* https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py
* https://github.com/clairett/pytorch-sentiment-classification/blob/master/bilstm.py
* https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html


trying torch...

Much harder than Keras, but it feels more rewarding once it works.

In [None]:
import numpy as np # linear algebra
import sys

# Print NumPy arrays in full: raise the element-count threshold at which
# NumPy starts summarising arrays to the largest possible value.
np.set_printoptions(threshold=sys.maxsize)
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show what is available under ../input, the embeddings folder and the
# GoogleNews word2vec folder.
print(os.listdir("../input"))
print(os.listdir("../input/embeddings"))
print(os.listdir("../input/embeddings/GoogleNews-vectors-negative300"))

# Any results you write to the current directory are saved as output.

import gensim
from gensim.utils import simple_preprocess
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,f1_score,precision_recall_fscore_support,recall_score,precision_score
from sklearn.utils import class_weight
import matplotlib.pyplot as plt


#https://www.kaggle.com/shujian/single-rnn-with-4-folds-v1-9
def threshold_search(y_true, y_proba):
    """Scan the cut-offs 0.00, 0.01, ..., 0.99 and keep the best one by F1.

    Args:
        y_true: ground-truth binary labels.
        y_proba: predicted scores. Each candidate is applied as
            ``y_proba > cutoff``, so the object must support that element-wise
            comparison (e.g. a NumPy array).

    Returns:
        dict with the keys ``'threshold'`` and ``'f1'``. On ties the lowest
        cut-off wins; if no candidate reaches an F1 above 0, both values
        remain 0.
    """
    winner = {'threshold': 0, 'f1': 0}
    for step in range(100):
        cutoff = step * 0.01
        current = f1_score(y_true=y_true, y_pred=y_proba > cutoff)
        # Strict comparison so that an earlier (lower) cut-off survives a tie.
        if current > winner['f1']:
            winner = {'threshold': cutoff, 'f1': current}
    return winner

In [None]:
import random
import torch

def seed_everything(seed=1234):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also exports the value as ``PYTHONHASHSEED`` and switches cuDNN to its
    deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed all RNGs once, up front, with the fixed value 6017.
seed_everything(6017)
print('Seeding done...')

In [None]:
import torchtext
import random
from nltk import word_tokenize

# Question text: lower-cased, tokenised with NLTK's word_tokenize, batch-first,
# with fix_length=100 so every example has the same length.
text = torchtext.data.Field(lower=True, batch_first=True, tokenize=word_tokenize, fix_length=100)
# Question ids of the submission file (default Field settings).
qid = torchtext.data.Field()
# Label column: a single non-sequential value used as-is (no vocabulary).
target = torchtext.data.Field(sequential=False, use_vocab=False, is_target=True)
# Labelled data: CSV column 'question_text' -> attribute 'text',
# column 'target' -> attribute 'target'.
train_dataset = torchtext.data.TabularDataset(path='../input/train.csv', format='csv',
                                      fields={'question_text': ('text',text),
                                              'target': ('target',target)})

# 80/20 split stratified on the label, driven by the current `random` state.
# NOTE: the name `train` is rebound later in this file by `def train(...)`;
# anything that needs the dataset must be built before that definition runs.
train, test = train_dataset.split(split_ratio=[0.8,0.2],stratified=True,strata_field='target',random_state=random.getstate())

# Unlabelled competition test file, reusing the same text field.
submission_x = torchtext.data.TabularDataset(path='../input/test.csv', format='csv',
                                     fields={'qid': ('qid', qid),
                                             'question_text': ('text', text)})

# The text vocabulary is built over the full labelled set plus the submission
# set, keeping tokens that occur at least 3 times.
# NOTE(review): SentimentBase hard-codes 75966 embedding rows - presumably the
# size of this vocabulary; confirm if the data or min_freq changes.
text.build_vocab(train_dataset, submission_x, min_freq=3)
qid.build_vocab(submission_x)
print('train dataset len:',len(train_dataset))
print('train len:',len(train))
print('test len:',len(test))

In [None]:
# Load the GloVe 840B 300-d vectors from disk and attach them to the text
# vocabulary (dim=300); the models read them back via `text.vocab.vectors`.
glove = torchtext.vocab.Vectors('../input/embeddings/glove.840B.300d/glove.840B.300d.txt')
text.vocab.set_vectors(glove.stoi, glove.vectors, dim=300)

In [None]:
import torchtext.data

# Batch size shared by the train, held-out and submission iterators.
batch_size = 512
print('batch_size:',batch_size)
print('---')

# Training batches: shuffled, not sorted.
train_loader = torchtext.data.BucketIterator(dataset=train,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   sort=False)
# Held-out batches: neither shuffled nor sorted.
test_loader = torchtext.data.BucketIterator(dataset=test,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               sort=False)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import warnings
from sklearn.metrics import accuracy_score
from torch.autograd import Variable

# Initialise CUDA eagerly, release cached blocks and report the device state.
# Every model and batch below is moved to the GPU, so a failure here is the
# earliest signal that no usable device is present.
torch.cuda.init()
torch.cuda.empty_cache()
print('CUDA MEM:',torch.cuda.memory_allocated())
print('cuda:', torch.cuda.is_available())
print('cuda index:',torch.cuda.current_device())


class SentimentLSTM(nn.Module):
    """Single-layer bidirectional LSTM classifier over frozen pretrained embeddings.

    The final forward and backward hidden states are concatenated, passed
    through a ReLU hidden layer with dropout, and projected to one logit per
    example (no sigmoid is applied here).
    """

    def __init__(self,vocab_vectors,padding_idx,batch_size):
        """Build the layers.

        Args:
            vocab_vectors: 2-D float tensor ``(vocab_size, embedding_dim)``
                holding the pretrained embedding matrix.
            padding_idx: vocabulary index of the padding token.
            batch_size: stored on the instance; ``forward`` does not use it
                and takes the batch size from its input instead.
        """
        super().__init__()
        print('Vocab vectors size:',vocab_vectors.shape)
        self.batch_size = batch_size
        self.hidden_dim = 128
        # Number of directions of the single LSTM layer (forward + backward);
        # it sizes the concatenated hidden state fed to linear1.
        self.n_layers = 2

        # freeze=True keeps the pretrained vectors fixed during training.
        self.embedding = nn.Embedding.from_pretrained(vocab_vectors, freeze=True)
        self.embedding.padding_idx = padding_idx

        self.lstm = nn.LSTM(input_size=vocab_vectors.shape[1], hidden_size=self.hidden_dim,
                            bidirectional=True, batch_first=True)
        self.linear1 = nn.Linear(self.n_layers*self.hidden_dim, self.hidden_dim)
        self.linear2 = nn.Linear(self.hidden_dim, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self,x):
        """Return logits of shape ``(batch, 1)`` for token indices ``x`` of shape ``(batch, seq_len)``."""
        e = self.embedding(x)
        # No explicit (h0, c0): nn.LSTM starts from zeros on the input's own
        # device, so the model is not tied to CUDA.
        _, (h_n, _) = self.lstm(e)
        # h_n[-2] / h_n[-1] are the last layer's forward / backward final states.
        out = torch.cat((h_n[-2], h_n[-1]), dim=1)
        out = self.linear1(out)
        return self.linear2(self.dropout(F.relu(out)))
    
class SentimentBase(nn.Module):
    """Baseline classifier: trainable embeddings, flattened, then a two-layer MLP.

    The embeddings are learned from scratch (no pretrained vectors). Because
    the embedded sequence is flattened before ``linear1``, inputs must have
    exactly ``seq_len`` tokens.
    """

    def __init__(self, vocab_size=75966, embedding_dim=300, seq_len=100, hidden_dim=128):
        """Build the layers.

        The defaults reproduce the previously hard-coded configuration, so
        ``SentimentBase()`` and checkpoints saved from it keep working.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each embedding vector.
            seq_len: fixed number of tokens per example.
            hidden_dim: width of the hidden linear layer.
        """
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim * seq_len, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, 1)

    def forward(self,x):
        """Return logits of shape ``(batch, 1)`` for token indices ``x`` of shape ``(batch, seq_len)``."""
        emb = self.embedding(x)
        # Flatten (batch, seq_len, embedding_dim) -> (batch, seq_len * embedding_dim).
        flat = emb.reshape(emb.shape[0], -1)
        out = self.linear1(flat)
        return self.linear2(F.relu(out))

    
class SentimentCNN(nn.Module):
    """Text-CNN classifier over frozen pretrained embeddings.

    Three parallel 1-D convolutions (kernel sizes 3, 4 and 5) are each
    max-pooled over the sequence; the pooled features are concatenated, passed
    through a ReLU hidden layer with dropout, and projected to one logit.
    """

    def __init__(self,vocab_vectors,padding_idx,batch_size):
        """Build the layers.

        Args:
            vocab_vectors: 2-D float tensor ``(vocab_size, embedding_dim)``
                holding the pretrained embedding matrix.
            padding_idx: vocabulary index of the padding token.
            batch_size: stored on the instance; ``forward`` does not use it.
        """
        super().__init__()
        print('Vocab vectors size:',vocab_vectors.shape)
        self.batch_size = batch_size
        self.hidden_dim = 128
        kernel_sizes = (3, 4, 5)

        # freeze=True keeps the pretrained vectors fixed during training.
        self.embedding = nn.Embedding.from_pretrained(vocab_vectors, freeze=True)
        self.embedding.padding_idx = padding_idx

        self.cnns = nn.ModuleList([
            nn.Conv1d(in_channels=vocab_vectors.shape[1], out_channels=self.hidden_dim, kernel_size=k)
            for k in kernel_sizes
        ])

        # One pooled feature vector of size hidden_dim per convolution.
        self.linear1 = nn.Linear(len(kernel_sizes)*self.hidden_dim, self.hidden_dim)
        self.linear2 = nn.Linear(self.hidden_dim, 1)
        self.dropout = nn.Dropout(0.2)

    @staticmethod
    def conv_and_max_pool(x, conv):
        """Apply ``conv`` to ``x`` (batch, channels, seq_len), max over the sequence axis, then ReLU."""
        return F.relu(conv(x).max(dim=2)[0])

    # https://github.com/gaussic/text-classification/blob/master/cnn_pytorch.py
    def forward(self,x):
        """Return logits of shape ``(batch, 1)`` for token indices ``x`` of shape ``(batch, seq_len)``.

        ``seq_len`` must be at least 5, the largest kernel size.
        """
        # Conv1d takes (batch, channels, seq_len); the embedding yields
        # (batch, seq_len, channels).
        e = self.embedding(x).permute(0, 2, 1)
        pooled = [self.conv_and_max_pool(e, conv) for conv in self.cnns]
        # The result stays on the input's device; no hard-coded .cuda().
        out = torch.cat(pooled, dim=1)
        out = self.linear1(out)
        return self.linear2(self.dropout(F.relu(out)))


class SentimentGRU(nn.Module):
    """Single-layer bidirectional GRU classifier over frozen pretrained embeddings.

    The final forward and backward hidden states are concatenated, passed
    through a ReLU hidden layer with dropout, and projected to one logit per
    example (no sigmoid is applied here).
    """

    def __init__(self,vocab_vectors,padding_idx,batch_size):
        """Build the layers.

        Args:
            vocab_vectors: 2-D float tensor ``(vocab_size, embedding_dim)``
                holding the pretrained embedding matrix.
            padding_idx: vocabulary index of the padding token.
            batch_size: stored on the instance; ``forward`` does not use it
                and takes the batch size from its input instead.
        """
        super().__init__()
        print('Vocab vectors size:',vocab_vectors.shape)
        self.batch_size = batch_size
        self.hidden_dim = 128
        # Number of directions of the single GRU layer (forward + backward);
        # it sizes the concatenated hidden state fed to linear1.
        self.n_layers = 2

        # freeze=True keeps the pretrained vectors fixed during training.
        self.embedding = nn.Embedding.from_pretrained(vocab_vectors, freeze=True)
        self.embedding.padding_idx = padding_idx

        self.gru = nn.GRU(input_size=vocab_vectors.shape[1], hidden_size=self.hidden_dim,
                          bidirectional=True, batch_first=True)
        self.linear1 = nn.Linear(self.n_layers*self.hidden_dim, self.hidden_dim)
        self.linear2 = nn.Linear(self.hidden_dim, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self,x):
        """Return logits of shape ``(batch, 1)`` for token indices ``x`` of shape ``(batch, seq_len)``."""
        e = self.embedding(x)
        # No explicit h0 (a GRU has no cell state): nn.GRU starts from zeros on
        # the input's own device, so the model is not tied to CUDA.
        _, h_n = self.gru(e)
        # h_n[-2] / h_n[-1] are the last layer's forward / backward final states.
        out = torch.cat((h_n[-2], h_n[-1]), dim=1)
        out = self.linear1(out)
        return self.linear2(self.dropout(F.relu(out)))
    
def train(model,filename,epochs=3,loader=None):
    """Train ``model`` with Adam on a BCE-with-logits loss and save its weights.

    NOTE: this function's name rebinds the module-level ``train`` dataset
    created by ``train_dataset.split(...)`` earlier in the file; iterators that
    need that dataset must be built before this definition is executed.

    Args:
        model: module mapping a batch of token indices to logits of shape
            ``(batch, 1)``. Batches are moved to the device of its first
            parameter.
        filename: path the final ``state_dict`` is written to with
            ``torch.save``.
        epochs: number of passes over ``loader``.
        loader: iterable of batches exposing ``.text`` and ``.target``
            tensors. Defaults to the module-level ``train_loader``.

    Returns:
        List with the mean training loss of each epoch (``nan`` for an epoch
        in which the loader produced no batches).
    """
    if loader is None:
        loader = train_loader

    # Follow the model's device rather than assuming CUDA.
    device = next(model.parameters()).device
    loss_function = nn.BCEWithLogitsLoss().to(device)
    optimizer = optim.Adam(model.parameters())

    history = []
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        n_batches = 0
        # Iterate lazily: no need to hold a whole epoch of batches in memory.
        for train_batch in loader:
            optimizer.zero_grad()

            y_pred = model(train_batch.text.to(device)).squeeze(1)
            y_true = train_batch.target.float().to(device)
            loss = loss_function(y_pred,y_true)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batches += 1

        # Guard the division so an empty loader reports nan instead of crashing.
        mean_loss = total_loss / n_batches if n_batches else float('nan')
        history.append(mean_loss)
        print('EPOCH: ',epoch,': ',mean_loss)
        print('-'*80)

    torch.save(model.state_dict(), filename )
    print('Training finished....')
    return history


In [None]:
# Build the BiLSTM classifier on the GPU, train it for 3 epochs and save its
# weights to 'lstm.pt' (the Ensemble class loads that file).
model = SentimentLSTM(text.vocab.vectors, padding_idx=text.vocab.stoi[text.pad_token], batch_size=batch_size).cuda()
print(model)
print('-'*80)
train(model,'lstm.pt',3)
print('-'*80)

In [None]:
# Build the embedding + MLP baseline on the GPU, train it for 5 epochs and
# save its weights to 'base.pt' (the Ensemble class loads that file).
model = SentimentBase().cuda()
print(model)
print('-'*80)
train(model,'base.pt',5)
print('-'*80)

In [None]:
# Build the text-CNN classifier on the GPU, train it for 3 epochs and save its
# weights to 'cnn.pt' (the Ensemble class loads that file).
model = SentimentCNN(text.vocab.vectors, padding_idx=text.vocab.stoi[text.pad_token], batch_size=batch_size).cuda()
print(model)
print('-'*80)
train(model,'cnn.pt',3)
print('-'*80)

In [None]:
# Build the BiGRU classifier on the GPU, train it for 3 epochs and save its
# weights to 'gru.pt' (the Ensemble class loads that file).
model = SentimentGRU(text.vocab.vectors, padding_idx=text.vocab.stoi[text.pad_token], batch_size=batch_size).cuda()
print(model)
print('-'*80)
train(model,'gru.pt',3)
print('-'*80)

In [None]:
def disable_grad(layer):
    """Freeze ``layer`` by switching off gradient tracking on each of its parameters."""
    for param in layer.parameters():
        param.requires_grad_(False)

        
class Ensemble(nn.Module):
    """Stacking model: four frozen, pretrained classifiers feed a small trainable MLP.

    The LSTM, GRU, CNN and baseline models are restored from 'lstm.pt',
    'gru.pt', 'cnn.pt' and 'base.pt' and frozen; only ``l_in`` and ``l_out``
    have trainable parameters.

    NOTE(review): ``nn.Module.train()`` also switches the frozen sub-models to
    training mode, so their dropout layers stay active while the meta-learner
    is trained - confirm that this is intended.
    """

    def __init__(self,vocab_vectors,padding_idx,batch_size):
        """Restore the four base models and create the meta-learner layers.

        Args:
            vocab_vectors: pretrained embedding matrix handed to the LSTM, GRU
                and CNN sub-models.
            padding_idx: vocabulary index of the padding token.
            batch_size: forwarded to the sub-model constructors.
        """
        super().__init__()
        # Use the constructor arguments rather than reaching for the global
        # `text` field. Sub-models are built and restored on the CPU; calling
        # .cuda()/.to() on the ensemble moves everything in one go.
        self.lstm = self._load_frozen(
            SentimentLSTM(vocab_vectors, padding_idx=padding_idx, batch_size=batch_size), 'lstm.pt')
        self.gru = self._load_frozen(
            SentimentGRU(vocab_vectors, padding_idx=padding_idx, batch_size=batch_size), 'gru.pt')
        self.cnn = self._load_frozen(
            SentimentCNN(vocab_vectors, padding_idx=padding_idx, batch_size=batch_size), 'cnn.pt')
        self.base = self._load_frozen(SentimentBase(), 'base.pt')

        # Meta-learner over the four base logits.
        self.l_in = nn.Linear(4,1024,bias=False)
        self.l_out = nn.Linear(1024,1,bias=False)

    @staticmethod
    def _load_frozen(module, path):
        """Load the weights stored at ``path`` into ``module``, freeze it and return it."""
        module.load_state_dict(torch.load(path, map_location='cpu'))
        disable_grad(module)
        return module

    def forward(self,x):
        """Return the stacked logit ``(batch, 1)`` for token indices ``x``."""
        base_logits = [self.lstm(x), self.gru(x), self.cnn(x), self.base(x)]
        out = torch.cat(base_logits, 1)
        return self.l_out(F.relu(self.l_in(out)))

# Build the stacking ensemble on the GPU, train it for 3 epochs and save its
# weights to 'ensemble.pt'.
model = Ensemble(text.vocab.vectors, padding_idx=text.vocab.stoi[text.pad_token], batch_size=batch_size).cuda()
print(model)
print('-'*80)
train(model,'ensemble.pt', 3)
print('-'*80)    

In [None]:
# List the working directory, which is where the training cells saved their
# '.pt' checkpoints.
print(os.listdir())

# Rebuild the ensemble and restore the weights saved in 'ensemble.pt'.
model = Ensemble(text.vocab.vectors, padding_idx=text.vocab.stoi[text.pad_token], batch_size=batch_size).cuda()
model.load_state_dict(torch.load('ensemble.pt'))


In [None]:
# Held-out F1 recorded for the meta-learner variants tried so far:
# test f1: 0.6742109752630083 - L(4,4), Softmax, L(4,1)
# test f1: 0.6729460720913231 - L(4,4), Softmax, matmul
# test f1: 0.669876044969732 - L(4,1) with bias
# test f1: 0.6795008114611898 - L(4,1) w/o bias
# test f1: 0.6735841400498521 - L(4,4),Soft,L(4,1) w/o bias
# test f1: 0.6797504254112309 - L(4,4),relu,L(4,1) w/o bias
# test f1: 0.6649719184156075 - relu,L(4,1) w/o bias

# Switch to inference mode once, rather than on every batch.
model.eval()
pred = []
targets = []
with torch.no_grad():
    # Iterate the loader directly; the batches are needed only once.
    for test_batch in test_loader:
        x = test_batch.text.cuda()
        # The model emits logits; sigmoid turns them into probabilities.
        pred.extend(torch.sigmoid(model(x).squeeze(1)).cpu().numpy().tolist())
        targets.extend(test_batch.target.cpu().numpy().tolist())

pred = np.array(pred)
targets = np.array(targets)
# NOTE: the decision threshold is tuned on the same held-out split it is
# reported on, so the printed F1 is an optimistic estimate.
search_result = threshold_search(targets, pred)
pred = (pred > search_result['threshold']).astype(int)
# sklearn metrics take (y_true, y_pred).
print('test acc:',accuracy_score(targets,pred))
print('test f1:',search_result['f1'])

print('RESULTS ON TEST SET:\n',classification_report(targets,pred))

In [None]:
print('Threshold:',search_result['threshold'])

# Materialise the batches once: they are walked twice below (predictions,
# then question ids) and both passes must see the same batches in the same
# order.
submission_list = list(torchtext.data.BucketIterator(dataset=submission_x,
                                    batch_size=batch_size,
                                    sort=False,
                                    train=False))

# Switch to inference mode once, rather than on every batch.
model.eval()
pred = []
with torch.no_grad():
    for submission_batch in submission_list:
        x = submission_batch.text.cuda()
        # The model emits logits; sigmoid turns them into probabilities.
        pred.extend(torch.sigmoid(model(x).squeeze(1)).cpu().numpy().tolist())

pred = np.array(pred)

df_subm = pd.DataFrame()
# Map the numericalised qid tokens back to their original strings.
df_subm['qid'] = [qid.vocab.itos[j] for i in submission_list for j in i.qid.view(-1).numpy()]
# Apply the threshold found on the held-out split.
df_subm['prediction'] = (pred > search_result['threshold']).astype(int)
print(df_subm.head())
df_subm.to_csv('submission.csv', index=False)