# Assignment 6

Develop RNN model in pytorch to solve the following problem:  
    
1. Detect sarcasm 
Data from https://www.kaggle.com/sherinclaudia/sarcastic-comments-on-reddit  
Your quality metric = accuracy  
Randomly select 20% of your data for test set. You can use it only for final perfomance estimation.   
 

Remember, you can use GPU resourses in kaggle kernels.

In [9]:
import pandas as pd
import numpy as np
import nltk
import random
import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from string import punctuation
from tqdm import tqdm_notebook
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator, Dataset
from sklearn.metrics import accuracy_score

import warnings
warnings.simplefilter('ignore')

SEED = 42
np.random.seed(SEED)

## Data

In [2]:
df = pd.read_csv('../input/train-balanced-sarcasm.csv',  encoding='ISO-8859-1')
df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [3]:
def tokenizer(text):
    return [tok for tok in nltk.tokenize.wordpunct_tokenize(text) if tok not in punctuation]

In [4]:
classes = {'0': 0, '1': 1}

TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english'))

LABEL = LabelField(dtype=tt.int64, use_vocab=True, preprocessing=lambda x: classes[x])

In [5]:
tb_df = TabularDataset('../input/train-balanced-sarcasm.csv',
                       format='csv',
                       fields=[('label', LABEL), ('comment', TEXT), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None)],
                       skip_header=True)

Используем векторы GloVe размерностью 100.

In [6]:
TEXT.build_vocab(tb_df, 'glove.6B.100d', min_freq=10)
len(TEXT.vocab.itos)

28425

In [7]:
LABEL.build_vocab(tb_df)

In [16]:
train, test = tb_df.split(0.8, stratified=True, random_state=random.getstate())
train, valid = train.split(0.8, stratified=True, random_state=random.getstate())

## Training

Нейросеть очень быстро переобучается - буквально после пары итераций. Поэтому, чтобы переобучение происходило все-таки немного позже, добавим к базовой архитектуре дропаут со значением 0.1.

In [24]:
class MyRNN(nn.Module):   
    def __init__(self, device, vocab_size, embed_size, hidden_size):
        super(MyRNN, self).__init__()
        
        self.device = device
        self.embedding = nn.Embedding(vocab_size, embed_size)

        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True)
        
        self.fc = nn.Linear(hidden_size * 2 * 2, 2)
        self.dropout = tt.nn.Dropout(0.1)
        self.init_weights()
        
    def init_weights(self):
        nn.init.uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)
        
    def forward(self, batch):
        x, x_lengths = batch.comment
        x = x.to(self.device)
        x = self.embedding(x)
        
        batch.label = batch.label.to(self.device)

        if x_lengths is not None:
            x_lengths = x_lengths.view(-1).tolist()
            x = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True)
            
        _, (hidden, cell) = self.rnn(x)
        
        hidden = hidden.transpose(0,1)
        cell = cell.transpose(0,1)
        hidden = hidden.contiguous().view(hidden.size(0),-1)
        cell = cell.contiguous().view(cell.size(0),-1)
        x = tt.cat([hidden, cell], dim=1).squeeze(1)
        
        x = self.dropout(x)
        x = self.fc(x)
        
        return x

Для этой же цели прикрутим L2-penalty - укажем в оптимизаторе параметр weight_decay = 1e-05.

In [30]:
class Trainer:
    def __init__(self, model, train_iterator, test_iterator, valid_iterator):
        self.model = model
        self.train_iterator = train_iterator
        self.test_iterator = test_iterator
        self.valid_iterator = valid_iterator

        self.optimizer = optim.Adam(self.model.parameters(), weight_decay=1e-05)
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=5)
        self.criterion = nn.CrossEntropyLoss()
        
    def _train_epoch(self, iterator, curr_epoch):
        self.model.train()
        running_loss = 0

        n_batches = len(iterator)
        iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

        for i, batch in enumerate(iterator):
            self.optimizer.zero_grad()

            pred = self.model(batch)
            loss = self.criterion(pred, batch.label)
            loss.backward()
            self.optimizer.step()

            curr_loss = loss.data.cpu().detach().item()

            loss_smoothing = i / (i+1)
            running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

            iterator.set_postfix(loss='%.5f' % running_loss)

        return running_loss
    
    def _test_epoch(self, iterator):
        self.model.eval()
        epoch_loss = 0

        n_batches = len(iterator)
        scores = []
        
        with tt.no_grad():
            for batch in iterator:
                pred = model(batch)
                loss = self.criterion(pred, batch.label)
                epoch_loss += loss.data.item()
                
                pred = tt.softmax(pred, dim=1)
                pred = tt.argmax(pred, dim=1)
                pred = list(pred.cuda().cpu().numpy())
                true = list(batch.label.cuda().cpu().numpy())
                scores.append(accuracy_score(true, pred))

        return epoch_loss / n_batches, np.mean(scores)

    def nn_train(self, n_epochs):
        early_stopping = 5
        prev_loss = 100500
        es_epochs = 0
        best_epoch = None
        history = pd.DataFrame()
        best_accuracy = 0.0

        for epoch in range(n_epochs):
            train_loss = self._train_epoch(self.train_iterator, epoch)
            valid_loss, current_accuracy = self._test_epoch(self.valid_iterator)
            
            if current_accuracy > best_accuracy:
                tt.save(model.state_dict(), 'best_model')
                best_accuracy = current_accuracy

            valid_loss = valid_loss
            print('validation loss %.5f' % valid_loss)
            print('accuracy score %.5f' % current_accuracy)

            record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
            history = history.append(record, ignore_index=True)

            if early_stopping > 0:
                if valid_loss > prev_loss:
                    es_epochs += 1
                else:
                    es_epochs = 0

                if es_epochs >= early_stopping:
                    best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                    print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                    break

                prev_loss = min(prev_loss, valid_loss)

In [20]:
tt.cuda.empty_cache()
batch_size = 32

train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train, valid, test),
                                                                      batch_sizes=(batch_size, batch_size, batch_size),
                                                                      shuffle=True,
                                                                      sort_key=lambda x: len(x.comment),
                                                                      sort_within_batch=True)

Разгоняемся...

In [21]:
tt.backends.cudnn.benchmark = True

device = tt.device("cuda:0" if tt.cuda.is_available() else "cpu")
print(device)

cuda:0


In [31]:
model = MyRNN(device,
              len(TEXT.vocab.itos),
              embed_size=100,
              hidden_size=128)
model.to(device)

trainer = Trainer(model, train_iterator, test_iterator, valid_iterator)
trainer.nn_train(50)

HBox(children=(IntProgress(value=0, description='epoch 0', max=20217, style=ProgressStyle(description_width='i…

validation loss 0.57535
accuracy score 0.69186


HBox(children=(IntProgress(value=0, description='epoch 1', max=20217, style=ProgressStyle(description_width='i…

validation loss 0.57158
accuracy score 0.69606


HBox(children=(IntProgress(value=0, description='epoch 2', max=20217, style=ProgressStyle(description_width='i…

validation loss 0.57048
accuracy score 0.69629


HBox(children=(IntProgress(value=0, description='epoch 3', max=20217, style=ProgressStyle(description_width='i…

validation loss 0.56810
accuracy score 0.69896


HBox(children=(IntProgress(value=0, description='epoch 4', max=20217, style=ProgressStyle(description_width='i…

validation loss 0.56874
accuracy score 0.69938


HBox(children=(IntProgress(value=0, description='epoch 5', max=20217, style=ProgressStyle(description_width='i…

validation loss 0.56740
accuracy score 0.69840


HBox(children=(IntProgress(value=0, description='epoch 6', max=20217, style=ProgressStyle(description_width='i…

validation loss 0.56699
accuracy score 0.69936


HBox(children=(IntProgress(value=0, description='epoch 7', max=20217, style=ProgressStyle(description_width='i…

validation loss 0.56920
accuracy score 0.69792


HBox(children=(IntProgress(value=0, description='epoch 8', max=20217, style=ProgressStyle(description_width='i…

validation loss 0.56840
accuracy score 0.69918


HBox(children=(IntProgress(value=0, description='epoch 9', max=20217, style=ProgressStyle(description_width='i…

validation loss 0.56977
accuracy score 0.69760


HBox(children=(IntProgress(value=0, description='epoch 10', max=20217, style=ProgressStyle(description_width='…

validation loss 0.57355
accuracy score 0.69643


HBox(children=(IntProgress(value=0, description='epoch 11', max=20217, style=ProgressStyle(description_width='…

validation loss 0.57721
accuracy score 0.69662
Early stopping! best epoch: 6 val 0.56699


## Evaluation

In [32]:
scores = []
model.load_state_dict(tt.load('best_model'))

for batch in test_iterator:
    pred = model(batch)
    pred = tt.softmax(pred, dim=1)
    pred = tt.argmax(pred, dim=1)
    pred = list(pred.cuda().cpu().numpy())
    true = list(batch.label.cuda().cpu().numpy())
    scores.append(accuracy_score(true, pred))

In [33]:
print(np.mean(scores))

0.701968941552275


Кажется, бэйзлайн побит.