# Assignment 6

Develop an RNN model in PyTorch to solve the following problem:  
    
1. Detect sarcasm 
Data from https://www.kaggle.com/sherinclaudia/sarcastic-comments-on-reddit  
Your quality metric = accuracy  
Randomly select 20% of your data for the test set. You can use it only for the final performance estimation.   
 

Remember, you can use GPU resources in Kaggle kernels.

In [1]:
import pickle
import random
import warnings
from string import punctuation

import gensim
import nltk
import numpy as np
import pandas as pd
import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator, Dataset

# Silence library warnings so they do not clutter the notebook output.
warnings.simplefilter('ignore')

# Single seed shared by every random number generator used in the notebook.
SEED = 42
# Seeding NumPy alone is not enough for reproducible runs: PyTorch draws the
# initial model weights from its own generator, and torchtext's dataset split
# is documented to shuffle with Python's `random` state by default.
random.seed(SEED)
np.random.seed(SEED)
tt.manual_seed(SEED)

## Data

In [2]:
# Load the raw CSV with pandas (decoded as ISO-8859-1) and preview the first
# rows to inspect the available columns.
df = pd.read_csv('train-balanced-sarcasm.csv',  encoding='ISO-8859-1')
df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [3]:
def tokenizer(text):
    """Split *text* into tokens and drop the ones made only of punctuation.

    Tokenization is delegated to ``nltk.tokenize.wordpunct_tokenize``.

    The previous filter, ``tok not in punctuation``, was a substring test
    against the ``string.punctuation`` string, so punctuation runs such as
    ``"..."`` or ``"!!!"`` (which are not contiguous substrings of it) were
    kept. Stripping every ASCII punctuation character from both ends leaves
    an empty string exactly when the token has no other character, which
    filters single marks and runs alike.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of tokens that contain at least one character outside
        ``string.punctuation``.
    """
    return [
        tok
        for tok in nltk.tokenize.wordpunct_tokenize(text)
        if tok.strip(punctuation)
    ]

In [5]:
# Human-readable class names. The CSV `label` column stores the strings
# '0' / '1' (see the data preview above), so this name-keyed mapping cannot be
# used to preprocess raw labels; it is kept only for reference.
# NOTE(review): confirm which integer marks sarcastic comments in the dataset.
classes = {'sarcastic': 0, 'common': 1}

# Comment text: tokenized with `tokenizer`, lower-cased, English stop words
# removed, an <eos> token appended; batches are (tokens, lengths) pairs with
# the batch dimension first.
TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english'))

# Labels: the raw value read from the CSV is the string '0' or '1'. Looking it
# up in `classes` raised KeyError, so it is converted with `int` instead.
LABEL = LabelField(dtype=tt.int64, use_vocab=True, preprocessing=int)

In [None]:
# Build the torchtext dataset straight from the CSV file. The first two
# columns are bound to the LABEL and TEXT fields; the remaining eight columns
# are mapped to (None, None) so they are ignored. The header row is skipped.
tb_df = TabularDataset(
    'train-balanced-sarcasm.csv',
    format='csv',
    fields=[('label', LABEL), ('comment', TEXT)] + [(None, None)] * 8,
    skip_header=True,
)

In [None]:
# Build the text vocabulary from the dataset, keeping only tokens that occur
# at least 10 times, and attach the "glove.6B.100d" pretrained vectors.
TEXT.build_vocab(tb_df, min_freq=10, vectors="glove.6B.100d")
# Show the resulting vocabulary size.
len(TEXT.vocab.itos)

In [None]:
# Build the label vocabulary from the label values found in the dataset.
LABEL.build_vocab(tb_df)

In [None]:
# Hold out 20% of the data as the test set, as the assignment requires (the
# previous 0.7 ratio held out 30%). Both splits are stratified so the class
# balance is kept.
train, test = tb_df.split(0.8, stratified=True)
# Carve a validation set (30% of the remaining data) out of the training part.
train, valid = train.split(0.7, stratified=True)

## Training

In [None]:
class MyRNN(nn.Module):
    """Bidirectional single-layer LSTM classifier over token-id sequences.

    The final hidden and cell states of both directions are concatenated and
    fed to a linear layer that produces one logit per class.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Dimension of each token embedding.
        hidden_size: Hidden size of the LSTM (per direction).
        n_classes: Number of output logits. Defaults to 2 because sarcasm
            detection is a binary task (the layer used to be hard-coded to 3).
    """

    def __init__(self, vocab_size, embed_size, hidden_size, n_classes=2):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_size)

        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                           )

        # 2 directions x 2 state tensors (hidden and cell) are concatenated.
        self.fc = nn.Linear(hidden_size * 2 * 2, n_classes)

    def forward(self, batch):
        """Compute class logits for one batch.

        Args:
            batch: Object whose ``comment`` attribute is a pair
                ``(token_ids, lengths)``. ``token_ids`` is indexed
                batch-first; ``lengths`` is a tensor with one length per
                sequence, or ``None`` to run the LSTM over the full padded
                sequences.

        Returns:
            Tensor of shape ``(batch_size, n_classes)`` with unnormalized
            logits.
        """
        x, x_lengths = batch.comment

        x = self.embedding(x)

        if x_lengths is not None:
            x_lengths = x_lengths.view(-1).tolist()
            # enforce_sorted=False lets the model accept batches in any order
            # instead of failing when lengths are not sorted descending; the
            # LSTM restores the original batch order in its returned states.
            x = nn.utils.rnn.pack_padded_sequence(
                x, x_lengths, batch_first=True, enforce_sorted=False)

        _, (hidden, cell) = self.rnn(x)

        # States come out as (directions, batch, hidden); put batch first and
        # flatten the directions so each row is (2 * hidden_size,).
        batch_size = hidden.size(1)
        hidden = hidden.transpose(0, 1).contiguous().view(batch_size, -1)
        cell = cell.transpose(0, 1).contiguous().view(batch_size, -1)

        features = tt.cat([hidden, cell], dim=1)
        return self.fc(features)

In [None]:
class Trainer:
    """Training loop with validation and optional early stopping.

    The model is called as ``model(batch)`` and must return class logits;
    targets are read from ``batch.label``. Optimization uses Adam with a
    cosine-annealed learning rate (``T_max=5``) and cross-entropy loss.

    Args:
        model: The ``nn.Module`` to train.
        train_iterator: Sized iterable of training batches.
        test_iterator: Sized iterable of test batches (stored, not used by
            the training loop itself).
        valid_iterator: Sized iterable of validation batches.
    """

    def __init__(self, model, train_iterator, test_iterator, valid_iterator):
        self.model = model
        self.train_iterator = train_iterator
        self.test_iterator = test_iterator
        self.valid_iterator = valid_iterator

        self.optimizer = optim.Adam(self.model.parameters())
        # The scheduler must wrap the optimizer stored on the instance; the
        # bare name `optimizer` was undefined here.
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=5)
        self.criterion = nn.CrossEntropyLoss()

    def _train_epoch(self, iterator, curr_epoch):
        """Run one optimization pass over *iterator*.

        Args:
            iterator: Sized iterable of training batches.
            curr_epoch: Epoch number, used only in the progress-bar label.

        Returns:
            The mean training loss over the batches of this epoch (0.0 when
            the iterator is empty).
        """
        self.model.train()
        running_loss = 0.0

        # The progress bar is optional: tqdm is imported lazily so training
        # still works (without a bar) when the package is not installed.
        try:
            from tqdm.auto import tqdm
        except ImportError:
            progress = None
        else:
            progress = tqdm(iterator, total=len(iterator),
                            desc='epoch %d' % (curr_epoch), leave=True)
            iterator = progress

        for i, batch in enumerate(iterator):
            self.optimizer.zero_grad()

            pred = self.model(batch)
            loss = self.criterion(pred, batch.label)
            loss.backward()
            self.optimizer.step()

            curr_loss = loss.item()

            # Incremental mean of the batch losses seen so far.
            running_loss += (curr_loss - running_loss) / (i + 1)

            if progress is not None:
                progress.set_postfix(loss='%.5f' % running_loss)

        return running_loss

    def _test_epoch(self, iterator):
        """Compute the mean loss over *iterator* without updating the model.

        Args:
            iterator: Sized iterable of evaluation batches.

        Returns:
            The average of the per-batch losses.

        Raises:
            ValueError: If *iterator* contains no batches.
        """
        n_batches = len(iterator)
        if n_batches == 0:
            raise ValueError('cannot evaluate on an empty iterator')

        self.model.eval()
        epoch_loss = 0.0

        with tt.no_grad():
            for batch in iterator:
                pred = self.model(batch)
                loss = self.criterion(pred, batch.label)
                epoch_loss += loss.item()

        return epoch_loss / n_batches

    def nn_train(self, n_epochs, early_stopping=0):
        """Train for up to *n_epochs* epochs, validating after each one.

        Args:
            n_epochs: Maximum number of epochs to run.
            early_stopping: Number of consecutive epochs whose validation
                loss is worse than the best one seen so far that triggers an
                early stop. ``0`` (the default) disables early stopping.

        Returns:
            A ``pandas.DataFrame`` with one row per completed epoch and the
            columns ``epoch``, ``train_loss`` and ``valid_loss``.
        """
        best_loss = float('inf')
        es_epochs = 0
        records = []

        for epoch in range(n_epochs):
            train_loss = self._train_epoch(self.train_iterator, epoch)
            # Advance the cosine schedule once per epoch; it was created but
            # never stepped before, so the learning rate never changed.
            self.scheduler.step()
            valid_loss = self._test_epoch(self.valid_iterator)

            print('validation loss %.5f' % valid_loss)

            records.append({'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss})

            if early_stopping > 0:
                if valid_loss > best_loss:
                    es_epochs += 1
                else:
                    es_epochs = 0

                if es_epochs >= early_stopping:
                    # min() keeps the earliest epoch among ties.
                    best_epoch = min(records, key=lambda rec: rec['valid_loss'])
                    print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                    break

                best_loss = min(best_loss, valid_loss)

        # Build the frame once at the end; DataFrame.append was removed in
        # pandas 2.0 and copied the whole frame on every call.
        return pd.DataFrame(records, columns=['epoch', 'train_loss', 'valid_loss'])

In [None]:
# Release cached GPU memory before building the batch iterators.
tt.cuda.empty_cache()
batch_size = 32

# One bucketing iterator per split, all with the same batch size. Examples are
# ordered by comment length, batches are shuffled, and each batch is sorted
# internally by that same length key.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size,) * 3,
    sort_key=lambda example: len(example.comment),
    sort_within_batch=True,
    shuffle=True,
)

In [None]:
# embed_size must equal the dimension of the vectors attached to TEXT.vocab
# ("glove.6B.100d" -> 100).
model = MyRNN(len(TEXT.vocab.itos),
              embed_size=100,
              hidden_size=128,
              )
# Start from the pretrained GloVe vectors loaded by TEXT.build_vocab; they
# were previously loaded but never copied into the model.
model.embedding.weight.data.copy_(TEXT.vocab.vectors)

trainer = Trainer(model, train_iterator, test_iterator, valid_iterator)
# The training entry point is `nn_train`; Trainer defines no `train` method.
trainer.nn_train(10)