In [1]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [2]:
import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import time

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

### Torch Packages
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
device

'cuda'

In [4]:
df_train = pd.read_csv('data/train.csv')
df_train.dropna(inplace=True)
df_train, df_val = train_test_split(df_train, test_size=3000)
df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)

df_test = pd.read_csv('data/test.csv')
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,641555ac70,"Hmmm, maybe that`s what they meant. They elud...","Hmmm, maybe that`s what they meant. They elude...",neutral
1,d618862cbd,i want magic mountain tix but i dont get ur s...,o sux,negative
2,a8b73cf5d5,Screw you guys and your prom pictures,Screw you guys,negative
3,2ff62cd38d,Coffee in hand and sun shining in my window. ...,Hope everyone`s having a great Monday,positive
4,4381295572,Off to dinner with & his fam.,is fam.,positive


In [5]:
df_train['text'] = df_train['text'].apply(lambda x: x.lower())
df_train['selected_text'] = df_train['selected_text'].apply(lambda x: x.lower())

df_val['text'] = df_val['text'].apply(lambda x: x.lower())
df_test['text'] = df_test['text'].apply(lambda x: x.lower())

# Get All unique characters (Vocab)

In [6]:
vocab2idx = {'<PAD>':0, 'UNK':1, '<sos>':2, '<eos>':3}
chars = ['<PAD>', 'UNK', '<sos>', '<eos>']
for c in set(' '.join(df_train.text)):
    vocab2idx[c] = len(chars)
    chars.append(c)

In [7]:
len(chars)

79

# Encode

In [8]:
df_train['encoded_text'] = df_train.text.apply(lambda x: [vocab2idx['<sos>']]+[vocab2idx.get(c, vocab2idx['UNK']) for c in list(x)]+[vocab2idx['<eos>']])
df_val['encoded_text'] = df_val.text.apply(lambda x: [vocab2idx['<sos>']]+[vocab2idx.get(c, vocab2idx['UNK']) for c in list(x)]+[vocab2idx['<eos>']])
df_test['encoded_text'] = df_test.text.apply(lambda x: [vocab2idx['<sos>']]+[vocab2idx.get(c, vocab2idx['UNK']) for c in list(x)]+[vocab2idx['<eos>']])

In [9]:
df_train['encoded_selected'] = df_train.selected_text.apply(lambda x: [vocab2idx['<sos>']]+[vocab2idx.get(c, vocab2idx['UNK']) for c in list(x)]+[vocab2idx['<eos>']])
df_val['encoded_selected'] = df_val.selected_text.apply(lambda x: [vocab2idx['<sos>']]+[vocab2idx.get(c, vocab2idx['UNK']) for c in list(x)]+[vocab2idx['<eos>']])

In [10]:
sent_le = LabelEncoder().fit(df_train.sentiment)
df_train['encode_sent'] = sent_le.transform(df_train.sentiment)
df_val['encode_sent'] = sent_le.transform(df_val.sentiment)
df_test['encode_sent'] = sent_le.transform(df_test.sentiment)

In [11]:
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment,encoded_text,encoded_selected,encode_sent
0,641555ac70,"hmmm, maybe that`s what they meant. they elud...","hmmm, maybe that`s what they meant. they elude...",neutral,"[2, 74, 48, 12, 12, 12, 42, 74, 12, 54, 26, 68...","[2, 48, 12, 12, 12, 42, 74, 12, 54, 26, 68, 60...",1
1,d618862cbd,i want magic mountain tix but i dont get ur s...,o sux,negative,"[2, 74, 37, 74, 61, 54, 10, 75, 74, 12, 54, 19...","[2, 78, 74, 56, 38, 62, 3]",0
2,a8b73cf5d5,screw you guys and your prom pictures,screw you guys,negative,"[2, 56, 20, 13, 60, 61, 74, 26, 78, 38, 74, 19...","[2, 56, 20, 13, 60, 61, 74, 26, 78, 38, 74, 19...",0
3,2ff62cd38d,coffee in hand and sun shining in my window. ...,hope everyone`s having a great monday,positive,"[2, 20, 78, 63, 63, 60, 60, 74, 37, 10, 74, 48...","[2, 48, 78, 67, 60, 74, 60, 49, 60, 13, 26, 78...",2
4,4381295572,off to dinner with & his fam.,is fam.,positive,"[2, 78, 63, 63, 74, 75, 78, 74, 8, 37, 10, 10,...","[2, 37, 56, 74, 63, 54, 12, 15, 3]",2


# Dataset

In [12]:
class Tweets(Dataset):
    def __init__(self, df, tv=True):
        self.X = df.encoded_text.values
        self.sent = df.encode_sent.values
        self.y = df.encoded_selected.values if tv else np.zeros((len(self.X), 1))
    
    def __len__(self): return len(self.X)
    
    def __getitem__(self, idx):
        return torch.Tensor(self.X[idx]), torch.Tensor(self.y[idx]), torch.Tensor([self.sent[idx]])

In [13]:
train_ds = Tweets(df_train)
val_ds = Tweets(df_val)
test_ds = Tweets(df_test, tv=False)

In [14]:
vocab2idx[' ']

74

In [15]:
train_ds[0]

(tensor([ 2., 74., 48., 12., 12., 12., 42., 74., 12., 54., 26., 68., 60., 74.,
         75., 48., 54., 75., 69., 56., 74., 61., 48., 54., 75., 74., 75., 48.,
         60., 26., 74., 12., 60., 54., 10., 75., 15., 74., 75., 48., 60., 26.,
         74., 60., 45., 38.,  8., 60.,  8., 74., 75., 78., 74., 56., 78., 12.,
         60., 75., 48., 37., 10., 19., 74., 68., 13., 54., 10.,  8., 74., 10.,
         60., 61., 74., 68., 38., 75., 74., 26., 78., 38., 74., 18., 10., 78.,
         61., 74., 48., 78., 61., 74., 75., 48., 60., 74., 12., 60.,  8., 37.,
         54., 74., 37., 56.,  3.]),
 tensor([ 2., 48., 12., 12., 12., 42., 74., 12., 54., 26., 68., 60., 74., 75.,
         48., 54., 75., 69., 56., 74., 61., 48., 54., 75., 74., 75., 48., 60.,
         26., 74., 12., 60., 54., 10., 75., 15., 74., 75., 48., 60., 26., 74.,
         60., 45., 38.,  8., 60.,  8., 74., 75., 78., 74., 56., 78., 12., 60.,
         75., 48., 37., 10., 19., 74., 68., 13., 54., 10.,  8., 74., 10., 60.,
         61., 74

In [16]:
def collate(batch):
    (X, y, s) = zip(*batch)
    x_len = [len(x) for x in X]
    x_pad = pad_sequence(X, batch_first=True, padding_value=0)
    y_pad = pad_sequence(y, batch_first=True, padding_value=0)
    return x_pad, x_len, y_pad, torch.Tensor(s)

In [17]:
batch_size = 256
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate)
val_dl = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate)

# Model

In [18]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, sentiment):
        super().__init__()
        self.vocabs = nn.Embedding(vocab_size, emb_dim)
        self.sentiment = nn.Embedding(sentiment, hidden_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(.5)
        
    def forward(self, x, lengths, sentiment):
        x = self.dropout(self.vocabs(x.long()))
        x_pack = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        out, (hidden, cell) = self.lstm(x)
        return hidden[-1] + self.sentiment(sentiment.long()), cell

In [19]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.vocabs = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, hidden, cell):
        x = self.vocabs(x.long())
        output, (hidden, cell) = self.lstm(x, (hidden, cell))
        return self.linear(hidden[-1]), hidden, cell  #NOTE: hidden[-1] returns everything within that batch

In [20]:
def train_batch(encoder, decoder, enc_optimizer, dec_optimizer, x, y, s, lengths,
                teacher_forcing_ratio=0.5):
    encoder.train()
    decoder.train()
    enc_optimizer.zero_grad()
    dec_optimizer.zero_grad()
    loss = 0
    
    # **ENCODER**
    hidden, cell = encoder(x, lengths, s)  # passing both the sequence and the sentiment
    hidden = hidden.unsqueeze(0)
    
    # **DECODER**
    batch_target_length = y.size(1)  # NOTE: the length of the selected text
    decoder_input = x[:, 0].unsqueeze(1)
    
    for idx in range(1, batch_target_length):
        output, hidden, cell = decoder(decoder_input, hidden, cell)
        y_idx = y[:, idx]
        loss += F.cross_entropy(output, y_idx.long(), ignore_index=0)
        teacher_force = True if np.random.uniform() > teacher_forcing_ratio else False
        if teacher_force:
            decoder_input = y_idx.unsqueeze(1)
        else:
            decoder_input = output.argmax(dim=1).unsqueeze(1)
    
    # updating the gradient
    loss.backward()
    enc_optimizer.step()
    dec_optimizer.step()
    return loss.item()

In [21]:
def val_batch(encoder, decoder, x, y, s, lengths,
                teacher_forcing_ratio=0.5):
    loss = 0
    # **ENCODER**
    hidden, cell = encoder(x, lengths, s)  # passing both the sequence and the sentiment
    hidden = hidden.unsqueeze(0)
    
    # **DECODER**
    batch_target_length = y.size(1)  # NOTE: the length of the selected text
    decoder_input = x[:, 0].unsqueeze(1)
    
    for idx in range(1, batch_target_length):
        output, hidden, cell = decoder(decoder_input, hidden, cell)
        y_idx = y[:, idx]
        loss += F.cross_entropy(output, y_idx.long(), ignore_index=0)
        teacher_force = True if np.random.uniform() > teacher_forcing_ratio else False
        if teacher_force:
            decoder_input = y_idx.unsqueeze(1)
        else:
            decoder_input = output.argmax(dim=1).unsqueeze(1)
    
    # updating the gradient
    return loss.item()

In [22]:
def train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, val_dl, epochs=10):
    for epoch in tqdm(range(epochs)):
        start = time.time()
        total_loss, total = 0, 0
        encoder.train()
        decoder.train()
        for x, lengths, y, s in train_dl:
            loss = train_batch(encoder, decoder, enc_optimizer, dec_optimizer, x.to(device), y.to(device), s.to(device), lengths)
            total_loss += loss*x.size(0)
            total += x.size(0)
        
        encoder.eval()
        decoder.eval()
        total_v, total_loss_v = 0, 0
        for x, lengths, y, s in val_dl:
            loss = val_batch(encoder, decoder, x.to(device), y.to(device), s.to(device), lengths)
            total_loss_v += loss*x.size(0)
            total_v += x.size(0)
        
        print(f"Epoch: {epoch+1} Training Loss: {total_loss/total} Val Loss: {total_loss_v/total_v} Time: {time.time()-start:.3f}")
        

In [23]:
enc = Encoder(len(vocab2idx), 100, 50, 3)
dec = Decoder(len(vocab2idx), 100, 50)
enc_optimizer = optim.Adam(enc.parameters(), lr=3e-4)
dec_optimizer = optim.Adam(dec.parameters(), lr=3e-4)
enc.to(device)
dec.to(device)

Decoder(
  (vocabs): Embedding(79, 100)
  (lstm): LSTM(100, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=79, bias=True)
)

In [24]:
train_model(enc, dec, enc_optimizer, dec_optimizer, train_dl, val_dl)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 1 Training Loss: 523.6732610166463 Val Loss: 459.45677604166667 Time: 23.829
Epoch: 2 Training Loss: 436.15857591816024 Val Loss: 451.70537076822916 Time: 23.572
Epoch: 3 Training Loss: 430.2331030353222 Val Loss: 449.9190830891927 Time: 23.544
Epoch: 4 Training Loss: 425.4427426805683 Val Loss: 447.07378686523435 Time: 24.632
Epoch: 5 Training Loss: 422.061186268127 Val Loss: 444.1272044270833 Time: 25.432
Epoch: 6 Training Loss: 418.4986506044475 Val Loss: 441.6062579752604 Time: 23.183
Epoch: 7 Training Loss: 414.835333850337 Val Loss: 438.0980519205729 Time: 23.175
Epoch: 8 Training Loss: 412.45414177389705 Val Loss: 435.61573706054685 Time: 23.314
Epoch: 9 Training Loss: 409.17613553315203 Val Loss: 435.634916015625 Time: 23.093
Epoch: 10 Training Loss: 406.6850461473652 Val Loss: 432.9018920898437 Time: 23.097

