In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.tokenize import word_tokenize
import time
from sklearn.model_selection import train_test_split

### Torch Packages
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [4]:
df = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [5]:
# index 314 has no text
df.dropna(inplace=True)

In [6]:
df_train, df_val, _, _ = train_test_split(df, 
                                          df['selected_text'], 
                                          test_size=.05, 
                                          random_state=42)

In [7]:
# 26k samples in training to 1k samples in validation
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

### Preprocessing
* Lowercase - possible since the predicted sentiment text and the selected_text will be lowercase when computing metric.
* punctuation - keep the punctuation given that the submission file states that need to be quoted and complete. 
* Numericalize - Turn each token into its corresponding tokens.

In [8]:
def preprocessing(sentence):
    """
    This function will preprocess the input sentence sequence to avoid any further preprocessing
    downstream.
    """
    return sentence.lower().split()

In [9]:
# lowercasing all the text and turning them into a list of tokens for text and selected text in the training set
df_train['text'] = df_train['text'].progress_apply(preprocessing)
df_val['text'] = df_val['text'].progress_apply(preprocessing)

df_train['selected_text'] = df_train['selected_text'].progress_apply(preprocessing)
df_val['selected_text'] = df_val['selected_text'].progress_apply(preprocessing)

df_test['text'] = df_test['text'].progress_apply(preprocessing)

HBox(children=(FloatProgress(value=0.0, max=26106.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1374.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26106.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1374.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3534.0), HTML(value='')))




In [10]:
def unique_words(data):
    """
    Find the number of unique words in the training set.
    """
    words = set()
    for text in data:
        for word in text:
            words.add(word)
    return words

In [11]:
unq_words = unique_words(df_train.text)
len(unq_words)

43859

In [12]:
def create_vocabulary(words):
    vocab2idx, idx = {}, 4
    vocab2idx['<pad>'] = 0
    vocab2idx['<unk>'] = 1
    vocab2idx['<sos>'] = 2
    vocab2idx['<eos>'] = 3
    
    for word in words:
        vocab2idx[word] = idx
        idx += 1
    return vocab2idx

In [13]:
vocab2idx = create_vocabulary(unq_words)
len(vocab2idx)

43863

## Dataset

### Numericalize

In [14]:
def encoding_with_no_padding(sentence, vocab2idx):
    numericalize = [vocab2idx['<sos>']]
    for token in sentence:
        numericalize.append(vocab2idx.get(token, vocab2idx['<unk>']))
    numericalize.append(vocab2idx['<eos>'])
    return numericalize

In [15]:
class tweetDataset(Dataset):
    def __init__(self, data, vocab2idx):
        self.X = [encoding_with_no_padding(x, vocab2idx) for x in data['text']]
        self.y = [encoding_with_no_padding(y, vocab2idx) for y in data['selected_text']]
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return torch.LongTensor(self.X[idx]), torch.LongTensor(self.y[idx])

In [16]:
# sanity check
train_ds = tweetDataset(df_train, vocab2idx)
valid_ds = tweetDataset(df_val, vocab2idx)

In [17]:
def collate(batch):
    (X, y) = zip(*batch)
    x_len = [len(x) for x in X]
    x_pad = pad_sequence(X, batch_first=True, padding_value=0)
    y_pad = pad_sequence(y, batch_first=True, padding_value=0)
    return x_pad, x_len, y_pad

In [18]:
# sanity check
batch_size = 3
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate)
x, lengths, y = next(iter(train_dl))

### Seq2Seq

In [22]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers):
        super().__init__()
        self.vocabs = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(.5)
        
    def forward(self, x, lengths):
        x = self.dropout(self.vocabs(x))
        x_pack = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        out, (hidden, cell) = self.lstm(x)
        return out, torch.flatten(hidden.transpose(1,0), 1)

In [106]:
class AttentionDecoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.vocabs = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(2*hidden_dim, vocab_size)
        
    def forward(self, x, hidden, cell, enc_hidden):
        x = self.vocabs(x)
        output, (hidden, cell) = self.lstm(x, (hidden, cell))
        per_hidden = hidden.permute((1,2,0))
        scores = torch.bmm(enc_hidden, per_hidden)
        alpha = F.softmax(scores, dim=1).permute((0,2,1))  # attention weights
        attention = torch.bmm(alpha, enc_hidden).squeeze(1)  # attention vectors
        final = torch.cat((attention, per_hidden.squeeze()), dim=1)  # concatenate decoder's hidden state with attention
        return self.linear(final), hidden, cell  #NOTE: hidden[-1] not needed because dim = [batch, hidden]

### Encoder-AttentionDecoder Sanity Check


In [107]:
# checking to see if it works
encoder = Encoder(len(vocab2idx), 32, 64, 1)
decoder = AttentionDecoder(len(vocab2idx), 64, 128)

# quick test of the attention to make sure it runs
out, h = encoder(x, lengths)
h = h.unsqueeze(0)
cell = torch.zeros(h.size())
d_out, d_hid, d_cell = decoder(x, h, cell, out)
d_out.size(), d_hid.size(), d_cell.size()

(torch.Size([3, 43863]), torch.Size([1, 3, 128]), torch.Size([1, 3, 128]))

In [94]:
def train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, valid_dl, 
                e_path, d_path, tf_ratio, epochs=10, save_val=False):
    best_val = float('inf')
    for epoch in tqdm(range(epochs)):
        start = time.time()
        total_loss, total = 0, 0
        val_loss, total_v = 0, 0
        ### Training
        for x, lengths, y in train_dl:
            loss = train_batch(encoder, 
                               decoder, 
                               enc_optimizer, 
                               dec_optimizer, 
                               x.to(device), 
                               y.to(device),  
                               lengths, 
                               tf=True,
                               tf_ratio=tf_ratio)
            total_loss += loss*x.size(0)
            total += x.size(0)
        ### Validation
        for x, lengths, y in valid_dl:
            v_loss = train_batch(encoder, 
                                 decoder, 
                                 enc_optimizer, 
                                 dec_optimizer, 
                                 x.to(device), 
                                 y.to(device), 
                                 lengths, 
                                 tf=False, 
                                 train=False)
            val_loss += v_loss*x.size(0)
            total_v += x.size(0)
        if (epoch+1) % 10 == 0:
            print(f"Epoch: {epoch+1} Train Loss: {total_loss/total:.3f} Val Loss: {val_loss/total_v:.3f} Time: {time.time()-start:.3f}")
        
        if save_val:
            if best_val > (val_loss/total_v):
                save_model(encoder, decoder, e_path, d_path)
                best_val = val_loss/total_v
        else:
            if best_val > (total_loss/total):
                save_model(encoder, decoder, e_path, d_path)
                best_val = total_loss/total

In [101]:
def train_batch(encoder, decoder, enc_optimizer, dec_optimizer, x, y, lengths, tf,
                train=True, tf_ratio=0.5):
    if train:
        encoder.train()
        decoder.train()
    else:
        encoder.eval()
        decoder.eval()
    
    # zero grad for both optimizers
    enc_optimizer.zero_grad()
    dec_optimizer.zero_grad()
    loss = 0
    
    # **ENCODER**
    encoder_outputs, hidden = encoder(x, lengths)  # passing both the sequence and the sentiment
    hidden = hidden.unsqueeze(0)
    cell = torch.zeros(hidden.size()).cuda()
    
    # **DECODER**
    batch_target_length = y.size(1)  # NOTE: the length of the selected text
    decoder_input = x[:, 0].unsqueeze(1)
    
    for idx in range(1, batch_target_length):
        output, hidden, cell = decoder(decoder_input, hidden, cell, encoder_outputs)
        y_idx = y[:, idx]
        loss += F.cross_entropy(output, y_idx, ignore_index=0)
        # if teacher forcing
        if tf:
            teacher_force = True if np.random.uniform() > tf_ratio else False
            if teacher_force:
                decoder_input = y_idx.unsqueeze(1)
        else:
            decoder_input = output.argmax(dim=1).unsqueeze(1)
            
    # updating the gradient
    if train:
        loss.backward()
        enc_optimizer.step()
        dec_optimizer.step()
    return loss.item()

In [90]:
def save_model(encoder, decoder, e_path, d_path):
    torch.save(encoder.state_dict(), e_path)
    torch.save(decoder.state_dict(), d_path)
    
def load_model(encoder, decoder, e_path, d_path):
    encoder.load_state_dict(torch.load(e_path))
    decoder.load_state_dict(torch.load(d_path))

## Model Training

In [108]:
# first train of an attention model
encoder = Encoder(len(vocab2idx), 32, 64, 1).to(device)
decoder = AttentionDecoder(len(vocab2idx), 64, 128).to(device)
enc_optimizer = optim.Adam(encoder.parameters(), lr=.001)
dec_optimizer = optim.Adam(decoder.parameters(), lr=.001)

In [109]:
batch_size = 256
train_ds = tweetDataset(df_train, vocab2idx)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, collate_fn=collate)

In [110]:
model_path = 'models/attention/'
train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, valid_dl, 
            model_path + 'enc_lr_001_i32_h64.pth',
            model_path + 'att_dec_lr_001_i64_h128.pth',
            0,
            100)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Epoch: 10 Train Loss: 152.374 Val Loss: 206.871 Time: 24.248
Epoch: 20 Train Loss: 112.401 Val Loss: 226.791 Time: 24.706
Epoch: 30 Train Loss: 84.293 Val Loss: 233.447 Time: 24.691
Epoch: 40 Train Loss: 67.612 Val Loss: 238.680 Time: 24.771
Epoch: 50 Train Loss: 54.889 Val Loss: 249.034 Time: 24.705
Epoch: 60 Train Loss: 44.477 Val Loss: 248.820 Time: 24.142
Epoch: 70 Train Loss: 35.854 Val Loss: 254.746 Time: 24.827
Epoch: 80 Train Loss: 28.700 Val Loss: 259.474 Time: 24.597
Epoch: 90 Train Loss: 23.251 Val Loss: 260.658 Time: 24.629
Epoch: 100 Train Loss: 18.794 Val Loss: 263.742 Time: 24.885



## Translation Decoding

In [111]:
idx2vocab = {v:k for k, v in list(vocab2idx.items())}

In [112]:
def jaccard(string1, string2):
    a = set(string1.lower().split())
    b = set(string2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c) + 1e-8)

In [123]:
def decoding(encoder, decoder, x, y, lengths):
    decoded_words = []
    total_jaccard, total = 0, 0
    loss = 0
    # **ENCODER**
    encoder_outputs, hidden = encoder(x, lengths)  # passing both the sequence and the sentiment
    hidden = hidden.unsqueeze(0)
    cell = torch.zeros(hidden.size()).cuda()
    
    # **DECODER**
    batch_target_length = y.size(1)  # NOTE: the length of the selected text
    decoder_input = x[:, 0].unsqueeze(1)
    
    for idx in range(1, batch_target_length):
        output, hidden, cell = decoder(decoder_input, hidden, cell, encoder_outputs)
        pred = output.argmax(dim=1)
        decoded_words.append(pred)
        decoder_input = pred.unsqueeze(1)
    
    decoded_words = torch.cat([tensor.unsqueeze(0) for tensor in decoded_words]).transpose(1,0)
    
    for i in range(decoded_words.size(0)):
        xi = decoded_words[i].cpu().numpy()
        yi = y[i].cpu().numpy()
        x_sent = ' '.join([idx2vocab[idx] for idx in xi if idx > 3])
        y_sent = ' '.join([idx2vocab[idx] for idx in yi if idx > 3])
        total_jaccard += jaccard(x_sent, y_sent)
        total += y.size(0)
    return total_jaccard

In [117]:
def jac_scoring(encoder, decoder, data_loader):
    encoder.eval()
    decoder.eval()
    jac_score, total = 0, 0
    
    for x, lengths, y in tqdm(data_loader):
        jac_score += decoding(encoder, decoder, x.to(device), y.to(device), lengths)
        total += x.size(0)
    print(f"Jaccard Similarity: {jac_score/total:.3f}")

In [124]:
jac_scoring(encoder, decoder, train_dl)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=102.0), HTML(value='')))


Jaccard Similarity: 0.437


### Translation

In [125]:
translation_train_dl = DataLoader(train_ds, batch_size=3, shuffle=True, collate_fn=collate)
x, lengths, y = next(iter(translation_train_dl))

In [126]:
def translate(encoder, decoder, x, y, lengths):
    decoded_words = []
    total_jaccard, total = 0, 0
    
    encoder_outputs, hidden = encoder(x, lengths)  # passing both the sequence and the sentiment
    hidden = hidden.unsqueeze(0)
    cell = torch.zeros(hidden.size()).cuda()
    
    # **DECODER**
    batch_target_length = y.size(1)  # NOTE: the length of the selected text
    decoder_input = x[:, 0].unsqueeze(1)
    
    for idx in range(1, batch_target_length):
        output, hidden, cell = decoder(decoder_input, hidden, cell, encoder_outputs)
        pred = output.argmax(dim=1)
        decoded_words.append(pred)
        decoder_input = pred.unsqueeze(1)
    
    decoded_words = torch.cat([tensor.unsqueeze(0) for tensor in decoded_words]).transpose(1,0)
    
    for i in range(decoded_words.size(0)):
        xi = decoded_words[i].cpu().numpy()
        yi = y[i].cpu().numpy()
        x_sent = ' '.join([idx2vocab[idx] for idx in xi if idx > 3])
        y_sent = ' '.join([idx2vocab[idx] for idx in yi if idx > 3])
        print(f"Ground Truth")
        print()
        print(y_sent)
        print()
        print(f"Decoded Sentence")
        print(x_sent)
        print()

In [127]:
# this came from the bidirectional model at 100 epochs at 23.18 train loss
translate(encoder, decoder, x.to(device), y.to(device), lengths)

Ground Truth

greeeeeeeeeeeat

Decoded Sentence
i have an hour this weekend i

Ground Truth

i`ll give u an icecream ?

Decoded Sentence
i`ll u know why

Ground Truth

stressed

Decoded Sentence
haha im work and having a pizza



In [128]:
x, lengths, y = next(iter(translation_train_dl))
translate(encoder, decoder, x.to(device), y.to(device), lengths)

Ground Truth

stupid

Decoded Sentence
stupid

Ground Truth

dead.

Decoded Sentence
omg 44 in

Ground Truth

feel sad

Decoded Sentence
yeah it is



In [129]:
x, lengths, y = next(iter(translation_train_dl))
translate(encoder, decoder, x.to(device), y.to(device), lengths)

Ground Truth

i love you

Decoded Sentence
i love you

Ground Truth

summer is finally here! graduation in a week!

Decoded Sentence
awesome in a house

Ground Truth

it`s ace!

Decoded Sentence
? it`s new songs out and we got new



### Further Training

In [130]:
model_path = 'models/attention/'
train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, valid_dl, 
            model_path + 'ct_enc_lr_001_i32_h64.pth',
            model_path + 'ct_att_dec_lr_001_i64_h128.pth',
            0,
            100)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Epoch: 10 Train Loss: 15.398 Val Loss: 267.406 Time: 24.809
Epoch: 20 Train Loss: 12.965 Val Loss: 274.353 Time: 24.782
Epoch: 30 Train Loss: 11.366 Val Loss: 285.141 Time: 24.694
Epoch: 40 Train Loss: 9.591 Val Loss: 290.908 Time: 24.691
Epoch: 50 Train Loss: 8.610 Val Loss: 289.903 Time: 24.668
Epoch: 60 Train Loss: 7.708 Val Loss: 296.292 Time: 24.710
Epoch: 70 Train Loss: 8.111 Val Loss: 296.320 Time: 24.075
Epoch: 80 Train Loss: 6.161 Val Loss: 310.696 Time: 24.694
Epoch: 90 Train Loss: 5.683 Val Loss: 305.082 Time: 24.064
Epoch: 100 Train Loss: 5.393 Val Loss: 313.788 Time: 24.527



In [131]:
jac_scoring(encoder, decoder, train_dl)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=102.0), HTML(value='')))


Jaccard Similarity: 0.613


In [132]:
x, lengths, y = next(iter(translation_train_dl))
translate(encoder, decoder, x.to(device), y.to(device), lengths)

Ground Truth

yea dude... 5`8'! really!?

Decoded Sentence
yea dude... 5`8'! really!?

Ground Truth

excited.

Decoded Sentence
excited. got our politicians night. i`m so i`m so i`m

Ground Truth

nooo! i`m ill must have paracetamol! nice weather today, though.

Decoded Sentence
nooo! i`m ill must have paracetamol! nice weather today, though.



In [133]:
x, lengths, y = next(iter(translation_train_dl))
translate(encoder, decoder, x.to(device), y.to(device), lengths)

Ground Truth

favourites.

Decoded Sentence
favourites. and the first was off after there was 2!

Ground Truth

beer is an excellent excuse. earlier i was sweating god knows how much! not looking forward to working tomorrow

Decoded Sentence
beer is an excellent excuse. earlier i was sweating god knows how much! not looking forward to working tomorrow

Ground Truth

super stressful day & can`t sleep...glad i found you your so sweet

Decoded Sentence
super stressful day & can`t sleep...glad i found you your so sweet

