In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.tokenize import word_tokenize
import time
from sklearn.model_selection import train_test_split

### Torch Packages
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [4]:
df = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [5]:
# index 314 has no text
df.dropna(inplace=True)

In [6]:
df_train, df_val, _, _ = train_test_split(df, 
                                          df['selected_text'], 
                                          test_size=.05, 
                                          random_state=42)

In [7]:
# 26k samples in training to 1k samples in validation
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

### Preprocessing
* Lowercase - possible since the predicted sentiment text and the selected_text will be lowercase when computing metric.
* punctuation - keep the punctuation given that the submission file states that need to be quoted and complete. 
* Numericalize - Turn each token into its corresponding tokens.

In [8]:
def preprocessing(sentence):
    """
    This function will preprocess the input sentence sequence to avoid any further preprocessing
    downstream.
    """
    return sentence.lower().split()

In [9]:
# lowercasing all the text and turning them into a list of tokens for text and selected text in the training set
df_train['text'] = df_train['text'].progress_apply(preprocessing)
df_val['text'] = df_val['text'].progress_apply(preprocessing)

df_train['selected_text'] = df_train['selected_text'].progress_apply(preprocessing)
df_val['selected_text'] = df_val['selected_text'].progress_apply(preprocessing)

df_test['text'] = df_test['text'].progress_apply(preprocessing)

HBox(children=(FloatProgress(value=0.0, max=26106.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1374.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26106.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1374.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3534.0), HTML(value='')))




In [10]:
# making the sentiment to a variable
df_train['sentiment'] = df_train['sentiment'].astype('category')
df_train['code'] = df_train['sentiment'].cat.codes

df_val['sentiment'] = df_val['sentiment'].astype('category')
df_val['code'] = df_val['sentiment'].cat.codes

df_test['sentiment'] = df_test['sentiment'].astype('category')
df_test['code'] = df_test['sentiment'].cat.codes

In [11]:
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment,code
0,9937fa651b,"[flew, home, from, london, to, ni, to, catch, ...","[flew, home, from, london, to, ni, to, catch, ...",neutral,1
1,ae21c1ac38,"[the, exception, for, a, short, dude:, larenz,...","[the, exception, for, a, short, dude:, larenz,...",neutral,1
2,972c0874b2,"[oh, no, matey,, did, you, get, ill?, it, woul...","[it, mea]",negative,0
3,babe8ab5bf,"[i, am, so, sad..., how, come, pooch, hall, fr...","[i, am, so, sad...]",negative,0
4,dc24138465,"[http://twitpic.com/67iab, -, rounding, bases,...","[rounding, bases, -, she, was, fast, during, r...",neutral,1


In [12]:
def unique_words(data):
    """
    Find the number of unique words in the training set.
    """
    words = set()
    for text in data:
        for word in text:
            words.add(word)
    return words

In [13]:
unq_words = unique_words(df_train.text)
len(unq_words)

43859

In [14]:
def create_vocabulary(words):
    vocab2idx, idx = {}, 4
    vocab2idx['<pad>'] = 0
    vocab2idx['<unk>'] = 1
    vocab2idx['<sos>'] = 2
    vocab2idx['<eos>'] = 3
    
    for word in words:
        vocab2idx[word] = idx
        idx += 1
    return vocab2idx

In [15]:
vocab2idx = create_vocabulary(unq_words)
len(vocab2idx)

43863

## Dataset

### Numericalize

In [16]:
def encoding_with_no_padding(sentence, vocab2idx):
    numericalize = [vocab2idx['<sos>']]
    for token in sentence:
        numericalize.append(vocab2idx.get(token, vocab2idx['<unk>']))
    numericalize.append(vocab2idx['<eos>'])
    return numericalize

In [17]:
class tweetDataset(Dataset):
    def __init__(self, data, vocab2idx):
        self.X = [encoding_with_no_padding(x, vocab2idx) for x in data['text']]
        self.y = [encoding_with_no_padding(y, vocab2idx) for y in data['selected_text']]
        self.sentiment = data.code.values
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return torch.LongTensor(self.X[idx]), torch.LongTensor(self.y[idx]), self.sentiment[idx]

In [18]:
# sanity check
train_ds = tweetDataset(df_train, vocab2idx)
valid_ds = tweetDataset(df_val, vocab2idx)

In [19]:
def collate(batch):
    (X, y, s) = zip(*batch)
    x_len = [len(x) for x in X]
    x_pad = pad_sequence(X, batch_first=True, padding_value=0)
    y_pad = pad_sequence(y, batch_first=True, padding_value=0)
    return x_pad, x_len, y_pad, torch.LongTensor(s)

In [104]:
# sanity check
batch_size = 3
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate)
x, lengths, y, s = next(iter(train_dl))

### Seq2Seq

In [69]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, sentiment):
        super().__init__()
        self.vocabs = nn.Embedding(vocab_size, emb_dim)
        self.sentiment = nn.Embedding(sentiment, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(.5)
        
    def forward(self, x, lengths, sentiment):
        x = self.dropout(self.vocabs(x))
        x_pack = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        out, (hidden, cell) = self.lstm(x)
        return torch.cat((torch.flatten(hidden.transpose(1,0), 1), self.sentiment(sentiment)), dim=1)

In [107]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.vocabs = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, hidden, cell):
        x = self.vocabs(x)
        output, (hidden, cell) = self.lstm(x, (hidden, cell))
        return self.linear(hidden[-1]), hidden, cell  #NOTE: hidden[-1] returns everything within that batch

In [60]:
h.size(1)

288

In [101]:
encoder = Encoder(len(vocab2idx), 32, 64, 2, 3)
decoder = Decoder(len(vocab2idx), 32, 288)  #NOTE: 288 is the dim after flatten and concatenating the sentiment

In [89]:
emb = nn.Embedding(len(vocab2idx), 32)

In [92]:
e = emb(x)
e.size()

torch.Size([3, 14, 32])

In [113]:
h.size()

torch.Size([1, 3, 288])

In [114]:
cell = torch.zeros(h.size())

In [106]:
#h = encoder(x, lengths, s)
#h = h.unsqueeze(0)
o, hid, s = decoder(x, (h, cell))

### I have added the bidirectional and 2 layers for the encoder. I have also changed the dimensions in the training batch. See how it will work later.

In [86]:
def train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, valid_dl, 
                e_path, d_path, tf_ratio, epochs=10, save_val=False):
    for epoch in tqdm(range(epochs)):
        start = time.time()
        total_loss, total = 0, 0
        val_loss, total_v = 0, 0
        best_val = float('inf')
        ### Training
        for x, lengths, y, s in train_dl:
            loss = train_batch(encoder, 
                               decoder, 
                               enc_optimizer, 
                               dec_optimizer, 
                               x.to(device), 
                               y.to(device), 
                               s.to(device), 
                               lengths, 
                               tf=True,
                               tf_ratio=tf_ratio)
            total_loss += loss*x.size(0)
            total += x.size(0)
        ### Validation
        for x, lengths, y, s in valid_dl:
            v_loss = train_batch(encoder, 
                                 decoder, 
                                 enc_optimizer, 
                                 dec_optimizer, 
                                 x.to(device), 
                                 y.to(device), 
                                 s.to(device), 
                                 lengths, 
                                 tf=False, 
                                 train=False)
            val_loss += v_loss*x.size(0)
            total_v += x.size(0)
        if (epoch+1) % 10 == 0:
            print(f"Epoch: {epoch+1} Train Loss: {total_loss/total:.3f} Val Loss: {val_loss/total_v:.3f} Time: {time.time()-start:.3f}")
        
        if save_val:
            if best_val > (val_loss/total_v):
                save_model(encoder, decoder, e_path, d_path)
                best_val = val_loss/total_v
        else:
            if best_val > (total_loss/total):
                save_model(encoder, decoder, e_path, d_path)
                best_val = total_loss/total

In [87]:
def train_batch(encoder, decoder, enc_optimizer, dec_optimizer, x, y, s, lengths, tf,
                train=True, tf_ratio=0.5):
    if train:
        encoder.train()
        decoder.train()
    else:
        encoder.eval()
        decoder.eval()
    
    # zero grad for both optimizers
    enc_optimizer.zero_grad()
    dec_optimizer.zero_grad()
    loss = 0
    
    # **ENCODER**
    hidden = encoder(x, lengths, s)  # passing both the sequence and the sentiment
    hidden = hidden.unsqueeze(0)
    cell = torch.zeros(hidden.size())
    
    # **DECODER**
    batch_target_length = y.size(1)  # NOTE: the length of the selected text
    decoder_input = x[:, 0].unsqueeze(1)
    
    for idx in range(1, batch_target_length):
        output, hidden, cell = decoder(decoder_input, hidden, cell)
        y_idx = y[:, idx]
        loss += F.cross_entropy(output, y_idx, ignore_index=0)
        # if teacher forcing
        if tf:
            teacher_force = True if np.random.uniform() > tf_ratio else False
            if teacher_force:
                decoder_input = y_idx.unsqueeze(1)
        else:
            decoder_input = output.argmax(dim=1).unsqueeze(1)
            
    # updating the gradient
    if train:
        loss.backward()
        enc_optimizer.step()
        dec_optimizer.step()
    return loss.item()

In [28]:
def save_model(encoder, decoder, e_path, d_path):
    torch.save(encoder.state_dict(), e_path)
    torch.save(decoder.state_dict(), d_path)
    
def load_model(encoder, decoder, e_path, d_path):
    encoder.load_state_dict(torch.load(e_path))
    decoder.load_state_dict(torch.load(d_path))

## Model Training

In [25]:
encoder = Encoder(len(vocab2idx), 32, 64, len(df_train.code.unique())).to(device)
decoder = Decoder(len(vocab2idx), 64, 64).to(device)
enc_optimizer = optim.Adam(encoder.parameters(), lr=.001)
dec_optimizer = optim.Adam(decoder.parameters(), lr=.001)

In [90]:
batch_size = 256
train_ds = tweetDataset(df_train, vocab2idx)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, collate_fn=collate)

In [91]:
model_path = 'models/first_mod/'
train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, valid_dl, 
            model_path + 'enc_lr_001_i32_h64_tf0.pth',
            model_path + 'dec_lr_001_i64_h64_tf0.pth',
            0,
            100)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Epoch: 10 Train Loss: 122.409 Val Loss: 209.773 Time: 15.686
Epoch: 20 Train Loss: 108.441 Val Loss: 220.423 Time: 15.675
Epoch: 30 Train Loss: 96.750 Val Loss: 228.607 Time: 15.626
Epoch: 40 Train Loss: 88.033 Val Loss: 236.162 Time: 15.745
Epoch: 50 Train Loss: 80.997 Val Loss: 243.644 Time: 15.641
Epoch: 60 Train Loss: 75.322 Val Loss: 255.239 Time: 15.671
Epoch: 70 Train Loss: 70.645 Val Loss: 259.033 Time: 15.653
Epoch: 80 Train Loss: 66.571 Val Loss: 264.205 Time: 15.647
Epoch: 90 Train Loss: 63.090 Val Loss: 273.947 Time: 15.420
Epoch: 100 Train Loss: 60.302 Val Loss: 280.183 Time: 14.971



In [88]:
# purposedly saving overfit model to test for sentence quality
encoder = Encoder(len(vocab2idx), 32, 64, len(df_train.code.unique())).to(device)
decoder = Decoder(len(vocab2idx), 64, 64).to(device)
enc_optimizer = optim.Adam(encoder.parameters(), lr=.001)
dec_optimizer = optim.Adam(decoder.parameters(), lr=.001)

In [None]:
model_path = 'models/first_mod/'
train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, valid_dl, 
            model_path + 'enc_lr_001_i32_h64_tf0_tr.pth',
            model_path + 'dec_lr_001_i64_h64_tf0_tr.pth',
            0,
            200)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

Epoch: 10 Train Loss: 175.840 Val Loss: 185.926 Time: 15.709
Epoch: 20 Train Loss: 154.785 Val Loss: 198.189 Time: 15.815
Epoch: 30 Train Loss: 138.802 Val Loss: 206.112 Time: 15.824
Epoch: 40 Train Loss: 124.085 Val Loss: 220.167 Time: 14.997
Epoch: 50 Train Loss: 112.010 Val Loss: 224.912 Time: 15.812
Epoch: 60 Train Loss: 102.239 Val Loss: 237.583 Time: 15.696
Epoch: 70 Train Loss: 94.971 Val Loss: 242.505 Time: 15.019
Epoch: 80 Train Loss: 89.204 Val Loss: 248.680 Time: 15.901
Epoch: 90 Train Loss: 84.402 Val Loss: 258.046 Time: 15.697


## Translation Decoding

In [53]:
idx2vocab = {v:k for k, v in list(vocab2idx.items())}

In [83]:
def jaccard(string1, string2):
    a = set(string1.lower().split())
    b = set(string2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c) + 1e-8)

In [75]:
def decoding(encoder, decoder, x, y, s, lengths):
    decoded_words = []
    total_jaccard, total = 0, 0
    # **ENCODER**
    hidden, cell = encoder(x, lengths, s)  # passing both the sequence and the sentiment
    hidden = hidden.unsqueeze(0)
    
    # **DECODER**
    batch_target_length = y.size(1)  # NOTE: the length of the selected text
    decoder_input = x[:, 0].unsqueeze(1)
    
    for idx in range(1, batch_target_length):
        output, hidden, cell = decoder(decoder_input, hidden, cell)
        pred = output.argmax(dim=1)
        decoded_words.append(pred)
        decoder_input = pred.unsqueeze(1)
    
    decoded_words = torch.cat([tensor.unsqueeze(0) for tensor in decoded_words]).transpose(1,0)
    
    for i in range(decoded_words.size(0)):
        xi = decoded_words[i].cpu().numpy()
        yi = y[i].cpu().numpy()
        x_sent = ' '.join([idx2vocab[idx] for idx in xi if idx > 3])
        y_sent = ' '.join([idx2vocab[idx] for idx in yi if idx > 3])
        total_jaccard += jaccard(x_sent, y_sent)
        total += y.size(0)
    return total_jaccard

In [81]:
def jac_scoring(encoder, decoder, data_loader):
    encoder.eval()
    decoder.eval()
    jac_score, total = 0, 0
    
    for x, lengths, y, s in tqdm(data_loader):
        jac_score += decoding(encoder, decoder, x.to(device), y.to(device), s.to(device), lengths)
        total += x.size(0)
    print(f"Jaccard Similarity: {jac_score/total:.3f}")

In [77]:
# Model load
encoder = Encoder(len(vocab2idx), 32, 64, len(df_train.code.unique())).to(device)
decoder = Decoder(len(vocab2idx), 64, 64).to(device)
load_model(encoder, decoder, 'models/first_mod/enc_lr_001_i32_h64_tf0.pth', 'models/first_mod/dec_lr_001_i64_h64_tf0.pth')

In [84]:
jac_scoring(encoder, decoder, train_dl)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=8702.0), HTML(value='')))


Jaccard Similarity: 0.000
