In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.tokenize import word_tokenize
import time
from sklearn.model_selection import train_test_split

### Torch Packages
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [4]:
df = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [5]:
# index 314 has no text
df.dropna(inplace=True)

In [6]:
df_train, df_val, _, _ = train_test_split(df, 
                                          df['selected_text'], 
                                          test_size=.05, 
                                          random_state=42)

In [7]:
# 26k samples in training to 1k samples in validation
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

### Preprocessing
* Lowercase - possible since the predicted sentiment text and the selected_text will be lowercase when computing metric.
* punctuation - keep the punctuation given that the submission file states that need to be quoted and complete. 
* Numericalize - Turn each token into its corresponding tokens.

In [8]:
def preprocessing(sentence):
    """
    This function will preprocess the input sentence sequence to avoid any further preprocessing
    downstream.
    """
    return sentence.lower().split()

In [9]:
# lowercasing all the text and turning them into a list of tokens for text and selected text in the training set
df_train['text'] = df_train['text'].progress_apply(preprocessing)
df_val['text'] = df_val['text'].progress_apply(preprocessing)

df_train['selected_text'] = df_train['selected_text'].progress_apply(preprocessing)
df_val['selected_text'] = df_val['selected_text'].progress_apply(preprocessing)

df_test['text'] = df_test['text'].progress_apply(preprocessing)

HBox(children=(FloatProgress(value=0.0, max=26106.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1374.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26106.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1374.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3534.0), HTML(value='')))




In [10]:
# making the sentiment to a variable
df_train['sentiment'] = df_train['sentiment'].astype('category')
df_train['code'] = df_train['sentiment'].cat.codes

df_val['sentiment'] = df_val['sentiment'].astype('category')
df_val['code'] = df_val['sentiment'].cat.codes

df_test['sentiment'] = df_test['sentiment'].astype('category')
df_test['code'] = df_test['sentiment'].cat.codes

In [11]:
def unique_words(data):
    """
    Find the number of unique words in the training set.
    """
    words = set()
    for text in data:
        for word in text:
            words.add(word)
    return words

In [12]:
unq_words = unique_words(df_train.text)
len(unq_words)

43859

In [13]:
def create_vocabulary(words):
    vocab2idx, idx = {}, 4
    vocab2idx['<pad>'] = 0
    vocab2idx['<unk>'] = 1
    vocab2idx['<sos>'] = 2
    vocab2idx['<eos>'] = 3
    
    for word in words:
        vocab2idx[word] = idx
        idx += 1
    return vocab2idx

In [14]:
vocab2idx = create_vocabulary(unq_words)
len(vocab2idx)

43863

## Dataset

### Numericalize

In [20]:
def encoding_with_no_padding(sentence, vocab2idx):
    numericalize = [vocab2idx['<sos>']]
    for token in sentence:
        numericalize.append(vocab2idx.get(token, vocab2idx['<unk>']))
    numericalize.append(vocab2idx['<eos>'])
    return numericalize

In [31]:
class tweetDataset(Dataset):
    def __init__(self, data, vocab2idx):
        self.X = [encoding_with_no_padding(x, vocab2idx) for x in data['text']]
        self.y = [encoding_with_no_padding(y, vocab2idx) for y in data['selected_text']]
        #self.sentiment = data.code.values
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return torch.LongTensor(self.X[idx]), torch.LongTensor(self.y[idx])

In [32]:
# sanity check
train_ds = tweetDataset(df_train, vocab2idx)
valid_ds = tweetDataset(df_val, vocab2idx)

In [33]:
def collate(batch):
    (X, y, s) = zip(*batch)
    x_len = [len(x) for x in X]
    x_pad = pad_sequence(X, batch_first=True, padding_value=0)
    y_pad = pad_sequence(y, batch_first=True, padding_value=0)
    return x_pad, x_len, y_pad

In [24]:
# sanity check
batch_size = 3
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate)
x, lengths, y, s = next(iter(train_dl))

### Seq2Seq

In [162]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, sentiment):
        super().__init__()
        self.vocabs = nn.Embedding(vocab_size, emb_dim)
        self.sentiment = nn.Embedding(sentiment, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(.5)
        
    def forward(self, x, lengths):
        x = self.dropout(self.vocabs(x))
        x_pack = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        out, (hidden, cell) = self.lstm(x)
        return out, torch.flatten(hidden.transpose(1,0), 1)

In [163]:
#got rid of multi layer
encoder = Encoder(len(vocab2idx), 32, 64, 1, df_train.code.nunique())

In [164]:
out, h = encoder(x, lengths) # h has dimensions [batch_size, hidden_dim]

In [165]:
h.size()

torch.Size([3, 128])

In [145]:
t = torch.randn(3, 64)

In [147]:
torch.cat((t, h), dim=1).size()

torch.Size([3, 128])

In [149]:
out.size(), h.size()

(torch.Size([3, 64]), torch.Size([3, 64]))

In [128]:
# unsqueeze and permute so matrix multiplication can be done
h = h.unsqueeze(0)
h = h.permute((1,2,0))

In [129]:
res = torch.bmm(out, h)

In [130]:
res.size()

torch.Size([3, 17, 1])

In [131]:
alpha = F.softmax(res.squeeze(), dim=1)  ## dim of (3, 17) [batch, seq]

In [133]:
alpha.unsqueeze(1).size()

torch.Size([3, 1, 17])

In [134]:
attention = torch.bmm(alpha.unsqueeze(1),out)

In [150]:
attention.squeeze(1).size()

torch.Size([3, 128])

In [115]:
out.size(), alpha.size()

(torch.Size([3, 17, 128]), torch.Size([3, 17]))

In [83]:
dec = nn.LSTM(len(vocab2idx), 64, 128)

In [152]:
class AttentionDecoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.vocabs = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(2*hidden_dim, vocab_size)
        
    def forward(self, x, hidden, cell, enc_hidden):
        x = self.vocabs(x)
        output, (hidden, cell) = self.lstm(x, (hidden, cell))
        hidden = hidden.unsqueeze(0).permute((1,2,0))
        scores = torch.bmm(enc_hidden, hidden)
        alpha = F.softmax(scores.squeeze(), dim=1)
        attention = torch.bmm(alpha.unsqueeze(1), enc_hidden).squeeze(1)
        final = torch.cat((attention, hidden), dim=1)
        return self.linear(final[-1]), hidden, cell  #NOTE: hidden[-1] returns everything within that batch

In [166]:
#got rid of multi layer
encoder = Encoder(len(vocab2idx), 32, 64, 1, df_train.code.nunique())
decoder = AttentionDecoder(len(vocab2idx), 64, 128)

In [167]:
out, h = encoder(x, lengths)

In [25]:
def train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, valid_dl, 
                e_path, d_path, tf_ratio, epochs=10, save_val=False):
    best_val = float('inf')
    for epoch in tqdm(range(epochs)):
        start = time.time()
        total_loss, total = 0, 0
        val_loss, total_v = 0, 0
        ### Training
        for x, lengths, y, s in train_dl:
            loss = train_batch(encoder, 
                               decoder, 
                               enc_optimizer, 
                               dec_optimizer, 
                               x.to(device), 
                               y.to(device), 
                               s.to(device), 
                               lengths, 
                               tf=True,
                               tf_ratio=tf_ratio)
            total_loss += loss*x.size(0)
            total += x.size(0)
        ### Validation
        for x, lengths, y, s in valid_dl:
            v_loss = train_batch(encoder, 
                                 decoder, 
                                 enc_optimizer, 
                                 dec_optimizer, 
                                 x.to(device), 
                                 y.to(device), 
                                 s.to(device), 
                                 lengths, 
                                 tf=False, 
                                 train=False)
            val_loss += v_loss*x.size(0)
            total_v += x.size(0)
        if (epoch+1) % 10 == 0:
            print(f"Epoch: {epoch+1} Train Loss: {total_loss/total:.3f} Val Loss: {val_loss/total_v:.3f} Time: {time.time()-start:.3f}")
        
        if save_val:
            if best_val > (val_loss/total_v):
                save_model(encoder, decoder, e_path, d_path)
                best_val = val_loss/total_v
        else:
            if best_val > (total_loss/total):
                save_model(encoder, decoder, e_path, d_path)
                best_val = total_loss/total

In [26]:
def train_batch(encoder, decoder, enc_optimizer, dec_optimizer, x, y, s, lengths, tf,
                train=True, tf_ratio=0.5):
    if train:
        encoder.train()
        decoder.train()
    else:
        encoder.eval()
        decoder.eval()
    
    # zero grad for both optimizers
    enc_optimizer.zero_grad()
    dec_optimizer.zero_grad()
    loss = 0
    
    # **ENCODER**
    hidden = encoder(x, lengths, s)  # passing both the sequence and the sentiment
    hidden = hidden.unsqueeze(0)
    cell = torch.zeros(hidden.size()).cuda()
    
    # **DECODER**
    batch_target_length = y.size(1)  # NOTE: the length of the selected text
    decoder_input = x[:, 0].unsqueeze(1)
    
    for idx in range(1, batch_target_length):
        output, hidden, cell = decoder(decoder_input, hidden, cell)
        y_idx = y[:, idx]
        loss += F.cross_entropy(output, y_idx, ignore_index=0)
        # if teacher forcing
        if tf:
            teacher_force = True if np.random.uniform() > tf_ratio else False
            if teacher_force:
                decoder_input = y_idx.unsqueeze(1)
        else:
            decoder_input = output.argmax(dim=1).unsqueeze(1)
            
    # updating the gradient
    if train:
        loss.backward()
        enc_optimizer.step()
        dec_optimizer.step()
    return loss.item()

In [27]:
def save_model(encoder, decoder, e_path, d_path):
    torch.save(encoder.state_dict(), e_path)
    torch.save(decoder.state_dict(), d_path)
    
def load_model(encoder, decoder, e_path, d_path):
    encoder.load_state_dict(torch.load(e_path))
    decoder.load_state_dict(torch.load(d_path))

## Model Training

In [30]:
# purposedly saving overfit model to test for sentence quality
encoder = Encoder(len(vocab2idx), 32, 64, 2, len(df_train.code.unique())).to(device)
decoder = Decoder(len(vocab2idx), 64, 288).to(device)
enc_optimizer = optim.Adam(encoder.parameters(), lr=.001)
dec_optimizer = optim.Adam(decoder.parameters(), lr=.001)

In [36]:
batch_size = 256
train_ds = tweetDataset(df_train, vocab2idx)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, collate_fn=collate)

In [34]:
model_path = 'models/multi_bidir/'
train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, valid_dl, 
            model_path + 'bidir_multi_enc_lr_001_i32_h64.pth',
            model_path + 'dec_lr_001_i64_h64.pth',
            0,
            100)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Epoch: 10 Train Loss: 147.639 Val Loss: 214.796 Time: 26.662
Epoch: 20 Train Loss: 109.788 Val Loss: 240.359 Time: 26.797
Epoch: 30 Train Loss: 86.114 Val Loss: 257.885 Time: 26.630
Epoch: 40 Train Loss: 69.968 Val Loss: 281.450 Time: 26.796
Epoch: 50 Train Loss: 58.125 Val Loss: 292.795 Time: 26.669
Epoch: 60 Train Loss: 48.584 Val Loss: 312.429 Time: 26.633
Epoch: 70 Train Loss: 40.244 Val Loss: 337.797 Time: 26.575
Epoch: 80 Train Loss: 33.345 Val Loss: 361.842 Time: 26.618
Epoch: 90 Train Loss: 27.753 Val Loss: 383.094 Time: 26.760
Epoch: 100 Train Loss: 23.181 Val Loss: 405.456 Time: 26.628



## Translation Decoding

In [28]:
idx2vocab = {v:k for k, v in list(vocab2idx.items())}

In [29]:
def jaccard(string1, string2):
    a = set(string1.lower().split())
    b = set(string2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c) + 1e-8)

In [30]:
def decoding(encoder, decoder, x, y, s, lengths):
    decoded_words = []
    total_jaccard, total = 0, 0
    
    # **ENCODER**
    hidden = encoder(x, lengths, s)  # passing both the sequence and the sentiment
    hidden = hidden.unsqueeze(0)
    cell = torch.zeros(hidden.size()).cuda()
    
    # **DECODER**
    batch_target_length = y.size(1)  # NOTE: the length of the selected text
    decoder_input = x[:, 0].unsqueeze(1)
    
    for idx in range(1, batch_target_length):
        output, hidden, cell = decoder(decoder_input, hidden, cell)
        pred = output.argmax(dim=1)
        decoded_words.append(pred)
        decoder_input = pred.unsqueeze(1)
    
    decoded_words = torch.cat([tensor.unsqueeze(0) for tensor in decoded_words]).transpose(1,0)
    
    for i in range(decoded_words.size(0)):
        xi = decoded_words[i].cpu().numpy()
        yi = y[i].cpu().numpy()
        x_sent = ' '.join([idx2vocab[idx] for idx in xi if idx > 3])
        y_sent = ' '.join([idx2vocab[idx] for idx in yi if idx > 3])
        total_jaccard += jaccard(x_sent, y_sent)
        total += y.size(0)
    return total_jaccard

In [31]:
def jac_scoring(encoder, decoder, data_loader):
    encoder.eval()
    decoder.eval()
    jac_score, total = 0, 0
    
    for x, lengths, y, s in tqdm(data_loader):
        jac_score += decoding(encoder, decoder, x.to(device), y.to(device), s.to(device), lengths)
        total += x.size(0)
    print(f"Jaccard Similarity: {jac_score/total:.3f}")

In [44]:
jac_scoring(encoder, decoder, train_dl)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=102.0), HTML(value='')))


Jaccard Similarity: 0.074


### Translation

In [45]:
translation_train_dl = DataLoader(train_ds, batch_size=3, shuffle=True, collate_fn=collate)
x, lengths, y, s = next(iter(translation_train_dl))

In [32]:
def translate(encoder, decoder, x, y, s, lengths):
    decoded_words = []
    total_jaccard, total = 0, 0
    
    # **ENCODER**
    hidden = encoder(x, lengths, s)  # passing both the sequence and the sentiment
    hidden = hidden.unsqueeze(0)
    cell = torch.zeros(hidden.size()).cuda()
    
    # **DECODER**
    batch_target_length = y.size(1)  # NOTE: the length of the selected text
    decoder_input = x[:, 0].unsqueeze(1)
    
    for idx in range(1, batch_target_length):
        output, hidden, cell = decoder(decoder_input, hidden, cell)
        pred = output.argmax(dim=1)
        decoded_words.append(pred)
        decoder_input = pred.unsqueeze(1)
    
    decoded_words = torch.cat([tensor.unsqueeze(0) for tensor in decoded_words]).transpose(1,0)
    
    for i in range(decoded_words.size(0)):
        xi = decoded_words[i].cpu().numpy()
        yi = y[i].cpu().numpy()
        x_sent = ' '.join([idx2vocab[idx] for idx in xi if idx > 3])
        y_sent = ' '.join([idx2vocab[idx] for idx in yi if idx > 3])
        print(f"Ground Truth")
        print()
        print(y_sent)
        print()
        print(f"Decoded Sentence")
        print(x_sent)
        print()

In [52]:
# this came from the bidirectional model at 100 epochs at 23.18 train loss
translate(encoder, decoder, x.to(device), y.to(device), s.to(device), lengths)

Ground Truth

concert next week cancelled

Decoded Sentence


Ground Truth

too many germï¿½n rodrï¿½guez`s results. which one are you? i mean, how do you look like in your profile picture?

Decoded Sentence
the fame ! . . . . it up and one off and some people will never be in that case

Ground Truth

good

Decoded Sentence
good



### Further training

In [53]:
# purposedly saving overfit model to test for sentence quality
encoder = Encoder(len(vocab2idx), 32, 64, 2, len(df_train.code.unique())).to(device)
decoder = Decoder(len(vocab2idx), 64, 288).to(device)
enc_optimizer = optim.Adam(encoder.parameters(), lr=.001)
dec_optimizer = optim.Adam(decoder.parameters(), lr=.001)
load_model(encoder, decoder, 
           'models/multi_bidir/bidir_multi_enc_lr_001_i32_h64.pth', 
           'models/multi_bidir/dec_lr_001_i64_h64.pth')

In [54]:
# this came from the bidirectional model at 100 epochs at 23.18 train loss
translate(encoder, decoder, x.to(device), y.to(device), s.to(device), lengths)

Ground Truth

concert next week cancelled

Decoded Sentence


Ground Truth

too many germï¿½n rodrï¿½guez`s results. which one are you? i mean, how do you look like in your profile picture?

Decoded Sentence
the fame ! . . . . it up and one off and some people will never be in that case

Ground Truth

good

Decoded Sentence
thanks for the day



In [55]:
# continuous training
train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, valid_dl, 
            model_path + 'bidir_multi_enc_lr_001_i32_h64_ct.pth',
            model_path + 'dec_lr_001_i64_h64_ct.pth',
            0,
            100)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Epoch: 10 Train Loss: 19.322 Val Loss: 429.665 Time: 26.717
Epoch: 20 Train Loss: 17.307 Val Loss: 433.187 Time: 26.760
Epoch: 30 Train Loss: 13.995 Val Loss: 474.280 Time: 26.693
Epoch: 40 Train Loss: 13.573 Val Loss: 484.221 Time: 26.519
Epoch: 50 Train Loss: 10.795 Val Loss: 507.179 Time: 26.672
Epoch: 60 Train Loss: 10.727 Val Loss: 528.910 Time: 26.602
Epoch: 70 Train Loss: 8.713 Val Loss: 541.374 Time: 26.617
Epoch: 80 Train Loss: 7.929 Val Loss: 553.851 Time: 26.738
Epoch: 90 Train Loss: 8.037 Val Loss: 563.733 Time: 26.657
Epoch: 100 Train Loss: 6.791 Val Loss: 583.091 Time: 26.360



In [33]:
# purposedly saving overfit model to test for sentence quality
encoder = Encoder(len(vocab2idx), 32, 64, 2, len(df_train.code.unique())).to(device)
decoder = Decoder(len(vocab2idx), 64, 288).to(device)
enc_optimizer = optim.Adam(encoder.parameters(), lr=.001)
dec_optimizer = optim.Adam(decoder.parameters(), lr=.001)
load_model(encoder, decoder, 
           'models/multi_bidir/bidir_multi_enc_lr_001_i32_h64_ct.pth',
           'models/multi_bidir/dec_lr_001_i64_h64_ct.pth')

In [34]:
# this came from the bidirectional model at 100 epochs at 23.18 train loss
translate(encoder, decoder, x.to(device), y.to(device), s.to(device), lengths)

Ground Truth

really wants a puppy...

Decoded Sentence
re-tweet. a.m! sofa ale

Ground Truth

working 930a-730p today. 1.5 hour lunch..8.5 hours work. overtime = good. the fact that i`m already dead tired = very bad.

Decoded Sentence
big pos.. big planted). sj. amos 16-8! succession. saimee =] hmph plis great.... 2-goodbyes knowww crown! rocked!! odyssey, neverrr

Ground Truth

separate desks, computers, and sides of the room with short dividers...but, we`re thinking we need them to extend to the ceiling

Decoded Sentence
big pos.. big planted). sj. amos 16-8! succession. saimee =] hmph plis great.... 2-goodbyes knowww crown! rocked!! odyssey, neverrr



In [38]:
jac_scoring(encoder, decoder, train_dl)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=102.0), HTML(value='')))


Jaccard Similarity: 0.000


In [None]:
# continuous training
model_path = 'models/multi_bidir/'
train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, valid_dl, 
            model_path + 'bidir_multi_enc_lr_001_i32_h64_ct2.pth',
            model_path + 'dec_lr_001_i64_h64_ct2.pth',
            0,
            50)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Epoch: 10 Train Loss: 168.187 Val Loss: 230.240 Time: 26.385
Epoch: 20 Train Loss: 115.341 Val Loss: 254.984 Time: 26.591
Epoch: 30 Train Loss: 79.362 Val Loss: 284.526 Time: 26.592
Epoch: 40 Train Loss: 57.075 Val Loss: 286.976 Time: 26.738
