In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [3]:
import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.tokenize import word_tokenize
import time

### Torch Packages
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [5]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [6]:
# index 314 has no text
df_train.dropna(inplace=True)

### Preprocessing
* Lowercase - possible since the predicted sentiment text and the selected_text will be lowercase when computing metric.
* punctuation - keep the punctuation given that the submission file states that need to be quoted and complete. 
* Numericalize - Turn each token into its corresponding tokens.

In [7]:
def preprocessing(sentence):
    """
    This function will preprocess the input sentence sequence to avoid any further preprocessing
    downstream.
    """
    return sentence.lower().split()

In [8]:
# lowercasing all the text and turning them into a list of tokens for text and selected text in the training set
df_train['text'] = df_train['text'].progress_apply(preprocessing)
df_train['selected_text'] = df_train['selected_text'].progress_apply(preprocessing)
df_test['text'] = df_test['text'].progress_apply(preprocessing)

HBox(children=(FloatProgress(value=0.0, max=27480.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=27480.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3534.0), HTML(value='')))




In [9]:
# making the sentiment to a variable
df_train['sentiment'] = df_train['sentiment'].astype('category')
df_train['code'] = df_train['sentiment'].cat.codes
df_test['sentiment'] = df_test['sentiment'].astype('category')
df_test['code'] = df_test['sentiment'].cat.codes

In [10]:
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment,code
0,cb774db0d1,"[i`d, have, responded,, if, i, were, going]","[i`d, have, responded,, if, i, were, going]",neutral,1
1,549e992a42,"[sooo, sad, i, will, miss, you, here, in, san,...","[sooo, sad]",negative,0
2,088c60f138,"[my, boss, is, bullying, me...]","[bullying, me]",negative,0
3,9642c003ef,"[what, interview!, leave, me, alone]","[leave, me, alone]",negative,0
4,358bd9e861,"[sons, of, ****,, why, couldn`t, they, put, th...","[sons, of, ****,]",negative,0


In [11]:
def unique_words(data):
    """
    Find the number of unique words in the training set.
    """
    words = set()
    for text in data:
        for word in text:
            words.add(word)
    return words

In [12]:
unq_words = unique_words(df_train.text)
len(unq_words)

45433

In [13]:
def create_vocabulary(words):
    vocab2idx, idx = {}, 5
    vocab2idx['<pad>'] = 0
    vocab2idx['<unk>'] = 1
    vocab2idx['<sos>'] = 3
    vocab2idx['<eos>'] = 4
    
    for word in words:
        vocab2idx[word] = idx
        idx += 1
    return vocab2idx

In [14]:
vocab2idx = create_vocabulary(unq_words)
len(vocab2idx)

45437

### Numericalize

In [15]:
def encoding_with_no_padding(sentence, vocab2idx):
    numericalize = [vocab2idx['<sos>']]
    for token in sentence:
        numericalize.append(vocab2idx.get(token, vocab2idx['<unk>']))
    numericalize.append(vocab2idx['<eos>'])
    return numericalize

In [16]:
class tweetDataset(Dataset):
    def __init__(self, data, vocab2idx):
        self.X = [encoding_with_no_padding(x, vocab2idx) for x in data['text']]
        self.y = [encoding_with_no_padding(y, vocab2idx) for y in data['selected_text']]
        self.sentiment = data.code.values
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return torch.LongTensor(self.X[idx]), torch.LongTensor(self.y[idx]), self.sentiment[idx]

In [17]:
# sanity check
train_ds = tweetDataset(df_train, vocab2idx)

In [18]:
x, y, s = train_ds[5]
x

tensor([    3,  2283, 34539, 18750, 18013,  3699, 41583,  7089, 16660, 12664,
        24897,  5416,  1355,     4])

In [19]:
s

1

In [20]:
def collate(batch):
    (X, y, s) = zip(*batch)
    x_len = [len(x) for x in X]
    x_pad = pad_sequence(X, batch_first=True, padding_value=0)
    y_pad = pad_sequence(y, batch_first=True, padding_value=0)
    return x_pad, x_len, y_pad, torch.LongTensor(s)

In [21]:
# sanity check
batch_size = 3
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate)
x, lengths, y, s = next(iter(train_dl))

### Seq2Seq

In [22]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, sentiment):
        super().__init__()
        self.vocabs = nn.Embedding(vocab_size, emb_dim)
        self.sentiment = nn.Embedding(sentiment, hidden_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(.5)
        
    def forward(self, x, lengths, sentiment):
        x = self.dropout(self.vocabs(x))
        x_pack = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        out, (hidden, cell) = self.lstm(x)
        return hidden[-1] + self.sentiment(sentiment), cell

In [23]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.vocabs = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, hidden, cell):
        x = self.vocabs(x)
        output, (hidden, cell) = self.lstm(x, (hidden, cell))
        return self.linear(hidden[-1]), hidden, cell  #NOTE: hidden[-1] returns everything within that batch

In [25]:
enc = Encoder(len(vocab2idx), 4, 8 3)

In [26]:
h, c = enc(x, lengths, s)

In [28]:
h.size(), c.size()

(torch.Size([3, 8]), torch.Size([1, 3, 8]))

In [31]:
h = h.unsqueeze(0)

In [40]:
x[:, 0].unsqueeze(1)

tensor([[3],
        [3],
        [3]])

In [41]:
dec = Decoder(len(vocab2idx), 4, 8)

In [43]:
x[:, 0].unsqueeze(1).size()

torch.Size([3, 1])

In [49]:
out, hid, cell = dec(x[:, 0].unsqueeze(1), h, c)

In [54]:
y.size()

torch.Size([3, 29])

In [58]:
def train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl, epochs=10):
    for epoch in tqdm(range(epochs)):
        start = time.time()
        total_loss, total = 0, 0
        for x, lengths, y, s in train_dl:
            loss = train_batch(encoder, decoder, enc_optimizer, dec_optimizer, x.to(device), y.to(device), s.to(device), lengths)
            total_loss += loss*x.size(0)
            total += x.size(0)
        print(f"Epoch: {epoch+1} Training Loss: {loss/total_loss:.3f} Time: {time.time()-start:.3f}")

In [59]:
def train_batch(encoder, decoder, enc_optimizer, dec_optimizer, x, y, s, lengths,
                teacher_forcing_ratio=0.5):
    encoder.train()
    decoder.train()
    enc_optimizer.zero_grad()
    dec_optimizer.zero_grad()
    loss = 0
    
    # **ENCODER**
    hidden, cell = encoder(x, lengths, s)  # passing both the sequence and the sentiment
    hidden = hidden.unsqueeze(0)
    
    # **DECODER**
    batch_target_length = y.size(1)  # NOTE: the length of the selected text
    decoder_input = x[:, 0].unsqueeze(1)
    
    for idx in range(1, batch_target_length):
        output, hidden, cell = decoder(decoder_input, hidden, cell)
        y_idx = y[:, idx]
        loss += F.cross_entropy(output, y_idx, ignore_index=0)
        teacher_force = True if np.random.uniform() > teacher_forcing_ratio else False
        if teacher_force:
            decoder_input = y_idx.unsqueeze(1)
        else:
            decoder_input = output.argmax(dim=1).unsqueeze(1)
    
    # updating the gradient
    loss.backward()
    enc_optimizer.step()
    dec_optimizer.step()
    return loss.item()

In [46]:
encoder = Encoder(len(vocab2idx), 32, 64, len(df_train.code.unique())).to(device)
decoder = Decoder(len(vocab2idx), 64, 64).to(device)
enc_optimizer = optim.Adam(encoder.parameters(), lr=3e-4)
dec_optimizer = optim.Adam(decoder.parameters(), lr=3e-4)

In [47]:
batch_size = 256
train_ds = tweetDataset(df_train, vocab2idx)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate)

In [60]:
train_model(encoder, decoder, enc_optimizer, dec_optimizer, train_dl)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 1 Training Loss: 0.000 Time: 17.197
Epoch: 2 Training Loss: 0.000 Time: 17.198
Epoch: 3 Training Loss: 0.000 Time: 17.202
Epoch: 4 Training Loss: 0.000 Time: 17.161
Epoch: 5 Training Loss: 0.000 Time: 17.149
Epoch: 6 Training Loss: 0.000 Time: 17.175
Epoch: 7 Training Loss: 0.000 Time: 17.054
Epoch: 8 Training Loss: 0.000 Time: 17.331
Epoch: 9 Training Loss: 0.000 Time: 17.079
Epoch: 10 Training Loss: 0.000 Time: 17.173



### To Do
1. double check the loss -> keep getting 0 loss.
2. check the evaluation by translating the text back out.
3. The dataset only has 27k rows so I originally did not split up the dataset. Create a validation set.