In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '6'

import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

from sklearn import model_selection
from sklearn import metrics
import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.autonotebook import tqdm
import tqdm

# Load

In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
train_df, val_df = model_selection.train_test_split(train_df, test_size=0.2, random_state=42)
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

In [3]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,7aef27c516,Doctor Who has finished,Doctor Who has finished,neutral
1,415660cb0e,you should.,you should.,neutral
2,4fdc228bbe,"back at school again. almost weekend. oh wait,...","back at school again. almost weekend. oh wait,...",neutral
3,ea91e5a7ab,My computer is SO slooowww this morning. I th...,My computer is SO slooowww this morning. I th...,neutral
4,8f2022b87f,On my way to dazzle bar!!,On my way to dazzle bar!!,neutral


In [5]:
x = tokenizer.encode(train_df.text[3], add_special_tokens=False)

In [6]:
y = tokenizer.encode(train_df.selected_text[3], add_special_tokens=False)

In [7]:
tokenizer.decode(x)

'my computer is so slooowww this morning. i think it ` s a sign that i should go home and play in my yard.'

# Preprocessing

In [8]:
def sentiment_encoder(x):
    if x == 'negative': return 0
    elif x == 'neutral': return 1
    else: return 2

In [9]:
train_df.sentiment = train_df.sentiment.apply(sentiment_encoder)
val_df.sentiment = val_df.sentiment.apply(sentiment_encoder)

In [10]:
def standardize(x):
    t, st = x
    start_idx = t.index(st)
    end_idx = start_idx + len(st) - 1
    if st == 0: return st
    while start_idx > 0 and t[start_idx-1] != ' ':
        start_idx -= 1
    return  t[start_idx:end_idx+1]

In [11]:
x = ('This is a test', 's a test')
standardize(x) # Should return 'is a test'

'is a test'

In [12]:
train_df.text = train_df.text.astype('str')
train_df.selected_text = train_df.selected_text.astype('str')
val_df.text = val_df.text.astype('str')
val_df.selected_text = val_df.selected_text.astype('str')

In [13]:
train_df['both_text'] = list(zip(train_df.text, train_df.selected_text))
val_df['both_text'] = list(zip(val_df.text, val_df.selected_text))

In [14]:
train_df['selected_text2'] = train_df['both_text'].apply(standardize)
val_df['selected_text2'] = val_df['both_text'].apply(standardize)

### Affected val set

In [15]:
(val_df.selected_text == val_df.selected_text2).sum()

5198

In [16]:
val_df.shape

(5497, 6)

### Tokenize Text

In [17]:
train_df['tok_text'] = train_df.text.apply(lambda x: tokenizer.encode(x, add_special_tokens=False))
val_df['tok_text'] = val_df.text.apply(lambda x: tokenizer.encode(x, add_special_tokens=False))

In [18]:
train_df['tok_selected_text'] = train_df.selected_text2.apply(lambda x: tokenizer.encode(x, add_special_tokens=False))
val_df['tok_selected_text'] = val_df.selected_text2.apply(lambda x: tokenizer.encode(x, add_special_tokens=False))

In [19]:
max([len(x) for x in train_df.tok_text])

108

In [20]:
max([len(x) for x in train_df.tok_selected_text])

108

In [21]:
train_df['both_text'] = list(zip(train_df.tok_text, train_df.tok_selected_text))
val_df['both_text'] = list(zip(val_df.tok_text, val_df.tok_selected_text))

In [22]:
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,both_text,selected_text2,tok_text,tok_selected_text
0,7aef27c516,Doctor Who has finished,Doctor Who has finished,1,"([3460, 2040, 2038, 2736], [3460, 2040, 2038, ...",Doctor Who has finished,"[3460, 2040, 2038, 2736]","[3460, 2040, 2038, 2736]"
1,415660cb0e,you should.,you should.,1,"([2017, 2323, 1012], [2017, 2323, 1012])",you should.,"[2017, 2323, 1012]","[2017, 2323, 1012]"
2,4fdc228bbe,"back at school again. almost weekend. oh wait,...","back at school again. almost weekend. oh wait,...",1,"([2067, 2012, 2082, 2153, 1012, 2471, 5353, 10...","back at school again. almost weekend. oh wait,...","[2067, 2012, 2082, 2153, 1012, 2471, 5353, 101...","[2067, 2012, 2082, 2153, 1012, 2471, 5353, 101..."
3,ea91e5a7ab,My computer is SO slooowww this morning. I th...,My computer is SO slooowww this morning. I th...,1,"([2026, 3274, 2003, 2061, 22889, 9541, 5004, 2...",My computer is SO slooowww this morning. I th...,"[2026, 3274, 2003, 2061, 22889, 9541, 5004, 28...","[2026, 3274, 2003, 2061, 22889, 9541, 5004, 28..."
4,8f2022b87f,On my way to dazzle bar!!,On my way to dazzle bar!!,1,"([2006, 2026, 2126, 2000, 4830, 17644, 3347, 9...",On my way to dazzle bar!!,"[2006, 2026, 2126, 2000, 4830, 17644, 3347, 99...","[2006, 2026, 2126, 2000, 4830, 17644, 3347, 99..."


### Get start and end index for tokenized text

In [23]:
def get_indexes(x):
    t, st = x
    for i, j in enumerate(t):
        if t[i:i+len(st)] == st:
            return i, i+len(st)
    return -1, -1

In [24]:
train_indexes = train_df.both_text.apply(get_indexes)
val_indexes = val_df.both_text.apply(get_indexes)

In [25]:
train_df['start_idx'] = train_indexes.map(lambda x: x[0])
train_df['end_idx'] = train_indexes.map(lambda x: x[1])
val_df['start_idx'] = val_indexes.map(lambda x: x[0])
val_df['end_idx'] = val_indexes.map(lambda x: x[1])

### For test set, if it's -1, will just predict the whole text

In [26]:
train_df = train_df.loc[train_df.start_idx!=-1]

In [27]:
val_df = val_df.loc[val_df.start_idx!=-1]

In [28]:
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

In [29]:
def pad(x):
    return x + ([0] * (108-len(x)))

In [30]:
train_df['tok_text'] = train_df['tok_text'].apply(pad)
val_df['tok_text'] = val_df['tok_text'].apply(pad)

# Dataset

In [31]:
class Tweets(Dataset):
    def __init__(self, df):
        super().__init__()
        self.df = df
        
    def __len__(self): return len(self.df)
        
    def __getitem__(self, idx):
        row = self.df.loc[idx]
        return {'text': row.text, 'selected_text': row.selected_text, 'tok_text': torch.Tensor(row.tok_text), 
                'tok_selected_text': torch.Tensor(row.tok_selected_text), 
                'start_idx': row.start_idx, 'end_idx': row.end_idx,
                'sentiment': row.sentiment}

In [36]:
train_ds = Tweets(train_df)
val_ds = Tweets(val_df)

In [37]:
train_ds[0]

{'text': 'Doctor Who has finished',
 'selected_text': 'Doctor Who has finished',
 'tok_text': tensor([3460., 2040., 2038., 2736.,    0.,    0.,    0.,    0.,    0.,    0.,
            0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
            0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
            0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
            0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
            0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
            0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
            0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
            0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
            0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
            0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.]),
 'tok_selected_text': tensor([3460., 2040., 2038.,

In [39]:
train_dl = DataLoader(train_ds, shuffle=True, num_workers=4)
val_dl = DataLoader(val_ds, num_workers=4)

# Model

In [32]:
class BERT(nn.Module):
    def __init__(self, sent_v=3, sent_emb=50, p=0.5):
        super().__init__()
        self.sent_emb = nn.Embedding(sent_v, sent_emb)
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.start = nn.Linear(108*768+sent_emb, 1)
        self.end = nn.Linear(108*768+sent_emb, 1)
        self.drop = nn.Dropout(p=p)
        self.relu = nn.ReLU()
        
    def forward(self, x, sent):
        sent = self.sent_emb(sent)
        sent = self.drop(sent)
        x = self.bert(x)[0]
        x = x.view(x.shape[0], -1)
        x = self.drop(x)
        x = torch.cat((x, sent), dim=1)
        x = self.relu(x)
        return self.start(x), self.end(x)

In [41]:
batch = next(iter(train_dl))

In [42]:
batch

{'text': ['http://tinyurl.com/oqsqz6 Grace`s FunZen magic mood tool for keeping her cool in the pool of real life which is now yours too'],
 'selected_text': ['for keeping her cool in the pool'],
 'tok_text': tensor([[ 8299.,  1024.,  1013.,  1013.,  4714.,  3126.,  2140.,  1012.,  4012.,
           1013.,  1051.,  4160.,  2015.,  4160.,  2480.,  2575.,  4519.,  1036.,
           1055.,  4569., 10431.,  3894.,  6888.,  6994.,  2005.,  4363.,  2014.,
           4658.,  1999.,  1996.,  4770.,  1997.,  2613.,  2166.,  2029.,  2003.,
           2085.,  6737.,  2205.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     

In [43]:
model = BERT()

In [44]:
model(batch['tok_text'].long(), batch['sentiment'].long())

(tensor([[-0.5272]], grad_fn=<AddmmBackward>),
 tensor([[0.1233]], grad_fn=<AddmmBackward>))

# Training

In [35]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [46]:
model = BERT()

In [47]:
model.to(device)

BERT(
  (sent_emb): Embedding(3, 50)
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [48]:
train_dl = DataLoader(train_ds, shuffle=True, num_workers=4)
val_dl = DataLoader(val_ds, num_workers=4)

In [50]:
def train(model, optimizers, dl, loss_fn=nn.MSELoss()):
    model.train()
    for batch in tqdm.tqdm(dl):
        x = batch['tok_text'].to(device)
        sent = batch['sentiment'].to(device)
        start_idx = batch['start_idx'].to(device)
        end_idx = batch['end_idx'].to(device)
        s, e = model(x.long(), sent.long())
        loss = loss_fn(start_idx, s)
        loss += loss_fn(end_idx, e)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss.item()/len(dl)

In [51]:
def val(model, dl, loss_fn=nn.MSELoss()):
    model.eval()
    for batch in tqdm.tqdm(dl):
        x = batch['tok_text'].to(device)
        sent = batch['sentiment'].to(device)
        start_idx = batch['start_idx'].to(device)
        end_idx = batch['end_idx'].to(device)
        s, e = model(x.long(), sent.long())
        loss = loss_fn(start_idx, s)
        loss += loss_fn(end_idx, e)
    pred = batch['tok_selected_text'][0][int(s):int(e)]
    pred = tokenizer.decode(pred)
    print(f'Text: {batch["text"]}, Selected: {batch["selected_text"]}, Pred: {pred}')
    return loss.item()/len(dl)

In [58]:
for param in model.parameters():
#     print(param)
    param.requires_grad = False

In [59]:
model

BERT(
  (sent_emb): Embedding(3, 50)
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [63]:
model.start.weight.requires_grad = True
model.end.weight.requires_grad = True

In [64]:
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), weight_decay=1e-4)

In [66]:
best_loss = float('inf')
for i in range(1, 4):
    tl = train(model, optimizer, train_dl)
    vl = val(model, val_dl)
    print(f'Epoch {i}: Train Loss: {tl}, Val Loss: {vl}')
    if vl < best_loss:
        print('Save')

  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 21466/21466 [07:40<00:00, 46.66it/s]
100%|██████████| 5353/5353 [01:42<00:00, 52.22it/s]
  0%|          | 0/21466 [00:00<?, ?it/s]

Text: ['watching a movie and rocking my baby kitty who is asleep in my sling hanging from my neck. So cute'], Selected: ['watching a movie and rocking my baby kitty who is asleep in my sling hanging from my neck. So cute'], Pred: from my neck. so cute
Epoch 1: Train Loss: 0.020673303471732914, Val Loss: 0.07333659074606237
Save


100%|██████████| 21466/21466 [07:36<00:00, 47.04it/s]
100%|██████████| 5353/5353 [01:40<00:00, 53.09it/s]
  0%|          | 0/21466 [00:00<?, ?it/s]

Text: ['watching a movie and rocking my baby kitty who is asleep in my sling hanging from my neck. So cute'], Selected: ['watching a movie and rocking my baby kitty who is asleep in my sling hanging from my neck. So cute'], Pred: movie and rocking my
Epoch 2: Train Loss: 0.00903468877063219, Val Loss: 0.04184654915686881
Save


100%|██████████| 21466/21466 [07:38<00:00, 46.86it/s]
100%|██████████| 5353/5353 [01:41<00:00, 52.68it/s]

Text: ['watching a movie and rocking my baby kitty who is asleep in my sling hanging from my neck. So cute'], Selected: ['watching a movie and rocking my baby kitty who is asleep in my sling hanging from my neck. So cute'], Pred: from my neck. so
Epoch 3: Train Loss: 0.022647665385085718, Val Loss: 0.04322066967139484
Save





In [67]:
torch.save(model.state_dict(), 'local_models/bert_v1.pth')

# Jaccard

In [33]:
val_ds = Tweets(val_df)
val_dl = DataLoader(val_ds, shuffle=False)

In [36]:
model = BERT()
model.to(device)
model.load_state_dict(torch.load('local_models/bert_v1.pth'))

<All keys matched successfully>

In [37]:
def val_pred(model, dl, loss_fn=nn.MSELoss()):
    model.eval()
    output = []
    for batch in tqdm.tqdm(dl):
        x = batch['tok_text'].to(device)
        sent = batch['sentiment'].to(device)
        s, e = model(x.long(), sent.long())
        pred = batch['tok_text'][0][int(s):int(e)]
        pred = [i for i in pred if i != 0]
        pred = tokenizer.decode(pred)
        output.append(pred)
    return output

In [38]:
output = val_pred(model, val_dl)

100%|██████████| 5353/5353 [02:18<00:00, 38.78it/s]


In [39]:
val_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,both_text,selected_text2,tok_text,tok_selected_text,start_idx,end_idx
0,a7f72a928a,WOOOOOOOOOO are you coming to Nottingham at...,t? lovelovelove,2,"([15854, 9541, 9541, 9541, 9541, 2024, 2017, 2...",point? lovelovelove,"[15854, 9541, 9541, 9541, 9541, 2024, 2017, 27...","[2391, 1029, 2293, 14301, 18349, 3726]",12,18
1,ef42dee96c,resting had a whole day of walking,resting had a whole day of walking,1,"([8345, 2018, 1037, 2878, 2154, 1997, 3788], [...",resting had a whole day of walking,"[8345, 2018, 1037, 2878, 2154, 1997, 3788, 0, ...","[8345, 2018, 1037, 2878, 2154, 1997, 3788]",0,7
2,07d17131b1,"was in Palawan a couple of days ago, i`ll try ...","was in Palawan a couple of days ago, i`ll try ...",1,"([2001, 1999, 14412, 25903, 1037, 3232, 1997, ...","was in Palawan a couple of days ago, i`ll try ...","[2001, 1999, 14412, 25903, 1037, 3232, 1997, 2...","[2001, 1999, 14412, 25903, 1037, 3232, 1997, 2...",0,19
3,2820205db5,I know! I`m so slow its horrible. DON`T TELL ...,horrible.,0,"([1045, 2113, 999, 1045, 1036, 1049, 2061, 403...",horrible.,"[1045, 2113, 999, 1045, 1036, 1049, 2061, 4030...","[9202, 1012]",9,11
4,7d3ce4363c,"Glad I went out, glad I didn`t leave early, an...",glad,2,"([5580, 1045, 2253, 2041, 1010, 5580, 1045, 21...",glad,"[5580, 1045, 2253, 2041, 1010, 5580, 1045, 213...",[5580],0,1


In [41]:
val_df['pred_sel_text'] = output

In [42]:
val_df['both_pred'] = list(zip(val_df.pred_sel_text, val_df.selected_text))

In [43]:
def jaccard(s1, s2):
    a = set(s1.lower().split()) 
    b = set(s2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [45]:
val_df['jaccard'] = val_df.both_pred.apply(lambda x: jaccard(*x))

In [46]:
val_df.jaccard.mean()

0.2379280405649789