<a href="https://colab.research.google.com/github/dexter11235813/END_1.0/blob/main/assignment_9/model2_TwitterDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data import Field, BucketIterator
from torchtext.data import TabularDataset
from torchtext.data import Example, Dataset
import pandas as pd
import spacy
import numpy as np
import time
import re
SEED = 1234

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
path = '/content/drive/MyDrive/Twitter_dataset'

# **Preparing Dataset**

In [None]:
def clean_tweet(tweet):
    # removes @ mentions, hashtags, emojis, twitter reserved words and numbers
    p.set_options(p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY, p.OPT.NUMBER)
    clean = p.clean(tweet)
 
    # transforms every url to "<url>" token and every hashtag to "<hashtag>" token
    p.set_options(p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY, p.OPT.NUMBER, p.OPT.HASHTAG, p.OPT.URL)
    clean = p.tokenize(clean)
    clean = re.sub(r'\$HASHTAG\$', '<hashtag>', clean)
    clean = re.sub(r'\$URL\$', '<url>', clean)
 
    # preprocessor doesn't seem to clean all emojis so we run text trough emoji regex to clean leftovers
    clean = re.sub(emoji.get_emoji_regexp(), '', clean)
 
    # removing zero-width character which is often bundled with emojis
    clean = re.sub(u'\ufe0f', '', clean)
 
    # remove multiple empty spaces with one
    clean = re.sub(r' +', ' ', clean)
 
    # replace &gt; and &lt;
    clean = re.sub(r'&gt;', '>', clean)
    clean = re.sub(r'&lt;', '<', clean)
 
    # strip any leftover spaces at the beginning and end
    clean = clean.strip()
 
    return clean

In [None]:
data = pd.read_csv('/content/drive/MyDrive/Twitter_dataset/twcs/twcs.csv')
data.shape

(2811774, 7)

In [None]:
data = data[~data['in_response_to_tweet_id'].isna()]
data.shape

(2017439, 7)

In [None]:
data.isna().sum()

tweet_id                         0
author_id                        0
inbound                          0
created_at                       0
text                             0
response_tweet_id          1040629
in_response_to_tweet_id          0
dtype: int64

In [None]:
%%time
data = data[['tweet_id', 'text', 'in_response_to_tweet_id', 'author_id']]
data_final = pd.merge(data, data, left_on='tweet_id', right_on='in_response_to_tweet_id', how='inner')

CPU times: user 1.65 s, sys: 26.8 ms, total: 1.68 s
Wall time: 1.69 s


In [None]:
assert data_final['tweet_id_x'].astype(float).equals(data_final['in_response_to_tweet_id_y'])

In [None]:
data_final.head()

Unnamed: 0,tweet_id_x,text_x,in_response_to_tweet_id_x,author_id_x,tweet_id_y,text_y,in_response_to_tweet_id_y,author_id_y
0,1,@115712 I understand. I would like to assist y...,3.0,sprintcare,2,@sprintcare and how do you propose we do that,1.0,115712
1,3,@sprintcare I have sent several private messag...,4.0,115712,1,@115712 I understand. I would like to assist y...,3.0,sprintcare
2,4,@115712 Please send us a Private Message so th...,5.0,sprintcare,3,@sprintcare I have sent several private messag...,4.0,115712
3,5,@sprintcare I did.,6.0,115712,4,@115712 Please send us a Private Message so th...,5.0,sprintcare
4,6,@115712 Can you please send us a private messa...,8.0,sprintcare,5,@sprintcare I did.,6.0,115712


In [None]:
data_final = data_final[['text_x', 'text_y', 'author_id_x', 'author_id_y']]
data_final.columns = ['answer', 'question', 'answer_author_id', 'question_author_id']
data_final.head()

Unnamed: 0,answer,question,answer_author_id,question_author_id
0,@115712 I understand. I would like to assist y...,@sprintcare and how do you propose we do that,sprintcare,115712
1,@sprintcare I have sent several private messag...,@115712 I understand. I would like to assist y...,115712,sprintcare
2,@115712 Please send us a Private Message so th...,@sprintcare I have sent several private messag...,sprintcare,115712
3,@sprintcare I did.,@115712 Please send us a Private Message so th...,115712,sprintcare
4,@115712 Can you please send us a private messa...,@sprintcare I did.,sprintcare,115712


In [None]:
data.tweet_id.nunique(), data.in_response_to_tweet_id.nunique(), data_final.shape

(2017439, 1774822, (1118063, 4))

In [None]:
i = 200
print(f"question: {data_final.loc[i]['question']}, \nanswer: {data_final.loc[i].answer}, answer_author_id: {data_final.loc[i].answer_author_id}, question_author_id: {data_final.loc[i].question_author_id}")

question: @115847 @115821 Yeah idk the shipping company and it says “mike Ziemer” signed for it in the mail room at our building lol, 
answer: @115846 @115821 Remember my head phones and they brought them within minutes, answer_author_id: 115847, question_author_id: 115846


In [None]:
data_final.head()

Unnamed: 0,answer,question,answer_author_id,question_author_id
0,@115712 I understand. I would like to assist y...,@sprintcare and how do you propose we do that,sprintcare,115712
1,@sprintcare I have sent several private messag...,@115712 I understand. I would like to assist y...,115712,sprintcare
2,@115712 Please send us a Private Message so th...,@sprintcare I have sent several private messag...,sprintcare,115712
3,@sprintcare I did.,@115712 Please send us a Private Message so th...,115712,sprintcare
4,@115712 Can you please send us a private messa...,@sprintcare I did.,sprintcare,115712


In [None]:
%%time
data_final['answer'] = data_final['answer'].apply(clean_tweet)
#print('finshed with answers, transforming questions ...')
data_final['question'] = data_final['question'].apply(clean_tweet)

CPU times: user 24min 8s, sys: 136 ms, total: 24min 8s
Wall time: 24min 8s


In [None]:
from sklearn.model_selection import train_test_split
train_ds, valid_ds = train_test_split(data_final, test_size=0.3)
train_ds = train_ds.reset_index(drop=True)
valid_ds = valid_ds.reset_index(drop=True)

In [None]:
train_ds.to_csv(f'{path}/train_ds.csv', index=False)
valid_ds.to_csv(f'{path}/valid_ds.csv', index=False)

In [None]:
# top_answer_ids = tds.answer_author_id.value_counts().head(50).index
# tds = tds[tds.answer_author_id.isin(top_answer_ids)]
# vds = vds[vds.answer_author_id.isin(top_answer_ids)]
# tds.to_csv(f'{path}/train_ds_filtered.csv', index=False)
# vds.to_csv(f'{path}/valid_ds_filtered.csv', index=False)

# **Modelling**

In [None]:
# x = pd.read_csv(f'{path}/train_ds_filtered.csv')
# y = pd.read_csv(f'{path}/valid_ds_filtered.csv')
# x.shape, y.shape

((352879, 4), (150630, 4))

In [None]:
# x_trunc = x.iloc[list(np.random.choice(len(x),100000, replace=False))]
# y_trunc = y.iloc[list(np.random.choice(len(y), 50000, replace=False))]
# x_trunc.shape, y_trunc.shape

((100000, 4), (50000, 4))

In [None]:
# x_trunc[['question', 'answer']].to_csv(f'{path}/train_ds_100k.csv', index=False)
# y_trunc[['question', 'answer']].to_csv(f'{path}/valid_ds_50k.csv', index=False)

In [3]:
spacy_en = spacy.load('en')
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [4]:
QUESTION = Field(
                   tokenize = tokenize_en,
                   lower = True,
                   init_token = '<sos>', 
                   eos_token = '<eos>', 
                   )

ANSWER = Field(tokenize =tokenize_en,
                     lower = True,
                      init_token = '<sos>', 
                      eos_token = '<eos>')

fields = [('question', QUESTION), ('answer', ANSWER)]

In [5]:
%%time
train_ds, valid_ds = TabularDataset.splits(
   path = path,
   train = 'train_ds_100k.csv',
   validation = 'valid_ds_50k.csv',
   format = 'csv',
   fields = fields,
   skip_header = True
)


CPU times: user 39.2 s, sys: 556 ms, total: 39.8 s
Wall time: 39.8 s


In [6]:
%%time
QUESTION.build_vocab(train_ds, min_freq=3)
ANSWER.build_vocab(train_ds, min_freq=3)

CPU times: user 1.41 s, sys: 3.03 ms, total: 1.41 s
Wall time: 1.42 s


In [7]:
BATCH_SIZE = 64
device = 'cuda'
train_iterator, valid_iterator = BucketIterator.splits(
    (train_ds, valid_ds),
    sort_key=lambda x: len(x.answer), # the BucketIterator needs to be told what function it should use to group the data.
    sort_within_batch=True,
    batch_size=BATCH_SIZE,
    # repeat=False, # we pass repeat=False because we want to wrap this Iterator layer
    device = device)

In [8]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        
        batch_size = src.shape[1]
        #print(f'\nsrc shape from s2s forward : {src.shape}, trg shape from s2s forward: {trg.shape}')
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src)
                
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden state and all encoder hidden states
            #receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = np.random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

In [9]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, hidden = self.rnn(embedded)
                
        #outputs = [src len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        
        #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        #outputs are always from the last layer
        
        #hidden [-2, :, : ] is the last of the forwards RNN 
        #hidden [-1, :, : ] is the last of the backwards RNN
        
        #initial decoder hidden is final hidden state of the forwards and backwards 
        #  encoder RNNs fed through a linear layer
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        
        #outputs = [src len, batch size, enc hid dim * 2]
        #hidden = [batch size, dec hid dim]
        
        return outputs, hidden


class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):

        
        
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        #repeat decoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #hidden = [batch size, src len, dec hid dim]
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        
        #energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze(2)
        
        #attention= [batch size, src len]
        
        return F.softmax(attention, dim=1)
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs):
             
        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]

        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
        
        a = self.attention(hidden, encoder_outputs)
                
        #a = [batch size, src len]
        
        a = a.unsqueeze(1)
        
        #a = [batch size, 1, src len]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        
        weighted = torch.bmm(a, encoder_outputs)
        
        #weighted = [batch size, 1, enc hid dim * 2]
        
        weighted = weighted.permute(1, 0, 2)
        
        #weighted = [1, batch size, enc hid dim * 2]

        #print(f'embedded: {embedded.shape}, weighted :{weighted.shape}')
        
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        
        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
            
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        
        #output = [seq len, batch size, dec hid dim * n directions]
        #hidden = [n layers * n directions, batch size, dec hid dim]
        
        #seq len, n layers and n directions will always be 1 in this decoder, therefore:
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]
        #this also means that output == hidden
        assert (output == hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden.squeeze(0)

In [10]:
INPUT_DIM = len(QUESTION.vocab)
OUTPUT_DIM = len(ANSWER.vocab)
ENC_EMB_DIM = 64
DEC_EMB_DIM = 64
ENC_HID_DIM = 128
DEC_HID_DIM = 128
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)

In [11]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(14182, 64)
    (rnn): GRU(64, 128, bidirectional=True)
    (fc): Linear(in_features=256, out_features=128, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=384, out_features=128, bias=True)
      (v): Linear(in_features=128, out_features=1, bias=False)
    )
    (embedding): Embedding(12177, 64)
    (rnn): GRU(320, 128)
    (fc_out): Linear(in_features=448, out_features=12177, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [12]:
optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = ANSWER.vocab.stoi[ANSWER.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [13]:

from functools import partial
from tqdm import tqdm

tqdm = partial(tqdm, leave=True, position=0)
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in tqdm(enumerate(iterator), total=len(iterator)):
        
        src = batch.question
        trg = batch.answer
        
        #print(f'\nsrc shape from train : {src.shape}, trg shape from train: {trg.shape}')
        optimizer.zero_grad()
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in tqdm(enumerate(iterator), total=len(iterator)):

            src = batch.question
            trg = batch.answer

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [14]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        #torch.save(model.state_dict(), 'tut3-model.pt')
        print(f'best loss is {best_valid_loss} now')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {np.exp(valid_loss):7.3f}')

100%|██████████| 3125/3125 [11:09<00:00,  4.67it/s]
100%|██████████| 1563/1563 [01:22<00:00, 18.88it/s]
  0%|          | 0/3125 [00:00<?, ?it/s]

best loss is 6.112677135150248 now
Epoch: 01 | Time: 12m 32s
	Train Loss: 5.110 | Train PPL: 165.654
	 Val. Loss: 6.113 |  Val. PPL: 451.546


100%|██████████| 3125/3125 [11:07<00:00,  4.68it/s]
100%|██████████| 1563/1563 [01:21<00:00, 19.06it/s]
  0%|          | 0/3125 [00:00<?, ?it/s]

Epoch: 02 | Time: 12m 29s
	Train Loss: 4.613 | Train PPL: 100.764
	 Val. Loss: 6.124 |  Val. PPL: 456.906


  9%|▉         | 294/3125 [01:02<10:50,  4.35it/s]