In [1]:
import os
import numpy as np
import pandas as pd
import nltk.tokenize
import re
import random
from nltk.util import ngrams
import tqdm
from nltk.tokenize import RegexpTokenizer
import torch

### Load Data

In [2]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pack_sequence, pad_packed_sequence
from torchtext.data.utils import get_tokenizer
from collections import Counter, OrderedDict
from torchtext.vocab import vocab
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [3]:
def collate_fn(data):
    data.sort(key=lambda x: len(x[0]), reverse=True)
    text_data = []
    target_data = []
    for unit in data:
        text_data.append(torch.tensor(unit[0]))
        target_data.append(torch.tensor(unit[1]))
    text = pad_sequence(text_data, batch_first=True)
    target = pad_sequence(target_data, batch_first=True)
    return text, target

In [4]:
loader_path = "LSTM_test_data/test_data_loader.pth"
vocab_path = "LSTM_test_data/text_vocab.pth"
test_data_loader = torch.load(loader_path)
text_vocab = torch.load(vocab_path)

BiLSTM model

In [5]:
class Encoder(torch.nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, dec_hidden_dim, num_layers,dropout=0.5):
        super().__init__()
        self.input_dim=input_dim
        self.hidden_dim=hidden_dim
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.num_layers=num_layers
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, dec_hidden_dim)

        self.layer=nn.LSTM(input_size=emb_dim,hidden_size=hidden_dim, \
                        num_layers=num_layers,batch_first=True, \
                        dropout=dropout,bidirectional=True)
    
    def forward(self,x):
        batch_size = x.shape[0]
        
        embedded = self.dropout(self.embedding(x))     
        
        out,(hidden,c)=self.layer(embedded)
        
        s = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        
        return out, s

Attention Mechanism

In [6]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        # [size(h_t)+size(s_{t-1}), dec_hid_dim]
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim, bias=False)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, s, enc_output):
        # s = [batch_size, dec_hid_dim]
        # enc_output = [batch_size, src_len, enc_hid_dim * 2]

        batch_size = enc_output.shape[0]
        src_len = enc_output.shape[1]

        # repeat decoder hidden state src_len times
        # s = [batch_size, src_len, enc_hid_dim * 2]
        # enc_output = [batch_size, src_len, enc_hid_dim * 2]
        s = s.unsqueeze(1).repeat(1, src_len, 1)

        # energy = [batch_size, src_len, dec_hid_dim]
        energy = torch.tanh(self.attn(torch.cat((s, enc_output), dim=2)))

        # attention = [batch_size, src_len]
        attention = self.v(energy).squeeze(2)

        return F.softmax(attention, dim=1)

In [7]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention, device):
        super().__init__()
        self.output_dim = output_dim
        self.dec_hid_dim = dec_hid_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.device = device
        
        self.layer=nn.LSTM(input_size=enc_hid_dim * 2 + emb_dim, hidden_size=dec_hid_dim, \
                        num_layers=1,batch_first=True, \
                        dropout=dropout,bidirectional=False)
        
        self.fc_out = nn.Linear(enc_hid_dim * 2 + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, dec_input, s, enc_output):
        # dec_input = [batch_size]
        # s = [batch_size, dec_hid_dim]
        # enc_output = [src_len, batch_size, enc_hid_dim *2]
        
        batch_size = dec_input.shape[0]

        # dec_input = [batch_size,1]
        dec_input = dec_input.unsqueeze(1)

        # embedded = [batch_size, 1, emb_dim]
        embedded = self.dropout(self.embedding(dec_input))

        # s = [batch_size, dec_hid_dim]
        # enc_output = [batch_size, src_len, enc_hid_dim *2]

        # a = [batch_size, 1, src_len]
        a = self.attention(s, enc_output).unsqueeze(1)

        # c = [batch_size, 1, enc_hid_dim * 2]
        c = torch.bmm(a, enc_output)

        # lstm_input = [batch_size, 1, (enc_hid_dim*2) + emb_dim]
        lstm_input = torch.cat((embedded, c), dim=2)
        
        c0 = torch.randn(1, batch_size, self.dec_hid_dim).to(self.device)

        # dec_output = [batch_size, src_len(=1), dec_hid_dim]
        # dec_hidden = [n_layers*num_directions, batch_size, dec_hid_dim]
        dec_output, (dec_hidden, _) = self.layer(lstm_input, (s.unsqueeze(0), c0))

        # embedded = [batch_size, emb_dim]
        # dec_output = [batch_size, dec_hid_dim]
        # c = [batch_size, enc_hid_dim * 2]
        embedded = embedded.squeeze(1)
        dec_output = dec_output.squeeze(1)
        c = c.squeeze(1)

        # pred = [batch_size, output_dim]
        pred = self.fc_out(torch.cat((dec_output, c, embedded), dim=1))

        return pred, dec_hidden.squeeze(0)


In [8]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.device = device
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src = [batch_size, text_length]
        # trg = [batch_size, summarizarion_length]
        # teacher_forcing_ratio is probability to use teacher forcing (scheduled sampling)
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        # enc_output : [src_len, batch_size, enc_hid_dim * 2]
        # s : [batch_size, dec_hid_dim]
        enc_output, s = self.encoder(src)

        # first input to the decoder is the <bob> tokens
        dec_input = trg[:, 0]

        for t in range(1, trg_len):
            dec_output, s = self.decoder(dec_input, s, enc_output)

            outputs[t] = dec_output

            # decide if using teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio

            # get the highest predicted token from predictions
            prediction = dec_output.argmax(1)

            # if teacher forcing, use actural next token as input
            # if not, use predicted token
            dec_input = trg[:, t] if teacher_force else prediction

        return outputs


In [9]:
# Define Hyper parameter

INPUT_DIM = len(text_vocab)
OUTPUT_DIM = len(text_vocab)
ENC_EMB_DIM = 300
DEC_EMB_DIM = 300
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

device = "cuda"

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, 2, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn, device)

model = Seq2Seq(enc, dec, device).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)



### Reload a Trained Model

In [10]:
model_dir = 'LSTM_model/lstm_model.pt'
model.load_state_dict(torch.load(model_dir))

<All keys matched successfully>

### Test Model

In [13]:
model.eval()
pre_list = []
highlight_list = []
article_list = []
with torch.no_grad():
    for i, batch in tqdm.tqdm(enumerate(test_data_loader)):
        text, highlight = batch
            
        text = text.to(device)
        highlight = highlight.to(device)

        # output = [highlight_len, batch_size, output_dim]

        output = model(text, highlight, 0) # turn off teacher forcing

        # output = [batch_size, highlight_len, output_dim]
        output = output.permute([1,0,2])

        batch_size = highlight.shape[0]
        for j in range(batch_size):
            highlight_text = ""
            pre_text = ""
            article = ""
            for num_high in list(highlight[j]):
                highlight_text += text_vocab.lookup_token(num_high)+" "
            for num_at in list(text[j]):
                article += text_vocab.lookup_token(num_at)+" "
            for num_pre in list(output[j].argmax(1)):
                pre_text += text_vocab.lookup_token(num_pre)+" "
            pre_list.append(pre_text.replace("<pad>", '').replace("<bos>", '').replace("<eos>", ''))
            highlight_list.append(highlight_text.replace("<pad>", '').replace("<bos>", '').replace("<eos>", ''))
            article_list.append(article.replace("<pad>", '').replace("<bos>", '').replace("<eos>", ''))
        

991it [10:26,  1.58it/s]


In [14]:
import nltk.tokenize
import re
import random
from nltk.util import ngrams
import tqdm
from nltk.tokenize import RegexpTokenizer
from bert_score import score

# Import test function 
import test_baseline

In [18]:
test_tokenizer = RegexpTokenizer(r'\w+')

r_1 = 0
r_2 = 0
n = 2
for i in tqdm.tqdm(range(len(pre_list))):
    predict_tokens = test_tokenizer.tokenize(pre_list[i])
    reference_tokens = test_tokenizer.tokenize(highlight_list[i])
    r_1 += test_baseline.rouge_1(predict_tokens, reference_tokens)
    r_2 += test_baseline.rouge_n(predict_tokens, reference_tokens, n)
    
P, R, F1  = score(pre_list, highlight_list, lang = "en", verbose = True)

100%|████████████████████████████████████████████████████████████████████████████| 1982/1982 [00:00<00:00, 4435.96it/s]
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


  0%|          | 0/58 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/31 [00:00<?, ?it/s]

done in 67.24 seconds, 29.47 sentences/sec


In [19]:
print("ROUGE-1 Score: ", r_1)
print("ROUGE-2 Score: ", r_1)

ROUGE-1 Score:  1609.4108270775428
ROUGE-2 Score:  1609.4108270775428


In [20]:
print("BERT Precision: ", torch.sum(P))
print("BERT Recall: ", torch.sum(R))
print("BERT F-1 Score: ", torch.sum(F1))

BERT Precision:  tensor(1584.3958)
BERT Recall:  tensor(1620.7644)
BERT F-1 Score:  tensor(1602.1873)


In [25]:
pre_list[145]

' President Obama , " the first gay president , " zig - <unk> on gay marriage over time   .   In 1996 , he endorses same - sex marriage in a survey   .   But in 2011 , a White House adviser says someone else filled out that survey   .   Today , he is the first sitting president to endorse same - sex marriage .  '

In [26]:
article_list[145]

' He has been declared America \'s " first gay president . "   But President Barack Obama \'s evolution to that title has n\'t been easy . His positions zig - <unk> over almost two decades .   His advocacy of same - sex marriage began well before his White House years , tracing back to his early political service in Illinois . The effectiveness of his leadership , however , will be determined by the U.S. Supreme Court as it considers a California ban on same - sex marriage .   1996 : While running for the Illinois Senate , Obama signs a questionnaire for a gay Chicago publication saying he favors legalizing same - sex marriages . He later wins the race .   1998 : He alters course and answers " undecided " on same - sex marriage when questioned in another survey .   2003 : In his campaign for the Illinois Senate , Obama says in a questionnaire that he is against repealing the Defense of Marriage Act , a 1996 federal law that states for federal purposes , marriage is defined as only betw