# Library and Dataset

In [1]:
%%capture
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install bert_score

In [None]:
import re
import os
import pandas as pd
import numpy as np
import random

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader
import transformers
import datasets

import evaluate
from evaluate import load

In [None]:
CNN_dataset_train = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:2%]")
CNN_dataset_val = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:5%]")
CNN_dataset_test = datasets.load_dataset("cnn_dailymail", "3.0.0", split="test[:5%]")

# Preprocessing and Tokenization

In [4]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}
stop_words = stopwords.words('english')

In [5]:
def clean_data(text):
  text = text.lower()
  text = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in text.split(" ")])

  ### remove stopwords
  # text = text.split()
  # newtext = []
  # for word in text:
  #   if word not in stop_words:
  #     newtext.append(word)
  # text = " ".join(newtext)

  text=re.sub(r'\n',' ', text)
  text=re.sub(r'>',' ', text)
  text=re.sub(r'<',' ', text)
  text=re.sub(r'LRB',' ', text)
  text=re.sub(r'RRB',' ', text)
  text = re.sub(r'[" "]+', " ", text)
  text=re.sub(r'-- ',' ', text)
  text=re.sub(r"([?!¿])", r" \1 ", text)
  text=re.sub(r'-',' ', text)
  text=text.replace('/',' ')
  text=re.sub(r'\s+', ' ', text)
  text = re.sub('[^A-Za-z0-9.,]+', ' ', text)
  return text

In [6]:
n = 10000

train_article = []
train_article.extend(CNN_dataset_train["article"])
#train_article.extend(XSum_dataset["train"][0:n]["document"])
train_summary = []
train_summary.extend(CNN_dataset_train["highlights"])
#train_summary.extend(XSum_dataset["train"][0:n]["summary"])

dataset = pd.DataFrame()
dataset["article"] = train_article
dataset["highlights"] = train_summary

In [7]:
dataset["article"] = dataset["article"].apply(clean_data)
dataset["highlights"] = dataset["highlights"].apply(clean_data)
dataset

Unnamed: 0,article,highlights
0,"london, england reuters harry potter star dani...",harry potter star daniel radcliffe gets 20m fo...
1,"editor s note in our behind the scenes series,...",mentally ill inmates in miami are housed on th...
2,"minneapolis, minnesota cnn drivers who were on...","new i thought i was going to die, driver says ..."
3,washington cnn doctors removed five small poly...,five small polyps found during procedure none ...
4,cnn the national football league has indefini...,"new nfl chief, atlanta falcons owner critical ..."
...,...,...
5737,"miami, florida cnn forecasters issued a tropic...",bermuda on alert as hurricane bill gets closer...
5738,"kabul, afghanistan cnn afghanistan officials s...","26 people killed in election day violence, afg..."
5739,cnn ten climbers have died on two mountains i...,10 climbers have died on two mountains in nort...
5740,cnn kids dig in the sand at the beach all the...,"boy, 11, was digging a tunnel with friends on ..."


In [8]:
x = dataset['article']
y = dataset['highlights']

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [11]:
def readLangs(text, summary, reverse=False):
    print("Reading lines...")
    
    # Split every line into pairs and normalize
    pairs = [[text[i],summary[i]] for i in range(len(text))]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(summary)
        output_lang = Lang(text)
    else:
        input_lang = Lang(text)
        output_lang = Lang(summary)

    return input_lang, output_lang, pairs

In [12]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [13]:
input_lang, output_lang, pairs = prepareData( x, y , False)
print(random.choice(pairs))

Reading lines...
Read 5742 sentence pairs
Counting words...
Counted words:
0       london, england reuters harry potter star dani...
1       editor s note in our behind the scenes series,...
2       minneapolis, minnesota cnn drivers who were on...
3       washington cnn doctors removed five small poly...
4        cnn the national football league has indefini...
                              ...                        
5737    miami, florida cnn forecasters issued a tropic...
5738    kabul, afghanistan cnn afghanistan officials s...
5739     cnn ten climbers have died on two mountains i...
5740     cnn kids dig in the sand at the beach all the...
5741     cnn eight time gold medal winner at the beiji...
Name: article, Length: 5742, dtype: object 104240
0       harry potter star daniel radcliffe gets 20m fo...
1       mentally ill inmates in miami are housed on th...
2       new i thought i was going to die, driver says ...
3       five small polyps found during procedure none ...
4    

In [14]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

# Model Architecture

In [15]:
MAX_LENGTH = 1500

In [16]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [17]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [18]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Training

In [19]:
teacher_forcing_ratio = 0.5
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [20]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [21]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        if input_tensor.size(0) > 1500:
          continue
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [22]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [23]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [34]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [35]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 20000, print_every=1000)

23m 2s (- 437m 40s) (1000 5%) 5.3378
46m 19s (- 416m 56s) (2000 10%) 6.7561
69m 29s (- 393m 47s) (3000 15%) 6.9559
92m 15s (- 369m 3s) (4000 20%) 6.8865
115m 55s (- 347m 45s) (5000 25%) 6.9107
138m 40s (- 323m 34s) (6000 30%) 6.7843
162m 15s (- 301m 20s) (7000 35%) 6.7713
184m 47s (- 277m 10s) (8000 40%) 6.6752
208m 0s (- 254m 13s) (9000 45%) 6.5832
231m 3s (- 231m 3s) (10000 50%) 6.6425
253m 39s (- 207m 32s) (11000 55%) 6.5072
276m 58s (- 184m 39s) (12000 60%) 6.4818
299m 54s (- 161m 29s) (13000 65%) 6.5327
322m 58s (- 138m 24s) (14000 70%) 6.4218
345m 48s (- 115m 16s) (15000 75%) 6.3960
368m 45s (- 92m 11s) (16000 80%) 6.3305
392m 2s (- 69m 10s) (17000 85%) 6.2985
414m 21s (- 46m 2s) (18000 90%) 6.3019
437m 44s (- 23m 2s) (19000 95%) 6.2344
461m 1s (- 0m 0s) (20000 100%) 6.2194


In [36]:
evaluateRandomly(encoder1, attn_decoder1)

>  instyle.com the co star of 27 dresses discusses his personal style. i just feel dirty, says james marsden. not to be alarmed there is nothing indecent going on here. james marsden talks about his favorite kind of clothes and how his style has changed over the years. the 34 year old is explaining what it feels like to be sporting facial hair for a film he is working on with cameron diaz. yet even the newly grown whiskers cannot hide the finely chiseled features of an actor who seems to have cornered the hollywood market on hottie who loses the girl roles enchanted, superman returns and, of course, the notebook. but the oklahoma bred star does not mind his wholesome image. that s really me, he says. i m a little dorky awkward. however, his onscreen luck could be changing with his latest film, 27 dresses, a romantic comedy co starring katherine heigl. scruffy or not, we will be watching. you have played a prince in enchanted and the superhero cyclops in the x men movies. is that a big 

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [32]:
cd /content/drive/MyDrive/ECE1786

/content/drive/MyDrive/ECE1786


In [37]:
torch.save(encoder1.state_dict(), 'encoder.pt')
torch.save(attn_decoder1.state_dict(), 'decoder.pt')

In [None]:
encoder1.load_state_dict(torch.load('encoder.pt'))
attn_decoder1.load_state_dict(torch.load('decoder.pt'))
encoder1.eval()
attn_decoder1.eval()