In [1]:
import pandas as pd
import numpy as np
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from sklearn.model_selection import train_test_split
from tqdm import tqdm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

if device.type == "cuda":
    print("Using GPU")
else:
    print("Using CPU")


Using GPU


In [2]:
dataset = pd.read_csv('news_summary.csv', encoding = 'latin1')
dataset.head()

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & AI w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [3]:
dataset.tail()

Unnamed: 0,headlines,text
98396,CRPF jawan axed to death by Maoists in Chhatti...,A CRPF jawan was on Tuesday axed to death with...
98397,First song from Sonakshi Sinha's 'Noor' titled...,"'Uff Yeh', the first song from the Sonakshi Si..."
98398,'The Matrix' film to get a reboot: Reports,"According to reports, a new version of the 199..."
98399,Snoop Dogg aims gun at clown dressed as Trump ...,A new music video shows rapper Snoop Dogg aimi...
98400,Madhesi Morcha withdraws support to Nepalese g...,"Madhesi Morcha, an alliance of seven political..."


In [4]:
# Preprocessing function
def preprocess_text(df, col):
    # converting language data in data frame to lower case and then storing in sentence variable
    sentence = df[col].str.lower()
    sentence = sentence.str.replace('[^0-9A-Za-z\s]+', '', regex=True)
    sentence = sentence.str.normalize('NFD')
    #encoding the string in sentence in UTF-8 format and ignoring errors if any
    sentence = sentence.str.encode('ascii', errors='ignore').str.decode('utf-8')
    return sentence

  sentence = sentence.str.replace('[^0-9A-Za-z\s]+', '', regex=True)


In [5]:
dataset['headlines'] = preprocess_text(dataset, 'headlines')
dataset['text'] = preprocess_text(dataset, 'text')

In [6]:
dataset.tail()

Unnamed: 0,headlines,text
98396,crpf jawan axed to death by maoists in chhatti...,a crpf jawan was on tuesday axed to death with...
98397,first song from sonakshi sinhas noor titled uf...,uff yeh the first song from the sonakshi sinha...
98398,the matrix film to get a reboot reports,according to reports a new version of the 1999...
98399,snoop dogg aims gun at clown dressed as trump ...,a new music video shows rapper snoop dogg aimi...
98400,madhesi morcha withdraws support to nepalese g...,madhesi morcha an alliance of seven political ...


In [7]:
PAD_token = 0
SOS_token = 1
EOS_token = 2

# make the token 1 and 2 ,0 is already reserved for the [pad]
class Vocab:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0:'PAD',1: "SOS", 2: "EOS"}
        self.n_words = 3  # Count SOS and EOS

    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [8]:
#create vocab instance
vocab = Vocab()

_ = dataset.text.apply(lambda x: vocab.add_sentence(x))
_ = dataset.headlines.apply(lambda x: vocab.add_sentence(x))

In [9]:
vocab.n_words

120908

In [10]:
#calculate and store the length of each text and headline
dataset['text_length'] = dataset.text.str.split(' ').apply(lambda x: len(x))
dataset['headlines_length'] = dataset.headlines.str.split(' ').apply(lambda x: len(x))

In [11]:
dataset.headlines_length.max(), dataset.text_length.max()

(np.int64(18), np.int64(92))

In [12]:
MAX_LENGTH_INPUT = 100
MAX_LENGTH_TARGET = 20

In [13]:
# function to convert a sentence into a list of indices based on the vocabulary
def indexes_from_sentence(vocab, sentence):
    return [vocab.word2index[word] for word in sentence.split(' ')]

In [14]:
# convert a sentence into a list of indices, and then appends the EOS_token
def tensor_from_sentence(vocab, sentence):
    indexes = indexes_from_sentence(vocab, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)


In [15]:
# prepare dataloader
def get_dataloader(dataset, batch_size):
    n = dataset.shape[0]
    input_ids = np.zeros((n, MAX_LENGTH_INPUT), dtype=np.int64)
    target_ids = np.zeros((n, MAX_LENGTH_TARGET), dtype=np.int64)

    for idx in range(n):
        inp_ids = indexes_from_sentence(vocab, dataset.text.iloc[idx])
        tgt_ids = indexes_from_sentence(vocab, dataset.headlines.iloc[idx])

        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)

        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    return train_dataloader

In [16]:
#define encoder class
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        # Make LSTM bidirectional
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        # When LSTM is bidirectional, the output, hidden and cell state will be for both directions
        output, (hidden, cell) = self.lstm(embedded)
        return output, (hidden, cell)

In [17]:
#initialize encoder
enc = Encoder(100, 64)
print(enc)

Encoder(
  (embedding): Embedding(100, 64)
  (lstm): LSTM(64, 64, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


In [18]:
#creating simulated input
x = torch.randint(1, 100, (1, 61))

In [19]:
#pass tensor x through encoder model
enc_outputs, enc_hidden = enc.forward(x)

In [20]:
enc_hidden_h_shape = enc_hidden[0].shape
enc_hidden_c_shape = enc_hidden[1].shape
enc_outputs_shape = enc_outputs.shape

print("enc_outputs_shape:", enc_outputs_shape)
print("enc_hidden_h_shape:", enc_hidden_h_shape)
print("enc_hidden_c_shape:", enc_hidden_c_shape)

enc_outputs_shape: torch.Size([1, 61, 128])
enc_hidden_h_shape: torch.Size([2, 1, 64])
enc_hidden_c_shape: torch.Size([2, 1, 64])


In [21]:
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size * 2  # Adjust hidden size if states are concatenated
        self.embedding = nn.Embedding(output_size, hidden_size)
        # Assuming concatenation of hidden states, adjust LSTM input size
        self.lstm = nn.LSTM(hidden_size, self.hidden_size, batch_first=True)
        self.out = nn.Linear(self.hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)

        # Combine or adapt encoder_hidden to suit unidirectional decoder
        encoder_hidden = self.adapt_hidden(encoder_hidden)

        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH_TARGET):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)

        return decoder_outputs, decoder_hidden

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)
        output = self.out(output)
        return output, hidden

    def adapt_hidden(self, hidden):
        # Assuming hidden is a tuple (hidden_state, cell_state) each with dimensions [2, batch_size, hidden_size]
        hidden_state, cell_state = hidden
        # Concatenate the forward and backward states
        hidden_state = torch.cat((hidden_state[0:hidden_state.size(0):2], hidden_state[1:hidden_state.size(0):2]), dim=2)
        cell_state = torch.cat((cell_state[0:cell_state.size(0):2], cell_state[1:cell_state.size(0):2]), dim=2)
        return (hidden_state, cell_state)


In [22]:
#generate random target tensors
tgt_tensor = torch.randint(1, 100, (1, 20))
tgt_tensor.shape

torch.Size([1, 20])

In [23]:
#initialise the decoder class
dec = Decoder(64, 100)

In [24]:
# Assuming you have a device variable defined
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move your models to the designated device
encoder = enc.to(device)
decoder = dec.to(device)


In [25]:
# When you load or create tensors, send them to the same device
enc_outputs = enc_outputs.to(device)
(h,c) = enc_hidden
enc_hidden_gpu=h.to(device),c.to(device)
tgt_tensor = tgt_tensor.to(device)


In [26]:
#execute forward pass of decoder instance
decoder_outputs, decoder_hidden= dec.forward(enc_outputs, enc_hidden_gpu, tgt_tensor)

In [27]:
decoder_hidden_h_shape = decoder_hidden[0].shape
decoder_hidden_c_shape = decoder_hidden[1].shape
decoder_outputs_shape = decoder_outputs.shape

print("decoder_outputs_shape:", decoder_outputs_shape)
print("decoder_hidden_h_shape:", decoder_hidden_h_shape)
print("decoder_hidden_c_shape:", decoder_hidden_c_shape)

decoder_outputs_shape: torch.Size([1, 20, 100])
decoder_hidden_h_shape: torch.Size([1, 1, 128])
decoder_hidden_c_shape: torch.Size([1, 1, 128])


In [28]:
train_dataset, test_dataset = train_test_split(dataset, shuffle=True, test_size=0.2, random_state=42)
train_dataset, val_dataset = train_test_split(train_dataset, shuffle=True, test_size=0.1, random_state=42)

print(f"Train set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Train set size: 70848
Validation set size: 7872
Test set size: 19681


In [29]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    total_loss = 0
    for input_tensor, target_tensor in tqdm(dataloader):
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

def evaluate_model(dataloader, encoder, decoder, criterion):
    total_loss = 0
    with torch.no_grad():
        for input_tensor, target_tensor in tqdm(dataloader):
            encoder_outputs, encoder_hidden = encoder(input_tensor)
            decoder_outputs, decoder_hidden = decoder(encoder_outputs, encoder_hidden, target_tensor)

            loss = criterion(
                decoder_outputs.view(-1, decoder_outputs.size(-1)),
                target_tensor.view(-1)
            )
            total_loss += loss.item()

    return total_loss / len(dataloader)

def train_model(train_dataloader, valid_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
                print_every=100, plot_every=100):
    print_loss_total = 0  # Reset every print_every
    train_losses = []
    valid_losses = []

    best_val_loss = float('inf')

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        print(f"Epoch: {epoch}/{n_epochs}")
        # Training
        train_loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer,
                                 decoder_optimizer, criterion)
        print_loss_total += train_loss
        train_losses.append(train_loss)

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(f"Train Loss: {round(print_loss_avg, 3)}")

        # Validation
        print('Validation....')
        valid_loss = evaluate_model(valid_dataloader, encoder, decoder, criterion)
        valid_losses.append(valid_loss)
        print(f"Validation Loss: {round(valid_loss, 3)}")

        # Save the model if it has the best validation loss so far
        if valid_loss < best_val_loss:
            best_val_loss = valid_loss
            torch.save(encoder.state_dict(), 'best_encoder.pth')
            torch.save(decoder.state_dict(), 'best_decoder.pth')
            print(f"Saved Best Model at Epoch: {epoch}")

    return train_losses, valid_losses


In [30]:
hidden_size = 256
batch_size = 64
n_epochs =10
print('Making DataLoaders .... .....  ')
train_dataloader = get_dataloader(train_dataset, batch_size)
val_dataloader=get_dataloader(val_dataset,batch_size)
print('Defining Encoder and Decoder .....')
encoder = Encoder(vocab.n_words, hidden_size).to(device)
decoder = Decoder(hidden_size, vocab.n_words).to(device)

train_model(train_dataloader, val_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=1, plot_every=100)

Making DataLoaders .... .....  
Defining Encoder and Decoder .....
Epoch: 1/10


100%|██████████| 1107/1107 [05:00<00:00,  3.69it/s]


Train Loss: 3.731
Validation....


100%|██████████| 123/123 [00:05<00:00, 22.83it/s]


Validation Loss: 3.243
Saved Best Model at Epoch: 1
Epoch: 2/10


100%|██████████| 1107/1107 [04:53<00:00,  3.77it/s]


Train Loss: 2.86
Validation....


100%|██████████| 123/123 [00:05<00:00, 21.33it/s]


Validation Loss: 2.867
Saved Best Model at Epoch: 2
Epoch: 3/10


100%|██████████| 1107/1107 [05:00<00:00,  3.68it/s]


Train Loss: 2.322
Validation....


100%|██████████| 123/123 [00:05<00:00, 21.38it/s]


Validation Loss: 2.711
Saved Best Model at Epoch: 3
Epoch: 4/10


100%|██████████| 1107/1107 [04:59<00:00,  3.69it/s]


Train Loss: 1.902
Validation....


100%|██████████| 123/123 [00:05<00:00, 22.46it/s]


Validation Loss: 2.654
Saved Best Model at Epoch: 4
Epoch: 5/10


100%|██████████| 1107/1107 [04:59<00:00,  3.70it/s]


Train Loss: 1.578
Validation....


100%|██████████| 123/123 [00:05<00:00, 21.68it/s]


Validation Loss: 2.652
Saved Best Model at Epoch: 5
Epoch: 6/10


100%|██████████| 1107/1107 [05:02<00:00,  3.67it/s]


Train Loss: 1.343
Validation....


100%|██████████| 123/123 [00:05<00:00, 21.69it/s]


Validation Loss: 2.671
Epoch: 7/10


100%|██████████| 1107/1107 [04:57<00:00,  3.72it/s]


Train Loss: 1.166
Validation....


100%|██████████| 123/123 [00:05<00:00, 20.98it/s]


Validation Loss: 2.719
Epoch: 8/10


100%|██████████| 1107/1107 [04:52<00:00,  3.78it/s]


Train Loss: 1.025
Validation....


100%|██████████| 123/123 [00:05<00:00, 21.86it/s]


Validation Loss: 2.767
Epoch: 9/10


100%|██████████| 1107/1107 [05:00<00:00,  3.69it/s]


Train Loss: 0.908
Validation....


100%|██████████| 123/123 [00:05<00:00, 21.39it/s]


Validation Loss: 2.829
Epoch: 10/10


100%|██████████| 1107/1107 [04:59<00:00,  3.69it/s]


Train Loss: 0.808
Validation....


100%|██████████| 123/123 [00:05<00:00, 21.35it/s]

Validation Loss: 2.875





([3.7307284377993177,
  2.8603443285314047,
  2.3215775067674866,
  1.902180323325091,
  1.5775539711346578,
  1.3429089572479194,
  1.1661324346507038,
  1.0251278963407742,
  0.9077241382517035,
  0.807822693761879],
 [3.2432793233452775,
  2.866821323953024,
  2.7105596317508356,
  2.654367452714501,
  2.651861826578776,
  2.6714988685235745,
  2.719182274205898,
  2.7670969439715876,
  2.8292864842143484,
  2.875345516980179])

In [31]:
def evaluate_test_samples(encoder, decoder, sentence, vocab):
    with torch.no_grad():
        input_tensor = tensor_from_sentence(vocab, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(vocab.index2word[idx.item()])

    return decoded_words

In [32]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
def evaluateRandomly_train(encoder, decoder, vocab, n=10):
    for i in range(n):
        print(i)
        eval_sample = train_dataset.iloc[i:i+1, :]
        print('news_article > ', eval_sample['text'].iloc[0])
        headline = eval_sample['headlines'].iloc[0]
        print('original_headline = ', headline)
        output_words = evaluate_test_samples(encoder, decoder, eval_sample.text.iloc[0], vocab)
        output_sentence = ' '.join(output_words)
        print('predicted_headline < ', output_sentence)
        print(f"meteor score: {nltk.translate.meteor_score.single_meteor_score(headline.split(), output_sentence.split())}")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\khaai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khaai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\khaai\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [33]:
encoder.eval()
decoder.eval()

evaluateRandomly_train(encoder, decoder, vocab)

0
news_article >  a study by financial services company ubs has revealed a person working in mumbai has to work 1147 days to afford the iphone x while a delhi person has to work 1005 days for the same a cairo person has to work the most for 1332 days while a person in zurich can afford iphone x in only 47 days
original_headline =  how many days one has to work around the world to buy iphone x
predicted_headline <  iphone x days <EOS>
meteor score: 0.19658119658119655
1
news_article >  catalonias independence from spain would not enjoy international recognition france said on monday ahead of catalan regional governments announcement of last weeks independence vote result this crisis needs to be resolved through dialogue at all levels of spanish politics france urged earlier catalonia had claimed that 90 of the participants voted in favour of independence
original_headline =  catalan independence would not be recognised france
predicted_headline <  catalan spain <EOS>
meteor score: 0.075

In [34]:
def evaluateRandomly_test(encoder, decoder, vocab, n=10):
    for i in range(n):
        print(i)
        eval_sample = test_dataset.iloc[i:i+1, :]
        print('news_article > ', eval_sample['text'].iloc[0])
        headline = eval_sample['headlines'].iloc[0]
        print('original_headline = ', headline)
        output_words = evaluate_test_samples(encoder, decoder, eval_sample.text.iloc[0], vocab)
        output_sentence = ' '.join(output_words)
        print('predicted_headline < ', output_sentence)
        print('')
        print(f"'meteor score:' {nltk.translate.meteor_score.single_meteor_score(headline.split(), output_sentence.split())}")

In [35]:
encoder.eval()
decoder.eval()

evaluateRandomly_test(encoder, decoder, vocab)

0
news_article >  students in karnataka will get extra marks if their parents cast votes in the upcoming assembly elections the associated management of primary and secondary schools has announced the encouraging marks will be added in the 201819 academic year the association said after casting their votes parents can visit member schoolsand confirm that they voted by showing the indelible ink mark
original_headline =  ktaka students to get extra marks if parents vote in polls
predicted_headline <  ktaka parents to be displayed <EOS>

'meteor score:' 0.14285714285714285
1
news_article >  syrian antiaircraft defences on monday shot down missiles over two air bases syrias state media said the missiles targeted shayrat air base in the homs province and another base northeast of the capital damascus this comes days after the us uk and france launched air strikes on syrian chemical weapons facilities in retaliation for the alleged chemical attack in douma
original_headline =  syria shoots d