## IMPORT

In [None]:
import re
import os
import ssl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

import nltk
# Work around SSL certificate errors for the NLTK download below.
# NOTE(review): this swaps ssl's default HTTPS context factory for the
# "unverified" one for the whole process, not only for the nltk download --
# confirm that is acceptable in the environment this notebook runs in.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build does not expose ssl._create_unverified_context; keep the default.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the stop-word corpus that nltk.corpus.stopwords reads from.
nltk.download('stopwords')
from nltk.corpus import stopwords
# !pip install contractions
import contractions
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import backend as K 
from tensorflow.keras.layers import Layer
from tensorflow.keras.optimizers import Adam
from tensorflow.python.framework.ops import disable_eager_execution

# !pip install tensorflow_hub
# import tensorflow_hub as hub
# from scipy import spatial
# Global TensorFlow switches applied before any model is built: request all
# intermediate outputs and turn eager execution off (graph mode).
# NOTE(review): presumably needed by the K.rnn-based AttentionLayer defined later -- confirm.
tf.compat.v1.experimental.output_all_intermediates(True)
disable_eager_execution()

In [None]:
# Read a sample of the train/test CSVs; column names are supplied explicitly
# through `names`.
path = "../data/"
column_names = ['Rating','Title','Review']
train_data = pd.read_csv(os.path.join(path, 'train.csv'), names=column_names, nrows=10000)
test_data = pd.read_csv(os.path.join(path, 'test.csv'), names=column_names, nrows=1000)

In [None]:
# Rows with missing values are few compared to the whole training set, so they
# are simply discarded; the index is then renumbered from 0.
train_data = train_data.dropna().reset_index(drop=True)

In [None]:
# Same treatment for the test set: discard rows with missing values and
# renumber the index from 0.
test_data = test_data.dropna().reset_index(drop=True)

In [None]:
# Report (rows, columns) of both frames after dropping incomplete rows.
for label, frame in (("TRAIN DATA", train_data), ("TEST DATA", test_data)):
    print(f"{label}: {frame.shape}")

In [None]:
# Preview the first five raw reviews, each followed by its title.
for row in range(5):
    print("Review #", row + 1)
    print(train_data.loc[row, 'Review'])
    print(train_data.loc[row, 'Title'])
    print()

In [None]:
# Remove unwanted characters and (optionally) stop words, and normalise the
# text so that fewer tokens end up without a pretrained word embedding.
stops = stopwords.words('english')
def clean_matter(matter, remove_stopwords = True, stops = stops):
    """Normalise one piece of review/title text.

    Steps: lower-case, expand contractions, strip URLs, a few HTML remnants
    and punctuation, then optionally drop stop words.

    Args:
        matter: Value to clean. It is passed through ``str()`` first, so
            non-string values are accepted.
        remove_stopwords: When True, every token contained in ``stops`` is dropped.
        stops: Iterable of stop words. Defaults to the NLTK English list that
            was loaded when this function was defined.

    Returns:
        The cleaned, lower-cased string. With ``remove_stopwords=True`` the
        tokens are re-joined with single spaces; otherwise the spacing left
        behind by the substitutions is kept as is.
    """
    # Convert words to lower case
    matter = str(matter).lower()

    # Replace contractions with their longer forms (token by token)
    matter = ' '.join([contractions.fix(word) for word in matter.split(" ")])

    # Remove URLs. Only the URL itself is dropped: the previous pattern
    # (`https?://.*`) also discarded all text following a URL on the same line.
    matter = re.sub(r'https?://\S+', '', matter)
    matter = re.sub(r'\<a href', ' ', matter)
    matter = re.sub(r'&amp;', '', matter)
    # `<br />` has to be handled BEFORE the punctuation pass below: that pass
    # replaces '/', after which this pattern could never match.
    matter = re.sub(r'<br />', ' ', matter)
    matter = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', matter)
    matter = re.sub(r'\'', ' ', matter)

    # Optionally, remove stop words
    if remove_stopwords:
        # A set gives constant-time membership tests instead of scanning the list per token.
        stop_set = set(stops)
        matter = " ".join(w for w in matter.split() if w not in stop_set)

    return matter

In [None]:
# Report (rows, columns) of both frames before the text is cleaned.
for label, frame in (("TRAIN DATA", train_data), ("TEST DATA", test_data)):
    print(f"{label}: {frame.shape}")

## Cleaning Training Data

In [None]:
# Titles keep their stop words, reviews have them removed.
train_data['Title'] = train_data['Title'].apply(clean_matter, remove_stopwords=False)
train_data['Review'] = train_data['Review'].apply(clean_matter, remove_stopwords=True)
# Wrap every title in the start/end marker tokens.
train_data['Title'] = '_START_ ' + train_data['Title'] + ' _END_'

# Show the first two cleaned title/review pairs.
for row in range(2):
    print('Title:', train_data.loc[row, 'Title'], 'Review:', train_data.loc[row, 'Review'], sep='\n')
    print()

In [None]:
# Number of whitespace-separated tokens in every cleaned training title / review.
Title_length = list(map(lambda text: len(text.split()), train_data.Title))
Review_length = list(map(lambda text: len(text.split()), train_data.Review))

In [None]:
# Side-by-side histograms of token counts for training titles and reviews.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
for axis, lengths, heading in (
    (ax1, Title_length, "Words in Titles"),
    (ax2, Review_length, "Words in Reviews"),
):
    axis.hist(lengths, bins=20)
    axis.title.set_text(heading)
plt.show()

## Cleaning Testing Data

In [None]:
# Same cleaning as for the training set: titles keep stop words, reviews do not.
test_data['Title'] = test_data['Title'].apply(clean_matter, remove_stopwords=False)
test_data['Review'] = test_data['Review'].apply(clean_matter, remove_stopwords=True)
# Wrap every title in the start/end marker tokens.
test_data['Title'] = '_START_ ' + test_data['Title'] + ' _END_'

# Show the first two cleaned title/review pairs.
for row in range(2):
    print('Title:', test_data.loc[row, 'Title'], 'Review:', test_data.loc[row, 'Review'], sep='\n')
    print()

In [None]:
# Number of whitespace-separated tokens in every cleaned test title / review.
Title_length_test = list(map(lambda text: len(text.split()), test_data.Title))
Review_length_test = list(map(lambda text: len(text.split()), test_data.Review))

In [None]:
# Side-by-side histograms of token counts for test titles and reviews.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
for axis, lengths, heading in (
    (ax1, Title_length_test, "Words in Title"),
    (ax2, Review_length_test, "Words in Review"),
):
    axis.hist(lengths, bins=20)
    axis.title.set_text(heading)
plt.show()

In [None]:
# Report (rows, columns) of both frames once the text has been cleaned.
for label, frame in (("TRAIN DATA", train_data), ("TEST DATA", test_data)):
    print(f"{label}: {frame.shape}")

## GLOVE EMBEDDING

In [None]:
# import pickle
glove_size = 300
# with open('../data/glove.840B.300d.pkl', 'rb') as fp:
#     glove = pickle.load(fp)

# Parse the plain-text GloVe file into {token: float32 vector of length glove_size}.
glove = dict()
# `with` closes the file even when a line fails to parse.
with open(os.path.join(path, 'glove.840B.300d.txt'), encoding='utf-8') as f:
    for i, line in enumerate(f, start=1):
        # Split from the right: the last `glove_size` fields are the vector and
        # everything before them is the token. The 840B file reportedly contains
        # a few tokens with embedded spaces; a plain left split would push part
        # of such a token into the vector and make the float conversion fail.
        values = line.rstrip('\n').rsplit(' ', glove_size)
        if i < 5:
            # Preview of the first four parsed lines.
            print(values)
        if len(values) != glove_size + 1:
            # Blank or malformed line: skip it rather than store a vector of the wrong size.
            continue
        glove[values[0]] = np.asarray(values[1:], dtype='float32')

In [None]:
# Measure how much of the review vocabulary (train + test) has a GloVe vector,
# and collect the vectors of the covered words.
words_source_ALL = []
for frame in (train_data, test_data):
    for review in frame['Review']:
        words_source_ALL.extend(review.split(' '))

print("TOTAL WORDS: ", len(words_source_ALL))

words_source_ALL = set(words_source_ALL)
print("UNIQUE WORDS: ", len(words_source_ALL))

# The GloVe key set is built once and reused for both the intersection and the lookup below.
words_glove = set(glove.keys())
inter_words = words_glove.intersection(words_source_ALL)
coverage_percent = np.round((float(len(inter_words))/len(words_source_ALL))*100)
print("WORDS COMMON IN GLOVE AND CORPUS: {} = {}% ".format(len(inter_words), coverage_percent))

words_corpus_source_ALL = {
    word: glove[word] for word in words_source_ALL if word in words_glove
}
print("LENGTH OF WORD2VEC: ", len(words_corpus_source_ALL))

In [None]:
def num(text):
  """Return how many whitespace-separated tokens of `text` are not in `inter_words`."""
  return sum(1 for token in text.split() if token not in inter_words)

# Per-review count of tokens that are missing from `inter_words`.
train_data['unique'] = train_data['Review'].apply(num)

In [None]:
# Keep only the reviews whose 'unique' count (tokens outside `inter_words`) is
# below 4, then renumber the index from 0.
train_data = train_data[train_data['unique'] < 4].reset_index(drop=True)

In [None]:
# Padding lengths: the longest review / title (in tokens) over train and test.
# NOTE(review): the *_length lists were computed before train_data was filtered
# on 'unique', so they may still include rows that are no longer in train_data.
max_length_x = max([*Review_length, *Review_length_test])
max_length_y = max([*Title_length, *Title_length_test])

In [None]:
# Store the text columns with pandas' dedicated "string" dtype
# (test frame first, then train; Review before Title).
for frame in (test_data, train_data):
    for column in ('Review', 'Title'):
        frame[column] = pd.Series(frame[column], dtype="string")

In [None]:
# Source-side tokenizer, fitted on every sentence available: reviews and
# titles from both the train and the test frame.
all_sentences = [
    *train_data.Review.tolist(),
    *train_data.Title.tolist(),
    *test_data.Review.tolist(),
    *test_data.Title.tolist(),
]

x_t = Tokenizer()
x_t.fit_on_texts(all_sentences)
# One more than the number of known words.
x_vocab_size = 1 + len(x_t.word_index)

# Reviews -> integer sequences.
encoded_xtrain = x_t.texts_to_sequences(train_data['Review'])
encoded_xtest = x_t.texts_to_sequences(test_data['Review'])

# Pad at the end ('post') up to the longest review.
padded_xtrain = pad_sequences(encoded_xtrain, padding='post', maxlen=max_length_x)
padded_xtest = pad_sequences(encoded_xtest, padding='post', maxlen=max_length_x)

In [None]:
# Target-side tokenizer, fitted on titles only (train + test).
# The decoding code later looks the markers up as 'start' / 'end', i.e. the
# tokenizer is expected to turn '_START_' / '_END_' into those words.
all_y_sentences = [*train_data.Title.tolist(), *test_data.Title.tolist()]

y_t = Tokenizer()
y_t.fit_on_texts(all_y_sentences)
# One more than the number of known words.
y_vocab_size = 1 + len(y_t.word_index)

# Titles -> integer sequences.
encoded_ytrain = y_t.texts_to_sequences(train_data['Title'])
encoded_ytest = y_t.texts_to_sequences(test_data['Title'])

# Pad at the end ('post') up to the longest title.
padded_ytrain = pad_sequences(encoded_ytrain, padding='post', maxlen=max_length_y)
padded_ytest = pad_sequences(encoded_ytest, padding='post', maxlen=max_length_y)

In [None]:
print(f'LOADED {len(glove)} WORD VECTORS.')

# Row i holds the GloVe vector of the word whose x_t index is i; words without
# a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((x_vocab_size, glove_size))
for word, i in x_t.word_index.items():
    embedding_vector = glove.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# LSTM Seq2Seq Model With ATTENTION Layer

In [None]:
class AttentionLayer(Layer):
    """Attention over encoder outputs, driven by decoder outputs.

    The layer is called with ``[encoder_out_seq, decoder_out_seq]``. For each
    decoder step it scores every encoder step with
    ``softmax(tanh(enc . W_a + dec . U_a) . V_a)`` and returns
    ``(context_vectors, attention_weights)``, where a context vector is the
    attention-weighted sum of the encoder outputs.
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the three trainable weights.

        ``input_shape`` is indexed as ``[encoder_shape, decoder_shape]``; index 2
        of each is used as that sequence's hidden size.
        """

        # (enc_hidden, enc_hidden): applied to the encoder outputs.
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # (dec_hidden, enc_hidden): applied to the decoder output of the current step.
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # (enc_hidden, 1): reduces each combined vector to a single score.
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((input_shape[0][2], 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs):
        """Return ``(c_outputs, e_outputs)`` for ``inputs = [encoder_out_seq, decoder_out_seq]``."""
        encoder_out_seq, decoder_out_seq = inputs

        def energy_step(inputs, states):
            """Step function for K.rnn: attention weights for one decoder step.

            ``inputs`` is the decoder output of the current step; ``states`` is
            not used. Returns the softmax-normalised scores over encoder steps.
            """

            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
            de_hidden = inputs.shape[-1]

            # Flatten the encoder outputs to 2-D for the matrix product, then restore (-1, en_seq_len, en_hidden).
            reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))
            # Extra axis at position 1 so the decoder term broadcasts over the encoder steps.
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  

            reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
            # One score per encoder step, normalised with softmax.
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
            e_i = K.softmax(e_i)

            return e_i, [e_i]

        def context_step(inputs, states):
            """Step function for K.rnn: weighted sum of encoder outputs.

            ``inputs`` are the attention weights of one decoder step; ``states`` is not used.
            """
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            return c_i, [c_i]

        def create_inital_state(inputs, hidden_size):
            """Build an all-zero tensor with ``hidden_size`` columns and one row per row of ``inputs``.

            Used only to give K.rnn an initial state; both step functions ignore their states.
            """

            fake_state = K.zeros_like(inputs)  
            fake_state = K.sum(fake_state, axis=[1, 2])  # collapse axes 1 and 2
            fake_state = K.expand_dims(fake_state)  # add a trailing axis of size 1
            fake_state = K.tile(fake_state, [1, hidden_size])  # repeat it hidden_size times
            return fake_state

        # Initial states sized like the outputs of context_step and energy_step respectively.
        fake_state_c = create_inital_state(encoder_out_seq, encoder_out_seq.shape[-1])
        fake_state_e = create_inital_state(encoder_out_seq, encoder_out_seq.shape[1])  

        # Attention weights for every step of the decoder sequence.
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        # Context vectors computed from those attention weights.
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )
        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """Shapes of the two outputs, in the order returned by ``call``.

        NOTE(review): context_step sums encoder outputs, so the context vectors
        have the encoder's hidden size, while the first shape below reports the
        decoder's (input_shape[1][2]); the two agree only when both sizes are equal.
        """
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]


# Hidden size shared by the encoder and decoder LSTMs.
latent_dim = 64

K.clear_session() 

# Encoder: padded review indices -> frozen GloVe embeddings -> LSTM.
encoder_inputs = Input(shape=(max_length_x,)) 
enc_emb = Embedding(x_vocab_size, glove_size, weights=[embedding_matrix],input_length=max_length_x, trainable=False)(encoder_inputs) 

#LSTM 
# return_sequences=True keeps the per-step outputs, which the attention layer consumes.
encoder_lstm = LSTM(latent_dim, return_state=True, return_sequences=True) 
encoder_outputs, state_h, state_c= encoder_lstm(enc_emb) 

# Decoder. 
decoder_inputs = Input(shape=(None,)) 
# NOTE(review): during training the decoder is fed title sequences encoded with
# y_t, but this embedding is sized by x_vocab_size and initialised from
# embedding_matrix, whose rows follow x_t.word_index. Row k therefore holds the
# vector of x_t's word k, not y_t's word k. Confirm whether this is intended;
# fixing it changes the weight shapes and invalidates previously saved weights.
dec_emb_layer = Embedding(x_vocab_size, glove_size, weights=[embedding_matrix],input_length=max_length_x, trainable=False) 
dec_emb = dec_emb_layer(decoder_inputs) 

#LSTM using encoder_states as initial state
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True) 
# The two extra return values are the decoder LSTM's states (the encoder call
# above names the same positions state_h and state_c); they are not used in training.
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c]) 

#Attention Layer
attn_layer = AttentionLayer(name='attention_layer') 
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs]) 

# Each decoder output is concatenated with its attention context before the
# per-step softmax over the title vocabulary.
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])
decoder_dense = TimeDistributed(Dense(y_vocab_size, activation='softmax')) 
decoder_outputs = decoder_dense(decoder_concat_input) 

model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 
print(model.summary())

In [None]:
# Targets are integer word indices, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', experimental_run_tf_function=False)

checkpoint_filepath = '../model/model.{epoch:02d}-{val_loss:.2f}.h5'

# Checkpoint the weights at the end of an epoch, but only when val_loss improved.
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    save_freq="epoch",
)

# Stop once val_loss has failed to improve for one epoch.
es = EarlyStopping(monitor='val_loss', mode='min', patience=1, verbose=1)

# Decoder input is the title without its last position; the target is the
# title without its first position, with a trailing axis of size 1.
decoder_input_train = padded_ytrain[:, :-1]
decoder_target_train = padded_ytrain[:, 1:, np.newaxis]

history = model.fit(
    [padded_xtrain, decoder_input_train],
    decoder_target_train,
    epochs=10,
    batch_size=128,
    validation_split=0.1,
    callbacks=[es, model_checkpoint_callback],
)

In [None]:
# Rebuild the training model from the existing graph tensors and restore saved weights.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# change path to new model if any issue
# NOTE(review): the training cell writes checkpoints named
# 'model.{epoch}-{val_loss}.h5'; no visible code produces 'summ_model.h5', so it
# presumably has to be copied/renamed by hand -- confirm.
model.load_weights("../model/summ_model.h5")

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [None]:
reverse_target_word_index = y_t.index_word 
reverse_source_word_index = x_t.index_word 
target_word_index = y_t.word_index

In [None]:
# Inference encoder: padded review -> per-step encoder outputs plus the final LSTM states.
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])
# Inputs through which the inference decoder receives the LSTM states and the encoder outputs.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(max_length_x,latent_dim))

# Reuse the trained embedding and LSTM layers, now started from the supplied states.
dec_emb2= dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# Same attention layer as in training, fed with the supplied encoder outputs.
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# Per-step distribution over the title vocabulary (same dense layer as in training).
decoder_outputs2 = decoder_dense(decoder_inf_concat)

# Inference decoder: (token(s), encoder outputs, h, c) -> (word distribution, new h, new c).
decoder_model = Model(
[decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
[decoder_outputs2] + [state_h2, state_c2])

In [None]:
def decode_sequence(input_seq):
    """Greedily decode a title for one padded review sequence.

    Args:
        input_seq: Array holding ``max_length_x`` source token indices; it is
            reshaped to ``(1, max_length_x)`` before being encoded.

    Returns:
        The generated words, each preceded by a single space (so a non-empty
        result starts with a space). At most ``max_length_y - 1`` words are
        produced; generation stops early at the 'end' token or when the model
        emits an index that has no word (e.g. padding index 0).

    Raises:
        KeyError: if the target vocabulary contains no 'start' token.
    """
    input_seq = input_seq.reshape(1, max_length_x)
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # The decoder is seeded with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['start']

    decoded_tokens = []
    while len(decoded_tokens) < (max_length_y - 1):
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Index 0 is only used for padding and has no entry in the index->word
        # table; a direct lookup raised KeyError whenever the model predicted it.
        # It is treated like the end of the title instead.
        sampled_token = reverse_target_word_index.get(sampled_token_index)
        if sampled_token is None or sampled_token == 'end':
            break
        decoded_tokens.append(sampled_token)

        # Feed the chosen token and the updated LSTM states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return ''.join(' ' + token for token in decoded_tokens)

In [None]:
def seq2summary(input_seq):
    """Turn a target index sequence into text.

    Padding (0) and the 'start' / 'end' marker indices are skipped; every
    remaining word is followed by one space.
    """
    kept_words = [
        reverse_target_word_index[i]
        for i in input_seq
        if i != 0 and i != target_word_index['start'] and i != target_word_index['end']
    ]
    return ''.join(word + ' ' for word in kept_words)

def seq2text(input_seq):
    """Turn a source index sequence into text.

    Padding (0) is skipped; every remaining word is followed by one space.
    """
    return ''.join(reverse_source_word_index[i] + ' ' for i in input_seq if i != 0)

In [None]:
# Re-read the raw (uncleaned) test rows to show them next to the model output.
# padded_xtest / padded_ytest were built after dropna() + reset_index(), so the
# same filtering is applied here; otherwise a row with a missing value would
# shift the raw text relative to padded_xtest[i]. A separate name is used so
# the cleaned `test_data` frame is not overwritten.
raw_test_data = pd.read_csv(os.path.join(path, 'test.csv'), names = ['Rating','Title','Review'], nrows = 100)
raw_test_data = raw_test_data.dropna().reset_index(drop=True)
for i in range(10):
    print('Original Review:', raw_test_data.loc[i, 'Review'])
    print("Review:",seq2text(padded_xtest[i]))
    print("Original summary:",seq2summary(padded_ytest[i]))
    print("Predicted summary:",decode_sequence(padded_xtest[i]))
    print("\n")

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def BLEU_Score(y_test, y_pred):
    """BLEU score of the generated title against the reference title.

    Args:
        y_test: Padded target (title) index sequence; used as the reference.
        y_pred: Padded source (review) index sequence. Despite its name this
            is the model *input*: it is decoded here to obtain the candidate.

    Returns:
        The value returned by ``corpus_bleu`` for this single reference/candidate pair.
    """
    # split() without an argument is required here: seq2summary output ends with
    # a space and decode_sequence output starts with one, so split(" ") added an
    # empty-string token to both sides and distorted the n-gram counts.
    references = [[seq2summary(y_test).split()]]
    candidates = [decode_sequence(y_pred.reshape(1,max_length_x)).split()]
    return corpus_bleu(references, candidates)

# Score at most 500 test examples, never more than are available.
n_scored = min(500, len(padded_xtest))
scores = [BLEU_Score(padded_ytest[i], padded_xtest[i]) for i in range(n_scored)]

print(np.mean(scores))