This code was compiled by Matthew Demeter, Duncan Boynton, and Grant Salzsiedler, with help from the sources listed below, which provided large snippets of code, insight into how to train an LSTM autoencoder, and more. Most of this code is not original; it is assembled from bits and pieces of these articles with minor tweaks. Our sole objectives were to create a working model and to learn more about how these models work, how they are trained, and what the key challenges are in developing and testing them.

https://www.analyticsvidhya.com/blog/2019/06/comprehensive-guide-text-summarization-using-deep-learning-python/

https://blog.paperspace.com/introduction-to-seq2seq-models/

https://blog.paperspace.com/implement-seq2seq-for-text-summarization-keras/

https://towardsdatascience.com/text-summarization-from-scratch-using-encoder-decoder-network-with-attention-in-keras-5fa80d12710e?gi=f9cd15db2ccd

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; later cells
# read the dataset pickle from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from attention import AttentionLayer

In [None]:
import pickle

# Load the pickled stories from the mounted Drive folder.  The file is opened
# in a `with` block so the handle is closed as soon as loading finishes.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# only load a pickle that you produced yourself or otherwise trust.
with open('/content/drive/MyDrive/cnn_dataset.pkl', 'rb') as dataset_file:
    stories = pickle.load(dataset_file)
print('Loaded Stories %d' % len(stories))

In [None]:
# required for rest of notebook to run
# primarily using Keras and Tensorflow libraries

!pip install tf
import pandas as pd
%matplotlib inline
from matplotlib import pyplot
import numpy as np
import random
import requests as rq
import sys
import io
from bs4 import BeautifulSoup
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers import Input, Embedding, TimeDistributed, RepeatVector, Concatenate
from keras.models import Model
from keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import RMSprop
from collections import Counter
from tensorflow.keras.callbacks import EarlyStopping
from datetime import datetime
import keras
import keras.callbacks
from keras.callbacks import TensorBoard
import tensorflow as tf
tf.test.gpu_device_name()
%load_ext tensorboard

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
# Sanity check: show how many story records were loaded.
print(len(stories))

In [None]:
# Keep a deep copy of the records as loaded, so the unflattened version is
# still available after `stories` is modified in place below.
import copy
stories2 = copy.deepcopy(stories)

# Flatten every record in place:
#   * 'story' becomes a single string of its parts joined by spaces;
#   * 'highlights' becomes its first entry only, which is used as the summary.
# The earlier version also joined all highlights into one string, but that
# value was overwritten with the first highlight right away, so the join has
# been dropped.
# NOTE(review): assumes every record has at least one highlight; an empty
# 'highlights' list raises IndexError here (as it did before).
for story in stories:
  story['story'] = " ".join(story['story'])
  story['highlights'] = story['highlights'][0]

# Parallel lists: the text of each article and its summary.  In this dataset
# the "highlights" serve as the summaries of the story.
articles = [story['story'] for story in stories]
sums = [story['highlights'] for story in stories]

In [None]:
# Report the fraction of articles that contain at most 80 whitespace-separated
# words.  The 80 is an arbitrary probe value and can be changed, but keeping
# the article length low is recommended.
cnt = sum(1 for article in articles if len(article.split()) <= 80)
print(cnt / len(articles))

# Maximum word counts for the source text (the news article) and the summary.
# Possible improvement: rather than discarding longer articles, keep all of
# them and truncate each one to src_txt_length words.
src_txt_length = 100
sum_txt_length = 18

In [None]:
# Put the articles and summaries into NumPy arrays.  dtype=object stores
# references to the Python strings; without it NumPy builds a fixed-width
# unicode array in which every element is padded to the longest article,
# which wastes a large amount of memory on a dataset like this one.
cleaned_text = np.array(articles, dtype=object)
cleaned_summary = np.array(sums, dtype=object)

short_text = []
short_summary = []

# Keep only the pairs whose summary and article both fit inside the word
# limits.  Possible improvement: keep longer articles too and truncate them.
for text, summary in zip(cleaned_text, cleaned_summary):
    if len(summary.split()) <= sum_txt_length and len(text.split()) <= src_txt_length:
        short_text.append(text)
        short_summary.append(summary)

post_pre = pd.DataFrame({'text': short_text, 'summary': short_summary})

# Wrap every summary in the marker tokens 'sostok' (start) and 'eostok' (end)
# to guide the autoencoder through its summary generation.
post_pre['summary'] = post_pre['summary'].apply(lambda x: 'sostok ' + x + ' eostok')

post_pre.head(5)

In [None]:
# Hold out 10% of the (text, summary) pairs for validation.  The split is
# shuffled with a fixed seed so it is reproducible.
from sklearn.model_selection import train_test_split

article_texts = np.array(post_pre["text"])
summary_texts = np.array(post_pre["summary"])

x_tr, x_val, y_tr, y_val = train_test_split(
    article_texts,
    summary_texts,
    test_size=0.1,
    random_state=0,
    shuffle=True,
)

In [None]:
# A word must occur at least `thresh` times in the training articles to be
# considered common enough to be tokenized.
thresh = 3

# First pass: fit a tokenizer on the full training text so that word
# frequencies are available.  (Previously word_counts was read before any
# tokenizer had been created, which fails on a fresh run.)
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_tr))

# tot_cnt = number of distinct words, cnt = how many of them are rare.
tot_cnt = len(x_tokenizer.word_counts)
cnt = sum(1 for count in x_tokenizer.word_counts.values() if count < thresh)

# how many words don't meet the threshold
print("% of rare words in vocabulary: ", (cnt / tot_cnt) * 100)

# Second pass: rebuild the tokenizer with num_words set to the number of
# non-rare words, so the rare words are left out of the sequences.
x_tokenizer = Tokenizer(num_words=tot_cnt - cnt)
x_tokenizer.fit_on_texts(list(x_tr))

# transform text into numbers
x_tr_seq = x_tokenizer.texts_to_sequences(x_tr)
x_val_seq = x_tokenizer.texts_to_sequences(x_val)

# Pad (or cut) every sequence to src_txt_length; padding zeros go at the end.
x_tr = pad_sequences(x_tr_seq, maxlen=src_txt_length, padding='post')
x_val = pad_sequences(x_val_seq, maxlen=src_txt_length, padding='post')

# Vocabulary size handed to the embedding layer.  Index 0 is the padding
# value written by pad_sequences; the +1 keeps one slot of headroom above
# the tokenizer's num_words.
x_voc = x_tokenizer.num_words + 1

print("Size of vocabulary in X = {}".format(x_voc))


In [None]:
# Same two-pass procedure as for the articles, applied to the summaries.
# The summaries get their own tokenizer, separate from the article one.
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_tr))

thresh = 3

# tot_cnt = distinct summary words, cnt = those seen fewer than thresh times.
summary_word_counts = y_tokenizer.word_counts
tot_cnt = len(summary_word_counts)
cnt = sum(1 for occurrences in summary_word_counts.values() if occurrences < thresh)

print("% of rare words in vocabulary:", (cnt / tot_cnt) * 100)

# Refit with num_words limited to the number of non-rare words.
y_tokenizer = Tokenizer(num_words=tot_cnt - cnt)
y_tokenizer.fit_on_texts(list(y_tr))

# Summaries -> integer sequences.
y_tr_seq = y_tokenizer.texts_to_sequences(y_tr)
y_val_seq = y_tokenizer.texts_to_sequences(y_val)

# Pad (or cut) to sum_txt_length with zeros at the end.
y_tr = pad_sequences(y_tr_seq, maxlen=sum_txt_length, padding='post')
y_val = pad_sequences(y_val_seq, maxlen=sum_txt_length, padding='post')

# Vocabulary size used by the decoder embedding and output layers.
y_voc = y_tokenizer.num_words + 1

print("Size of vocabulary in Y = {}".format(y_voc))

In [None]:
# Build the training graph: an encoder of three stacked LSTMs, a single-LSTM
# decoder initialised from the encoder's final states, an attention layer over
# the encoder outputs, and a per-timestep softmax over the summary vocabulary.
from keras import backend as K 
# Reset Keras' global state before the layers below are created.
K.clear_session() 

# Units per LSTM layer and size of each word-embedding vector.
latent_dim = 200
embedding_dim = 300

# ENCODER STARTS HERE
# One fixed-length row of source token ids per article.
encoder_inputs = Input(shape=(src_txt_length, ))

# Embedding 
# Trainable embedding over the source vocabulary (x_voc entries).
enc_emb = Embedding(x_voc, embedding_dim,
                    trainable=True)(encoder_inputs)

# LSTM 1
# Every encoder LSTM returns its full output sequence plus its final states.
encoder_lstm1 = LSTM(latent_dim, return_sequences=True,
                     return_state=True, dropout=0.4,
                     recurrent_dropout=0.4)
(encoder_output1, state_h1, state_c1) = encoder_lstm1(enc_emb)

# LSTM 2
encoder_lstm2 = LSTM(latent_dim, return_sequences=True,
                     return_state=True, dropout=0.4,
                     recurrent_dropout=0.4)
(encoder_output2, state_h2, state_c2) = encoder_lstm2(encoder_output1)

# LSTM 3
# Its output sequence feeds the attention layer; its final states
# (state_h, state_c) initialise the decoder.
encoder_lstm3 = LSTM(latent_dim, return_state=True,
                     return_sequences=True, dropout=0.4,
                     recurrent_dropout=0.4)
(encoder_outputs, state_h, state_c) = encoder_lstm3(encoder_output2)

# Set up the decoder, using the last encoder LSTM's final states
# (state_h, state_c) as the initial state
# Variable-length input (shape (None,)) so the same tensor can be reused for
# one-token-at-a-time decoding in the inference cell.
decoder_inputs = Input(shape=(None, ))

# Embedding
# The layer object is kept in its own variable because the inference decoder
# applies the same layer again.
dec_emb_layer = Embedding(y_voc, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM
# NOTE(review): despite the names, decoder_fwd_state / decoder_back_state
# receive the two state tensors returned because return_state=True (an LSTM
# returns hidden and cell state) -- this is not a bidirectional layer.
decoder_lstm = LSTM(latent_dim, return_sequences=True,
                    return_state=True, dropout=0.4,
                    recurrent_dropout=0.2)
(decoder_outputs, decoder_fwd_state, decoder_back_state) = \
    decoder_lstm(dec_emb, initial_state=[state_h, state_c])

#Attention Layer
# AttentionLayer comes from the local `attention` module; it is called with
# [encoder outputs, decoder outputs] and returns two tensors.
# NOTE(review): the meaning of the second return value (attn_states) is not
# visible in this file -- confirm against attention.py.
attn_layer = AttentionLayer(name='attention_layer') 
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs]) 

# Concat attention output and decoder LSTM output 
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])


# Dense layer
# Softmax over the y_voc summary tokens, applied at every decoder timestep.
# Note that decoder_outputs is rebound here to the softmax output.
decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)

# Define the model
# Inputs: [source token ids, decoder input token ids]; output: per-timestep
# probability distribution over the summary vocabulary.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

In [None]:
# Early stopping: watch the validation loss (lower is better) and halt
# training once it has gone 2 epochs without improving.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=2,
    verbose=1,
)

# Targets are integer token ids rather than one-hot vectors, hence the
# sparse categorical cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='rmsprop',
)

In [None]:
# Training needs a high-RAM runtime (Colab Pro).
#
# The decoder input is each summary without its last token; the target is the
# same summary shifted one step left (without its first token), with a
# trailing axis of size 1 added by the reshape.
decoder_in_tr = y_tr[:, :-1]
decoder_target_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)[:, 1:]

decoder_in_val = y_val[:, :-1]
decoder_target_val = y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:, 1:]

history = model.fit(
    [x_tr, decoder_in_tr],
    decoder_target_tr,
    epochs=20,
    callbacks=[es],
    batch_size=512,
    validation_data=([x_val, decoder_in_val], decoder_target_val),
)

In [None]:
# optional code to save your model after training

#model.save('/content/drive/MyDrive/model4')

In [None]:
# code to access your model once it has been trained
#saved_model = keras.models.load_model('/content/drive/MyDrive/model2')
#model = saved_model

In [None]:
# Lookup tables used to turn model output back into words.
# word -> id for the summary vocabulary
target_word_index = y_tokenizer.word_index
# id -> word for the article (source) and summary (target) vocabularies
reverse_source_word_index = x_tokenizer.index_word
reverse_target_word_index = y_tokenizer.index_word


In [None]:
# Build the two inference-time models.  Both reuse the layer objects created
# for the training model (dec_emb_layer, decoder_lstm, attn_layer,
# decoder_dense), so they share its weights.

# encoder inference
# Maps source token ids to the encoder output sequence and final states.
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

# decoder inference
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
# Placeholder for the encoder output sequence that the attention layer reads.
decoder_hidden_state_input = Input(shape=(src_txt_length,latent_dim))

# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
# NOTE: this rebinds state_h2 / state_c2, which previously held the states of
# the second encoder LSTM; from here on they are the decoder's new states.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

#attention inference
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_inf_concat)

# Final decoder model
# Inputs:  [decoder token ids, encoder output sequence, previous h, previous c]
# Outputs: [token probabilities, new h, new c]
decoder_model = Model(
[decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
[decoder_outputs2] + [state_h2, state_c2])

In [None]:
def decode_sequence(input_seq):
    """Greedily generate a summary for one encoded source article.

    The article is run through ``encoder_model`` once; ``decoder_model`` is
    then called one token at a time, always feeding back the most probable
    token, until the end marker is produced or the length cap is reached.

    Args:
        input_seq: Padded source token ids, passed unchanged to
            ``encoder_model.predict``. The caller in this notebook supplies
            an array of shape ``(1, src_txt_length)``.

    Returns:
        The generated words as one string, each word preceded by a single
        space. The 'sostok' / 'eostok' markers are never included.
    """
    # Encode the input once: encoder outputs (read by the attention layer)
    # plus the final states that seed the decoder.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Start with a length-1 target sequence holding the start marker.  The
    # summaries were wrapped in 'sostok' ... 'eostok' before tokenizing, so
    # those are the marker words to look up (not 'start' / 'end').
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sostok']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Greedy choice.  The argmax position is the token id itself, so it is
        # used for the lookup without any offset.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Index 0 is the padding value and has no word; .get() returns None
        # for it, which is treated as the end of the summary below.
        sampled_token = reverse_target_word_index.get(sampled_token_index)

        if sampled_token is not None and sampled_token != 'eostok':
            decoded_sentence += ' ' + sampled_token

        # Exit condition: end marker, padding, or maximum length reached.
        # This check sits outside the branch above so that the end marker can
        # actually stop the loop.
        if (sampled_token is None or sampled_token == 'eostok'
                or len(decoded_sentence.split()) >= (sum_txt_length - 1)):
            stop_condition = True

        # Feed the chosen token back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Carry the decoder states over to the next step.
        e_h, e_c = h, c

    return decoded_sentence

In [None]:
def seq2summary(input_seq):
    """Convert a padded summary id sequence back into readable text.

    Padding (id 0) and the 'sostok' / 'eostok' marker tokens that were added
    around every summary before tokenizing are left out.

    Args:
        input_seq: Iterable of integer token ids from the summary tokenizer.

    Returns:
        The words joined into one string, each followed by a single space.
    """
    # The markers added during preprocessing are 'sostok' and 'eostok'.
    # Filtering on 'start' / 'end' would miss them and could instead drop
    # (or fail to find) those ordinary words.
    start_id = target_word_index['sostok']
    end_id = target_word_index['eostok']

    words = []
    for token_id in input_seq:
        if token_id != 0 and token_id != start_id and token_id != end_id:
            words.append(reverse_target_word_index[token_id] + ' ')
    return ''.join(words)

def seq2text(input_seq):
    """Convert a padded source id sequence back into readable text.

    Ids equal to 0 (padding) are skipped; every other id is looked up in
    ``reverse_source_word_index``.

    Args:
        input_seq: Iterable of integer token ids from the source tokenizer.

    Returns:
        The words joined into one string, each followed by a single space.
    """
    return ''.join(
        reverse_source_word_index[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
    )

In [None]:
# Show validation samples 25-29: the source text, the reference summary, and
# the summary generated by the model.
for sample_idx in range(25, 30):
  source_ids = x_val[sample_idx]
  reference_ids = y_val[sample_idx]
  # decode_sequence is given a batch of one: shape (1, src_txt_length).
  model_input = source_ids.reshape(1, src_txt_length)

  print("Review:", seq2text(source_ids))
  print("Original summary:", seq2summary(reference_ids))
  print("Predicted summary:", decode_sequence(model_input))
  print("\n")