# Text Summarization of news articles
Using Seq2Seq model in Summarizer.py.


In [1]:
import os
from collections import Counter

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub


import Summarizer
import summarizer_data_utils
import summarizer_model_utils


In [2]:
# Record the TensorFlow version; the cells below use the TF 1.x session API (`tf.Session`).
print(tf.__version__)

1.11.0


## The data
The data is the 'all-the-news' dataset from Kaggle. It contains about 140,000 news articles together with their headlines. The headlines serve as our summaries in this case.

Link->
https://www.kaggle.com/snapcrack/all-the-news



### Reading data

In [3]:
# The dataset ships as three CSV files; load each one into its own frame.
data, data1, data2 = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('./articles1.csv', './articles2.csv', './articles3.csv')
)


In [4]:
# Stack the three frames into a single one.
# `ignore_index=True` gives the combined frame a fresh 0..n-1 index; without
# it every file keeps its own 0-based row labels, so the same label would
# occur several times in the result.
# NOTE: this cell rebinds `data`; running it again without re-running the
# loading cell appends `data1` and `data2` a second time.
data = pd.concat([data, data1, data2], ignore_index=True)
data.shape

(142570, 10)

In [5]:
# Preview the first rows. Only `title` and `content` are used further down.
data.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [6]:
# Count how many articles each publication contributes.
Counter(data.publication)

Counter({'New York Times': 7803,
         'Breitbart': 23781,
         'CNN': 11488,
         'Business Insider': 6757,
         'Atlantic': 7179,
         'Fox News': 4354,
         'Talking Points Memo': 5214,
         'Buzzfeed News': 4854,
         'National Review': 6203,
         'New York Post': 17493,
         'Guardian': 8681,
         'NPR': 11992,
         'Reuters': 10710,
         'Vox': 4947,
         'Washington Post': 11114})

In [7]:
# Optional: uncomment the next line to exclude one publication from the data.
# data = data[data.publication != 'Breitbart']

In [8]:
# Count the missing (NULL) values in every column.
data.isnull().sum()

Unnamed: 0         0
id                 0
title              2
publication        0
author         15876
date            2641
year            2641
month           2641
url            57011
content            0
dtype: int64

In [9]:
# Discard only the rows without a title; the title is what becomes the summary.
data = data.dropna(subset=['title'])

In [10]:
# Rename the label/feature columns and keep only those two.
# `index=str` turns the row labels into strings.
column_names = {'title': 'Summary', 'content': 'Text'}
data = data.rename(index=str, columns=column_names)[['Summary', 'Text']]

In [11]:
# Verify that neither remaining column contains NULL values.
data.isnull().sum()

Summary    0
Text       0
dtype: int64

In [12]:
# Preview the reduced frame (columns `Summary` and `Text`).
data.head()

Unnamed: 0,Summary,Text
0,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...
1,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood..."
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri..."
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t..."
4,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ..."


In [13]:
# Print the first ten summaries (headlines), one per line.
for headline in data.Summary.head(10):
    print(headline)

House Republicans Fret About Winning Their Health Care Suit - The New York Times
Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times
Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial Bias, Dies at 106 - The New York Times
Among Deaths in 2016, a Heavy Toll in Pop Music - The New York Times
Kim Jong-un Says North Korea Is Preparing to Test Long-Range Missile - The New York Times
Sick With a Cold, Queen Elizabeth Misses New Year’s Service - The New York Times
Taiwan’s President Accuses China of Renewed Intimidation - The New York Times
After ‘The Biggest Loser,’ Their Bodies Fought to Regain Weight - The New York Times
First, a Mixtape. Then a Romance. - The New York Times
Calling on Angels While Enduring the Trials of Job - The New York Times


In [14]:
# Show the complete text of the first article.
# `.iloc[0]` selects by position. The row labels are strings after the rename
# above, so a bare `data.Text[0]` only works through pandas' deprecated
# "integer key falls back to position" behaviour.
print(data.Text.iloc[0])

WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conservative voters who have been d

In [15]:
# Character length of every summary and every text.
# Not all examples are used later on; only a subset is picked.
len_summaries = list(map(len, data.Summary))
len_texts = list(map(len, data.Text))

In [16]:
#print(len_summaries)
#print(len_texts)

In [17]:
# Tally how often each length occurs, most frequent first.
len_summaries_counted = Counter(len_summaries).most_common()
len_texts_counted = Counter(len_texts).most_common()

# Show the ten most frequent lengths for summaries and for texts.
for heading, counted in (("Most common Summary length--", len_summaries_counted),
                         ("Most common Text length--", len_texts_counted)):
    print(heading)
    print(counted[:10])


Most common Summary length--
[(63, 3512), (64, 3451), (65, 3364), (62, 3362), (60, 3344), (59, 3310), (61, 3284), (58, 3280), (67, 3016), (66, 3015)]
Most common Text length--
[(2878, 44), (13, 42), (1742, 41), (2457, 41), (1738, 41), (2199, 40), (2673, 40), (1548, 40), (2617, 40), (2913, 39)]


In [18]:
# Start with short articles only: keep the texts whose character count lies
# strictly between the two bounds below.
MIN_TEXT_CHARS = 50
MAX_TEXT_CHARS = 200

# `indices` holds row *positions* (they come from `enumerate`), so the
# matching rows have to be picked with `.iloc`. Plain `[]` indexing looks
# integers up as labels and only worked through a deprecated fallback.
indices = [ind for ind, text in enumerate(data.Text)
           if MIN_TEXT_CHARS < len(text) < MAX_TEXT_CHARS]
summaries_unprocessed = data.Summary.iloc[indices]
texts_unprocessed = data.Text.iloc[indices]

In [19]:
# The three counts should agree: one text and one summary per selected index.
len(indices), len(texts_unprocessed), len(summaries_unprocessed)

(870, 870, 870)

In [20]:
# Headlines of some publications end with the publication's name
# (e.g. "... - The New York Times"). That ending is not part of the summary,
# so it is replaced by a full stop.
to_remove = ['- The New York Times', '- Breitbart']

summaries_unprocessed_clean = []
texts_unprocessed_clean = []  # kept from the original cell; not filled here

removed = 0  # number of headlines whose ending was replaced
for sentence in summaries_unprocessed:
    for r in to_remove:
        if sentence.endswith(r):
            # Cut off only the trailing occurrence. `str.replace` would also
            # rewrite the same phrase anywhere else inside the headline.
            sentence = sentence[:-len(r)] + '.'
            removed += 1
            break
    # Every headline is kept exactly once, whether it was changed or not.
    summaries_unprocessed_clean.append(sentence)
       


In [21]:
# Cleaning must not change the number of summaries: both counts should match.
len(summaries_unprocessed_clean), len(texts_unprocessed)

(870, 870)

### Cleaning and Preparing the Data

In [22]:
# Run the project's preprocessing helper over the selected texts and the
# cleaned summaries.
# The helper offers a `keep_most` option; it is switched off here because,
# per the original author's note, only letters and numbers should be kept.
# (To improve the model, this preprocessing step should be refined.)

processed_texts, processed_summaries, words_counted = summarizer_data_utils.preprocess_texts_and_summaries(
    texts_unprocessed,
    summaries_unprocessed_clean,
    keep_most=False)

Processing Time:  0.3985011577606201


In [23]:
# Some texts or summaries are empty after preprocessing; keep only the pairs
# in which both parts are non-empty.
non_empty_pairs = [
    (text_tokens, summary_tokens)
    for text_tokens, summary_tokens in zip(processed_texts, processed_summaries)
    if text_tokens != [] and summary_tokens != []
]
processed_texts_clean = [text_tokens for text_tokens, _ in non_empty_pairs]
processed_summaries_clean = [summary_tokens for _, summary_tokens in non_empty_pairs]

### Creating Lookup dictionaries

Here each word gets an integer value (high or low, depending on its frequency in our corpus). These values later help us convert the texts into numbers.

We also add special tokens. "EndOfSentence" and "StartOfSentence" are crucial for the Seq2Seq model we use later.
The pad token is needed because all summaries and texts in a batch must have the same length; padding the shorter sequences achieves that.

So we need 2 lookup dicts:
 - From word to index 
 - from index to word. 

In [24]:
# Build the word -> index and index -> word lookup tables from the word counts.
# Most words appear only once; with min_occurences set to 2 the vocabulary
# shrinks by more than half (recorded run: 2500 entries kept, 2760 words
# reported in `missing_words` -- presumably the ones that were too rare;
# confirm in summarizer_data_utils).
specials = ["<EOS>", "<SOS>","<PAD>","<UNK>"]
word2ind, ind2word,  missing_words = summarizer_data_utils.create_word_inds_dicts(words_counted,
                                                                                  specials = specials,
                                                                                  min_occurences = 2)
print(len(word2ind), len(ind2word), len(missing_words))


2500 2500 2760


### Pretrained embeddings

There are various pretrained embeddings available, for example GloVe or the modules on TF-Hub (using them is optional).

According to reviews, the TF-Hub embeddings work better,
so we use TF-Hub here.

In [25]:
# Compute pretrained embeddings for the vocabulary with a TF-Hub module.

# TF-Hub reads the download-progress switch from the process environment, so
# it has to be set through `os.environ` before the module is fetched. A plain
# Python assignment inside the session block has no effect.
os.environ['TFHUB_DOWNLOAD_PROGRESS'] = '1'

# embed = hub.Module("https://tfhub.dev/google/nnlm-en-dim128/1")
embed = hub.Module("https://tfhub.dev/google/Wiki-words-250/1")

# Feed the words ordered by their index instead of relying on the dict's
# iteration order, so that row i of the result belongs to the word with
# index i (assumes the indices in `word2ind` run from 0 to n-1 -- TODO confirm
# in summarizer_data_utils.create_word_inds_dicts).
vocabulary = sorted(word2ind, key=word2ind.get)
emb = embed(vocabulary)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    embedding = sess.run(emb)

INFO:tensorflow:Using /var/folders/wx/qxytpw4s10d12r3r46wjvwqr0000gn/T/tfhub_modules to cache modules.
INFO:tensorflow:Downloading TF-Hub Module 'https://tfhub.dev/google/Wiki-words-250/1'.
INFO:tensorflow:Downloaded TF-Hub Module 'https://tfhub.dev/google/Wiki-words-250/1'.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [49]:
# Sanity check: shape of the embedding matrix. This needs `embedding` from the
# TF-Hub cell above, so that cell has to be run first in the same kernel.
embedding.shape

NameError: name 'embedding' is not defined

In [0]:
# Persist the embedding matrix to disk. `pretrained_embeddings_path` in the
# hyperparameter cell has to refer to this same file.
np.save('./tf_hub_embedding_NewsHeadlines.npy', embedding)

### Converting Text and Summaries
First we convert the words to numbers. For the summaries we also add the SOS/GO and EOS tokens (`sos = True`, `eos = True`); the texts are converted with `eos = False`.

In [0]:
# Convert the words of the texts to indices via `word2ind`.
# `eos = False` is passed for the texts; the second return value is stored as
# `unknown_words_in_texts` (presumably the words not found in `word2ind` --
# confirm in summarizer_data_utils).
converted_texts, unknown_words_in_texts = summarizer_data_utils.convert_to_inds(processed_texts_clean,
                                                                                word2ind,
                                                                                eos = False)

In [0]:
# Convert the words of the summaries to indices via `word2ind`.
# Unlike the texts, the summaries are converted with both `eos = True` and
# `sos = True`.
converted_summaries, unknown_words_in_summaries = summarizer_data_utils.convert_to_inds(processed_summaries_clean,
                                                                                        word2ind,
                                                                                        eos = True,
                                                                                        sos = True)

In [None]:
# Sanity check: map the first converted text and the first converted summary
# back to words and print them.
for converted_example in (converted_texts[0], converted_summaries[0]):
    print(summarizer_data_utils.convert_inds_to_text(converted_example, ind2word))


## The model

First we define the hyperparameters we want to use. Then we create our Summarizer and call the function .build_graph(), which as the name suggests, builds the computation graph. 

Then we can train the model using .train()

After training we can try our model using .infer()

### Training

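The intent is to use 90% of the data as training set and 10% as validation set. Note that the training cell below passes all converted texts and summaries to `.train()` and does not perform an explicit split itself.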

In [0]:
# Model hyperparameters.

# Network size.
num_layers_encoder = 4
num_layers_decoder = 4
rnn_size_encoder = 300
rnn_size_decoder = 300

# Optimisation settings; they are handed unchanged to `Summarizer.Summarizer`.
batch_size = 32
epochs = 100
clip = 5
keep_probability = 0.8
learning_rate = 0.0005
max_lr=0.005
learning_rate_decay_steps = 100
learning_rate_decay = 0.90

# Must be the file written by the `np.save(...)` cell of this notebook.
# The previous value ('./tf_hub_embedding_headlines.npy') named a file that
# this notebook never creates.
pretrained_embeddings_path = './tf_hub_embedding_NewsHeadlines.npy'
summary_dir = os.path.join('./tensorboard/headlines')

use_cyclic_lr = True
inference_targets=True


In [None]:
# Build the computation graph in TRAIN mode and train the model.
summarizer_model_utils.reset_graph()

# All keyword settings of the training model, collected in one place.
train_settings = dict(
    save_path='./models/my_model',
    mode='TRAIN',
    num_layers_encoder=num_layers_encoder,
    num_layers_decoder=num_layers_decoder,
    rnn_size_encoder=rnn_size_encoder,
    rnn_size_decoder=rnn_size_decoder,
    batch_size=batch_size,
    clip=clip,
    keep_probability=keep_probability,
    learning_rate=learning_rate,
    max_lr=max_lr,
    learning_rate_decay_steps=learning_rate_decay_steps,
    learning_rate_decay=learning_rate_decay,
    epochs=epochs,
    pretrained_embeddings_path=pretrained_embeddings_path,
    use_cyclic_lr=use_cyclic_lr,
)
summarizer = Summarizer.Summarizer(word2ind, ind2word, **train_settings)

summarizer.build_graph()
summarizer.train(converted_texts, converted_summaries)


### Inference



In [None]:
# Rebuild the graph in INFER mode and predict summaries for the first 50
# examples; the batch size equals the number of examples passed in.
summarizer_model_utils.reset_graph()

infer_texts = converted_texts[:50]
infer_targets = converted_summaries[:50]

summarizer = Summarizer.Summarizer(word2ind,
                                   ind2word,
                                   './models/my_model',
                                   'INFER',
                                   num_layers_encoder = num_layers_encoder,
                                   num_layers_decoder = num_layers_decoder,
                                   rnn_size_encoder = rnn_size_encoder,
                                   rnn_size_decoder = rnn_size_decoder,
                                   batch_size = len(infer_texts),
                                   clip = clip,
                                   keep_probability = 1.0,
                                   learning_rate = 0.0,
                                   beam_width = 5,
                                   inference_targets = False,
                                   pretrained_embeddings_path = pretrained_embeddings_path)

summarizer.build_graph()
preds = summarizer.infer(infer_texts,
                         restore_path = './models/my_model',
                         targets = infer_targets)

In [None]:
# Show the predictions next to the summaries and texts of the same first 50
# examples that were passed to `.infer()`.
summarizer_model_utils.sample_results(preds,
                                      ind2word,
                                      word2ind,
                                      converted_summaries[:50],
                                      converted_texts[:50])