# A Gentle Introduction to Text Summarization in Machine Learning

---

## PART 0: Imports and Initializations

In [5]:
# NLTK modules
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

# Tensorflow modules
import tensorflow as tf
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras import backend as K
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

# Miscellaneous utilities
import numpy as np
import pandas as pd; pd.set_option("display.max_colwidth", 200)
import re
import os
import bs4 as bs
from urllib import urlopen
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Silence miscellaneous warnings
import warnings; warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [6]:
from custom_modules import AttentionLayer

Here, we initialize our data processing engine for miscellaneous text data online.

In [7]:
# Downloading essential data from NLTK
import nltk
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aakashsudhakar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aakashsudhakar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

---

## PART 1: Overview of Concept

### Two Major Types of Text Summarization:
    - Extraction-based summarization
    - Abstraction-based summarization

### Steps to Perform Text Summarization:
    1. Convert the paragraph into sentences.
    2. Perform text processing.
    3. Perform tokenization.
    4. Evaluated the weighted occurrence frequency of the words. 
    5. Substitute words with their weighted frequencies.

<br>

![](https://paper-attachments.dropbox.com/s_5DD7360138DEDEB8828AD11E4B5921DC0A55833560A1BC79C451FADB6E7D209D_1554467410003_image.png)

<br>

---

## PART 2: Breakdown of Code Constructs

### Step 1: Prepare the data.

In [45]:
PATH_DATA = "https://en.wikipedia.org/wiki/20th_century"

data_read = urlopen(PATH_DATA).read()
data_parsed = bs.BeautifulSoup(data_read, "html.parser")

data_paragraphs = data_parsed.find_all("p")

data_content = str()
for paragraph in paragraphs:
    data_content += paragraph.text

### Step 2: Process the data.

In [41]:
def create_frequency_table(text):
    """ Function to create frequency histogram of word occurrences across input text. """
    stop_words = set(stopwords.words("english"))
    raw_words_from_data = word_tokenize(text)
    stem = PorterStemmer()
    # Create frequency table via dictionary operations
    frequency_table = dict()
    for word in raw_words_from_data:
        word_root = stem.stem(word)
        if word_root in stop_words:
            continue
        if word_root in frequency_table:
            frequency_table[word_root] += 1
        else:
            frequency_table[word_root] = 1
    return frequency_table

### Step 3: Tokenize the article into sentences.

In [24]:
sentences = sent_tokenize(data_content)

### Step 4: Find the weighted frequencies of the sentences.

In [33]:
def calculate_sentence_scores(sentences, frequency_table, num_chars=7):
    """ Function to create weighted frequency scores from parsed sentences using frequency table. """
    sentence_weight = dict()
    for sentence in sentences:
        sentence_wordcount_without_stop_words = 0
        sentence_wordcount = (len(word_tokenize(sentence)))
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:num_chars] in sentence_weight:
                    sentence_weight[sentence[:num_chars]] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:num_chars]] = frequency_table[word_weight]
        sentence_weight[sentence[:num_chars]] /= sentence_wordcount_without_stop_words
    return sentence_weight

### Step 5: Calculate the threshold of the sentences.

In [34]:
def calculate_average_threshold(sentence_weight):
    """ Function to get the average weighted score of a sentence. """
    sum_values = 0
    for element in sentence_weight:
        sum_values += sentence_weight[element]
    return (sum_values / len(sentence_weight))

### Step 6: Obtain the summary.

In [35]:
def get_text_summary(sentences, sentence_weight, threshold, num_chars=7):
    """ Function to create summary statement of article using weighted sentence data and relative threshold. """
    sentence_counter, article_summary = 0, str()
    for sentence in sentences:
        if sentence[:num_chars] in sentence_weight and sentence_weight[sentence[:num_chars]] >= (threshold):
            article_summary += " {}".format(sentence)
            sentence_counter += 1
    return article_summary

---

## PART 3: Analyzing our Data

We can wrap this all up into a nice outer function and run our summarization analysis on our sample Wikipedia and check our results!

Since this is extraction-based, it won't be nearly as nicely grammatical and well-structured as an abstraction-based (deep learning and advanced modeling) approach, but it should be sufficient to give us an adequate summary of the article's topic. 

In [36]:
def run_text_summary(text):
    frequency_table = create_frequency_table(text)
    sentences = sent_tokenize(text)
    sentence_scores = calculate_sentence_scores(sentences, frequency_table)
    threshold = calculate_average_threshold(sentence_scores)
    text_summary = get_text_summary(sentences, sentence_scores, 1.5 * threshold)
    return text_summary

In [42]:
run_text_summary(data_content)

" Terms like ideology, world war, genocide, and nuclear war entered common usage. Humans explored space for the first time, taking their first footsteps on the Moon. However, these same wars resulted in the destruction of the imperial system. The victorious Bolsheviks then established the Soviet Union, the world's first communist state. At the beginning of the period, the British Empire was the world's most powerful nation,[12] having acted as the world's policeman for the past century. In total, World War II left some 60 million people dead. With the Axis defeated and Britain and France rebuilding, the United States and the Soviet Union were left standing as the world's only superpowers. At the beginning of the century, strong discrimination based on race and sex was significant in general society. During the century, the social taboo of sexism fell. Communications and information technology, transportation technology, and medical advances had radically altered daily lives. With the e

---

## PART 4: Constructing a Higher-Order Object

In [18]:
class Text_Summarization_Engine(object):
    """ Class instance for producing extraction-based summaries from input corpus data. """
    def __init__(self, query=None):
        self.num_chars = 7
        if query is None:
            self.path = "https://en.wikipedia.org/wiki/Randomness"
        else:
            self.path = "https://en.wikipedia.org/wiki/{}".format(query)
        self.dataset = self._process_data()
            
    def _process_data(self):
        """ Instance method to load, clean, and parse linguistic data from raw text corpus. """
        data_read = urlopen(self.path).read()
        data_parsed = bs.BeautifulSoup(data_read, "html.parser")
        data_paragraphs = data_parsed.find_all("p")
        data_content = str()
        for paragraph in data_paragraphs:
            data_content += paragraph.text
        return data_content
            
    def _create_frequency_table(self):
        """ Instance method to create frequency histogram of word occurrences across input text. """
        stop_words = set(stopwords.words("english"))
        data_words, stem = word_tokenize(self.dataset), PorterStemmer()
        stem, frequency_table = PorterStemmer(), dict()
        for word in data_words:
            word_root = stem.stem(word)
            if word_root in stop_words:
                continue
            if word_root in frequency_table:
                frequency_table[word_root] += 1
            else:
                frequency_table[word_root] = 1
        return frequency_table
    
    def _calculate_sentence_weights(self, frequency_table, sentences):
        """ Instance method to create weighted frequency scores from parsed sentences using frequency table. """
        sentence_weights = dict()
        for sentence in sentences:
            sentence_wordcount_without_stop_words = 0
            sentence_wordcount = (len(word_tokenize(sentence)))
            for word_weight in frequency_table:
                if word_weight in sentence.lower():
                    sentence_wordcount_without_stop_words += 1
                    if sentence[:self.num_chars] in sentence_weights:
                        sentence_weights[sentence[:self.num_chars]] += frequency_table[word_weight]
                    else:
                        sentence_weights[sentence[:self.num_chars]] = frequency_table[word_weight]
            sentence_weights[sentence[:self.num_chars]] /= sentence_wordcount_without_stop_words
        return sentence_weights
    
    def _calculate_average_threshold(self, sentence_weights):
        """ Instance method to get the average weight across all sentences. """
        return sum(sentence_weights.values()) / len(sentence_weights)
    
    def _get_text_summary(self, sentences, sentence_weights, relative_threshold):
        """ Instance method to create summary statement of corpus using weighted sentence data and relative threshold. """
        sentence_counter, text_summary = 0, str()
        for sentence in sentences:
            if sentence[:self.num_chars] in sentence_weights and sentence_weights[sentence[:self.num_chars]] >= (relative_threshold):
                text_summary += " {}\n".format(sentence.encode("utf-8"))
                sentence_counter += 1
        return text_summary
    
    def run_text_summarization(self):
        """ Instance method to perform end-to-end text summarization analysis on parsed dataset. """
        frequency_table, sentences = self._create_frequency_table(), sent_tokenize(self.dataset)
        sentence_weights = self._calculate_sentence_weights(frequency_table, sentences)
        threshold = self._calculate_average_threshold(sentence_weights)
        return self._get_text_summary(sentences, sentence_weights, 1.5 * threshold)

Create terse formatting script for basic search queries in Wikipedia.

In [6]:
def format_search_query(query):
    """ Global function that formats and restructures basic search query from user to Wikipedia search. """
    return " ".join(word.capitalize() for word in query.split()).replace(" ", "_")

In [26]:
# NOTE: Input user-defined search query in Wikipedia here. 
query = "software engineering"

# Search query is refined in global formatting function.
query = format_search_query(query)

# Instantiate text summarization processor with user-defined search query
proc = Text_Summarization_Engine()

# Produce summary of relevant Wikipedia article
print(proc.run_text_summarization())

 The Greek philosophers discussed randomness at length, but only in non-quantitative forms.
 In the 1888 edition of his book The Logic of Chance, John Venn wrote a chapter on The conception of randomness that included his view of the randomness of the digits of pi, by using them to construct a random walk in two dimensions.
 In the mid- to late-20th century, ideas of algorithmic information theory introduced new dimensions to the field via the concept of algorithmic randomness.
 In the first six billion decimal places of pi, each of the digits from 0 through 9 shows up about six hundred million times.
 [17]In statistics, randomness is commonly used to create simple random samples.
 Noise consists of numerous transient disturbances, with a statistically randomized time distribution.
 If the universe is regarded to have a purpose, then randomness can be seen as impossible.
 In fact, randomness has been used for games of chance throughout history, and to select out individuals for an unwa

---

## PART 5: Introducing Abstractive Text Summarization

Abstractive summarization, unlike its extractive cousin, utilize **generative** training models to approximate and create new sentences from scratch rather than recombining old tokens from prior sentences.

![](https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2019/05/abstractive1.jpg)

With text summarization, our input and output are the same: sequences of words. We can utilize `Seq2Seq` model architectures within the realm of deep learning to attack this problem.

![](https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2019/05/final.jpg)

`Seq2Seq` models utilize *Encoder-Decoder* architectures to resolve the issue of encoding information and decoding results being of differing lengths. 

These architectures often use RNNs (Recurrent Neural Networks) and/or LSTMs (Long Short Term Memories) due to their proclivity for understanding long-term dependencies across sequential data. We'll be working with LSTMs.

![](https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2019/06/first.jpg.jpg)

The encoder-decoder has two primary phases: **training** and **interference**. 

#### Training Phase

In an encoding LSTM, a word is inputted at each timestep into the pipeline and interpreted at every timestep across its parent sequence to better understand the context of the word. This way, the entire input sequence is interpreted both in short-term and longetive context. 

The final state of the encoder receives the hidden state (<i>h</i>) and cell state (<i>c</i>) weights that instantiate the decoder. 

![](https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2019/05/61.jpg)

In a decoding LSTM, the target sequence is fed piecewise through the network, which attempts to predict the same sequence offset by a single timestep. In this way, the decoder will be predictive of each subsequent word in the sequence.

![](https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2019/05/71.jpg)

#### Interference Phase

The interference phase architecture is largely similar to our training phase architecture, with the critical difference that input sequences are now **independent** of a target sequence. 

![](https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2019/05/82.jpg)

The interference process can be outlined as a multi-step mechanism as follows:
    1. Encode the input sequence and instantiate the decoder with the final hidden and cell states of the encoder.
    2. Pass the *[START]* token as the first input of the decoder.
    3. Run the decoder for one timestep. (Output will be the probability for next word; word with highest probability is selected.)
    4. Pass sampled word as new decoder input with updated hidden and cell states from previous timestep.
    5. Repeat steps 3-4 until *[STOP]* token or maximum target sequence length is reached.

The encoder-decoder has one critical limitation: *it is unable to perform effectively with longer and longer sequences*.

This is where we make use of the **attention mechanism**. 

The attention mechanism modifies relative importance given to each word of an input sequence based on how relevant it is towards predicting the target sequence. In other words, it works so that the algorithm as a whole only has to look at a small set of words across a sequence rather than each word to predict an effective response.

The attention mechanism has two primary classes: **global attention** and **local attention**. 

#### Global Attention

All hidden states are considered for deriving the next context.

#### Local Attention

Only some hidden states are considered for deriving the next context.

We'll be working primarily with a *global attention mechanism*. 

## PART 6: Diving into Abstractive Code

Let's read in our dataset. 

In [9]:
FILEPATH, NROWS = "./datasets/amazon-fine-food-reviews/Reviews.csv", 1e5

dataset = pd.read_csv(FILEPATH, nrows=NROWS)

From here, we'll immediately get rid of those pesky `NaN` values and other erroneous data.

In [12]:
dataset.drop_duplicates(subset=["Text"], inplace=True)
dataset.dropna(axis=0, inplace=True)

Now we'll do some data preprocessing!

In [13]:
contraction_mapping = {"ain't": "is not", 
                       "aren't": "are not",
                       "can't": "cannot", 
                       "'cause": "because", 
                       "could've": "could have", 
                       "couldn't": "could not",
                       "didn't": "did not", 
                       "doesn't": "does not", 
                       "don't": "do not", 
                       "hadn't": "had not", 
                       "hasn't": "has not", 
                       "haven't": "have not",
                       "he'd": "he would",
                       "he'll": "he will", 
                       "he's": "he is", 
                       "how'd": "how did", 
                       "how'd'y": "how do you", 
                       "how'll": "how will", 
                       "how's": "how is",
                       "I'd": "I would", 
                       "I'd've": "I would have", 
                       "I'll": "I will", 
                       "I'll've": "I will have",
                       "I'm": "I am", 
                       "I've": "I have", 
                       "i'd": "i would",
                       "i'd've": "i would have", 
                       "i'll": "i will",  
                       "i'll've": "i will have",
                       "i'm": "i am", 
                       "i've": "i have", 
                       "isn't": "is not", 
                       "it'd": "it would",
                       "it'd've": "it would have", 
                       "it'll": "it will", 
                       "it'll've": "it will have",
                       "it's": "it is", 
                       "let's": "let us", 
                       "ma'am": "madam",
                       "mayn't": "may not", 
                       "might've": "might have",
                       "mightn't": "might not",
                       "mightn't've": "might not have", 
                       "must've": "must have",
                       "mustn't": "must not", 
                       "mustn't've": "must not have", 
                       "needn't": "need not", 
                       "needn't've": "need not have",
                       "o'clock": "of the clock",
                       "oughtn't": "ought not", 
                       "oughtn't've": "ought not have", 
                       "shan't": "shall not", 
                       "sha'n't": "shall not", 
                       "shan't've": "shall not have",
                       "she'd": "she would", 
                       "she'd've": "she would have", 
                       "she'll": "she will", 
                       "she'll've": "she will have", 
                       "she's": "she is",
                       "should've": "should have", 
                       "shouldn't": "should not", 
                       "shouldn't've": "should not have", 
                       "so've": "so have",
                       "so's": "so as",
                       "this's": "this is",
                       "that'd": "that would", 
                       "that'd've": "that would have", 
                       "that's": "that is", 
                       "there'd": "there would",
                       "there'd've": "there would have", 
                       "there's": "there is", 
                       "here's": "here is",
                       "they'd": "they would", 
                       "they'd've": "they would have",
                       "they'll": "they will", 
                       "they'll've": "they will have", 
                       "they're": "they are", 
                       "they've": "they have", 
                       "to've": "to have",
                       "wasn't": "was not", 
                       "we'd": "we would", 
                       "we'd've": "we would have", 
                       "we'll": "we will", 
                       "we'll've": "we will have", 
                       "we're": "we are",
                       "we've": "we have", 
                       "weren't": "were not", 
                       "what'll": "what will", 
                       "what'll've": "what will have", 
                       "what're": "what are",
                       "what's": "what is", 
                       "what've": "what have", 
                       "when's": "when is", 
                       "when've": "when have", 
                       "where'd": "where did", 
                       "where's": "where is",
                       "where've": "where have", 
                       "who'll": "who will", 
                       "who'll've": "who will have", 
                       "who's": "who is", 
                       "who've": "who have",
                       "why's": "why is", 
                       "why've": "why have", 
                       "will've": "will have", 
                       "won't": "will not", 
                       "won't've": "will not have",
                       "would've": "would have", 
                       "wouldn't": "would not", 
                       "wouldn't've": "would not have", 
                       "y'all": "you all",
                       "y'all'd": "you all would",
                       "y'all'd've": "you all would have",
                       "y'all're": "you all are",
                       "y'all've": "you all have",
                       "you'd": "you would", 
                       "you'd've": "you would have", 
                       "you'll": "you will", 
                       "you'll've": "you will have",
                       "you're": "you are", 
                       "you've": "you have"}

We'll need to perform the following preprocessing tasks:
    - Convert everything to lowercase.
    - Remove HTML tags.
    - Contraction mapping.
    - Remove ('s). 
    - Remove any text inside parentheses.
    - Eliminate punctuation and special characters.
    - Remove stop words.
    - Remove short words.

In [17]:
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = text.lower()
    text = bs.BeautifulSoup(text, "lxml").text
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub('"','', text)
    text = " ".join([contraction_mapping[element] if element in contraction_mapping else element for element in text.split(" ")])    
    text = re.sub(r"'s\b","",text)
    text = re.sub("[^a-zA-Z]", " ", text)
    
    tokens = [word for word in text.split() if not word in stop_words]
    long_words = list()
    
    for token in tokens:
        if len(token) >= 3:
            long_words.append(token)
    return (" ".join(long_words)).strip()

text_cleaned = list()
for item in dataset["Text"]:
    text_cleaned.append(clean_text(item))

In [18]:
dataset["Summary"][:10]

0                            Good Quality Dog Food
1                                Not as Advertised
2                            "Delight" says it all
3                                   Cough Medicine
4                                      Great taffy
5                                       Nice Taffy
6    Great!  Just as good as the expensive brands!
7                           Wonderful, tasty taffy
8                                       Yay Barley
9                                 Healthy Dog Food
Name: Summary, dtype: object

---