In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy import spatial
import networkx as nx
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import bs4 as BeautifulSoup
import urllib.request  
import ssl
ssl._create_default_https_context = ssl._create_unverified_context



In [2]:
#fetching the content from the URL
fetched_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/20th_century')
article_read = fetched_data.read()

#parsing the URL content and storing in a variable
article_parsed = BeautifulSoup.BeautifulSoup(article_read,'html.parser')

In [3]:
#returning <p> tags
paragraphs = article_parsed.find_all('p')

article_content = ''

#looping through the paragraphs and adding them to the variable
for p in paragraphs:  
    article_content += p.text

In [4]:
def _create_dictionary_table(text_string) -> dict:
   
    #removing stop words
    stop_words = set(stopwords.words("english"))
    
    words = word_tokenize(text_string)
    
    #reducing words to their root form
    stem = PorterStemmer()
    
    #creating dictionary for the word frequency table
    frequency_table = dict()
    for wd in words:
        wd = stem.stem(wd)
        if wd in stop_words:
            continue
        if wd in frequency_table:
            frequency_table[wd] += 1
        else:
            frequency_table[wd] = 1
       
    return frequency_table


In [5]:
def _calculate_sentence_scores(sentences, frequency_table) -> dict:   

    #algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
        sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:7]] = frequency_table[word_weight]

        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]] / sentence_wordcount_without_stop_words

       

    return sentence_weight


In [6]:
def _calculate_average_score(sentence_weight) -> int:
   
    #calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    #getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score

In [7]:
def _get_article_summary(sentences, sentence_weight, threshold):
    sentence_counter = 0
    article_summary = ''

    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1

    return article_summary

In [8]:

from nltk.cluster.util import cosine_distance

MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)


def normalize_whitespace(text):
    """
    Translates multiple whitespace into single space character.
    If there is at least one new line character chunk is replaced
    by single LF (Unix new line) character.
    """
    return MULTIPLE_WHITESPACE_PATTERN.sub(_replace_whitespace, text)


def _replace_whitespace(match):
    text = match.group()

    if "\n" in text or "\r" in text:
        return "\n"
    else:
        return " "


def is_blank(string):
    """
    Returns `True` if string contains only white-space characters
    or is empty. Otherwise `False` is returned.
    """
    return not string or string.isspace()


def get_symmetric_matrix(matrix):
    """
    Get Symmetric matrix
    :param matrix:
    :return: matrix
    """
    return matrix + matrix.T - np.diag(matrix.diagonal())


def core_cosine_similarity(vector1, vector2):
    """
    measure cosine similarity between two vectors
    :param vector1:
    :param vector2:
    :return: 0 < cosine similarity value < 1
    """
    return 1 - cosine_distance(vector1, vector2)


'''
Note: This is not a summarization algorithm. This Algorithm pics top sentences irrespective of the order they appeared.
'''


class TextRank4Sentences():
    def __init__(self):
        self.damping = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 100  # iteration steps
        self.text_str = None
        self.sentences = None
        self.pr_vector = None

    def _sentence_similarity(self, sent1, sent2, stopwords=None):
        if stopwords is None:
            stopwords = []

        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        # build the vector for the first sentence
        for w in sent1:
            if w in stopwords:
                continue
            vector1[all_words.index(w)] += 1

        # build the vector for the second sentence
        for w in sent2:
            if w in stopwords:
                continue
            vector2[all_words.index(w)] += 1

        return core_cosine_similarity(vector1, vector2)

    def _build_similarity_matrix(self, sentences, stopwords=None):
        # create an empty similarity matrix
        sm = np.zeros([len(sentences), len(sentences)])

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:
                    continue

                sm[idx1][idx2] = self._sentence_similarity(sentences[idx1], sentences[idx2], stopwords=stopwords)

        # Get Symmeric matrix
        sm = get_symmetric_matrix(sm)

        # Normalize matrix by column
        norm = np.sum(sm, axis=0)
        sm_norm = np.divide(sm, norm, where=norm != 0)  # this is ignore the 0 element in norm

        return sm_norm

    def _run_page_rank(self, similarity_matrix):

        pr_vector = np.array([1] * len(similarity_matrix))

        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr_vector = (1 - self.damping) + self.damping * np.matmul(similarity_matrix, pr_vector)
            if abs(previous_pr - sum(pr_vector)) < self.min_diff:
                break
            else:
                previous_pr = sum(pr_vector)

        return pr_vector

    def _get_sentence(self, index):

        try:
            return self.sentences[index]
        except IndexError:
            return ""

    def get_top_sentences(self, number=5):

        top_sentences = []

        if self.pr_vector is not None:

            sorted_pr = np.argsort(self.pr_vector)
            sorted_pr = list(sorted_pr)
            sorted_pr.reverse()

            index = 0
            for epoch in range(number):
                sent = self.sentences[sorted_pr[index]]
                sent = normalize_whitespace(sent)
                top_sentences.append(sent)
                index += 1

        return top_sentences

    def analyze(self, text, stop_words=None):
        self.text_str = text
        self.sentences = sent_tokenize(self.text_str)

        tokenized_sentences = [word_tokenize(sent) for sent in self.sentences]

        similarity_matrix = self._build_similarity_matrix(tokenized_sentences, stop_words)

        self.pr_vector = self._run_page_rank(similarity_matrix)


In [9]:
def _run_article_summary(article):
    
    #creating a dictionary for the word frequency table
    frequency_table = _create_dictionary_table(article)
 
    #tokenizing the sentences
    sentences = sent_tokenize(article)
    
    #algorithm for scoring a sentence by its words
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)

    #getting the threshold
    threshold = _calculate_average_score(sentence_scores)

    #producing the summary
    article_summary = _get_article_summary(sentences, sentence_scores, 1.5 * threshold)

    return article_summary,frequency_table,sentence_scores,threshold

In [10]:
if __name__== '__main__':
    summary_results,freq_table,sent_score,avg_score = _run_article_summary(article_content)
    print(summary_results)
    trial = TextRank4Sentences()
    trial.analyze(article_content)
    topn=trial.get_top_sentences(5)
    summary_topn=''
    for sentence in topn:
            summary_topn += " " + sentence
    print(summary_topn)

 Humans explored space for the first time, taking their first footsteps on the Moon. However, these same wars resulted in the destruction of the imperial system. The victorious Bolsheviks then established the Soviet Union, the world's first communist state. At the beginning of the period, the British Empire was the world's most powerful nation,[14] having acted as the world's policeman for the past century. In total, World War II left some 60 million people dead. With the Axis defeated and Britain and France rebuilding, the United States and the Soviet Union were left standing as the world's only superpowers. At the beginning of the century, strong discrimination based on race and sex was significant in general society. During the century, the social taboo of sexism fell. With the end of colonialism and the Cold War, nearly a billion people in Africa were left in new nation states after centuries of foreign domination. The world was undergoing its second major period of globalization; 

In [11]:
print(freq_table)

{'20th': 21, '(': 12, 'twentieth': 2, ')': 12, 'centuri': 47, 'began': 4, 'januari': 3, '1': 6, ',': 306, '1901': 3, 'mcmi': 1, 'end': 15, 'decemb': 3, '31': 3, '2000': 3, 'mm': 1, '.': 126, '[': 39, ']': 39, 'wa': 35, 'domin': 3, 'signific': 4, 'event': 1, 'defin': 1, 'era': 1, ':': 4, 'spanish': 1, 'flu': 1, 'pandem': 1, 'world': 48, 'war': 39, 'ii': 8, 'nuclear': 10, 'weapon': 6, 'power': 12, 'space': 4, 'explor': 3, 'nation': 18, 'decolon': 3, 'technolog': 21, 'advanc': 8, 'cold': 5, 'post-cold': 1, 'conflict': 8, 'saw': 2, 'massiv': 2, 'transform': 2, 'order': 2, 'global': 17, 'total': 3, 'fertil': 1, 'rate': 1, 'sea': 1, 'level': 2, 'rise': 3, 'ecolog': 2, 'collaps': 3, 'increas': 3, ';': 9, 'result': 10, 'competit': 3, 'land': 1, 'dwindl': 1, 'resourc': 3, 'acceler': 2, 'deforest': 1, 'water': 1, 'deplet': 1, 'mass': 3, 'extinct': 2, 'mani': 14, "'s": 20, 'speci': 1, 'declin': 2, 'popul': 11, 'consequ': 1, 'dealt': 1, 'man-mad': 1, 'warm': 2, 'extrem': 1, 'weather': 1, 'condit':

In [12]:
print(sent_score)

{'The 20t': 16.101888020833336, '[1]  Th': 19.875, 'Man-mad': 20.0, 'Additio': 17.88372093023256, 'Automob': 26.166666666666668, 'Great a': 25.90909090909091, 'The rep': 27.0, 'The Mar': 10.45945945945946, 'Through': 18.939393939393938, 'The dis': 16.28205128205128, 'It took': 14.877551020408163, '[6][7][': 19.416666666666668, 'Penicil': 10.818181818181818, '[9] Mac': 19.303030303030305, 'Trade i': 17.482758620689655, 'Until t': 16.289473684210527, '[10]\nTh': 28.666666666666668, '[11] Th': 27.65, 'It was ': 22.958333333333332, 'Unlike ': 27.68421052631579, 'The cen': 29.067857142857143, 'Nationa': 18.71875, 'Terms l': 28.59090909090909, 'Scienti': 23.25739644970414, 'Horses ': 20.56, 'These d': 18.214285714285715, 'Humans ': 47.8, 'Mass me': 19.70967741935484, 'Advance': 11.9, 'Rapid t': 30.157894736842106, 'World W': 26.12, 'However': 52.6, 'For the': 26.56, 'The las': 24.136363636363637, '[12]\nTh': 20.82051282051282, 'Technol': 24.0, 'After m': 16.238095238095237, 'In addi': 19.038

In [13]:
print(avg_score)

23.316197406733412


In [24]:
from rouge import Rouge 
rouge = Rouge()
scores = rouge.get_scores(summary_results, article_content)
print(scores)


[{'rouge-1': {'f': 0.15366772835813847, 'p': 1.0, 'r': 0.08322864321608041}, 'rouge-2': {'f': 0.14737452716112168, 'p': 0.9621212121212122, 'r': 0.07979893182532202}, 'rouge-l': {'f': 0.21442246139134, 'p': 1.0, 'r': 0.12008577555396711}}]
