## Importing Necessary Packages

In [1]:
# For collecting data from URL
import requests, bs4 

# To collect text from a file
from os.path import abspath

# Basic libraries
import numpy as np, pandas as pd, re
from math import log

# Summarization
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

import networkx as nx

from gensim import corpora
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from pylcs import lcs
from sklearn.feature_extraction.text import TfidfVectorizer
# from gensim.summarization import bm25
from rank_bm25 import BM25Plus

In [2]:
# Downloading the essential NLTK modules
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gulva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gulva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gulva\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gulva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## User Input Functions

In [3]:
def take_url():
    url = input("Enter the url :")
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.content, "html.parser")
    data = soup.text
    #print(data)
    return data

In [4]:
def file_input():
    file_location = input("Enter the path of file :")
    print(f"\nReading the file: {file_location}")
    with open(abspath(file_location), encoding="utf8") as f:
        data = "\n".join(f.readlines())
    return data

In [5]:
def take_input():
    print("Enter the method for text input :")
    print(" 1. From URL \n 2. From text file \n 3. Direct input \n Enter choice :")

    try:
        choice = int(input())
    except:
        return -1

    if(choice == 1):
        return take_url()
    
    elif(choice == 2):
        return file_input()
    
    elif(choice == 3):
        text = input("Enter the text : ")
        return text
    
    else:
        return -1

## POS Tagging Function

In [6]:
def get_wordnet_pos(word):
    
    # Map POS tag to first character lemmatize() accepts
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

## Similarity Matrix Functions

In [7]:
def similarity_original(text_data):
    
    sim = np.zeros([len(text_data), len(text_data)]) # Initialization
    for i, sentence_1 in enumerate(text_data):
        for j, sentence_2 in enumerate(text_data):
            sent_1 = set(sentence_1) # Unique words
            sent_2 = set(sentence_2)

            if(i == j):
                sim[i][j] = 0
            else:
                common = float(len(list(sent_1 & sent_2)))
                if(len(sentence_1) and len(sentence_2) > 1):
                    denominator = float(log(len(sentence_1)) + log(len(sentence_2)))
                else:
                    denominator = 1.0
                
                sim[i][j] = common / denominator
    return sim

In [8]:
def similarity_bm(text_data):
    
    dictionary = corpora.Dictionary(text_data) # BAG_OF_WORDS MODEL
    corpus = [dictionary.doc2bow(text) for text in text_data]
    # bm25_obj = bm25.BM25(corpus) #object
    bm25_obj = BM25Plus(corpus)

    similarity = []

    for i, sentence in enumerate(text_data):
        query = dictionary.doc2bow(sentence)
        score = bm25_obj.get_scores(query)
        similarity.append(score)
    
    sim = np.array(similarity)    
    return sim

## Main Summarization Function

In [9]:
def summarize(string, ratio = 0.2): 
    
    # Tokenization
    sentences = sent_tokenize(string) 
    sentences_clean = [re.sub(r'[^\w\s]','',sentence.lower()) for sentence in sentences]

    # Stop words removal
    stop_words = stopwords.words('english')
    sentence_tokens = [[words for words in sentence.split(' ') if words not in stop_words] for sentence in sentences_clean]

    # POS Tagging and Lemmatization
    text_data = []
    lemmatizer = WordNetLemmatizer() #object
    count = 0
    
    for sentence in sentence_tokens:
        sample_list = [word for word in sentence if word]
        tags = pos_tag(sample_list)
        line = []
        for word, tag in tags: 
            count += 1
            line.append(lemmatizer.lemmatize(word, pos = get_wordnet_pos(tag)))
        text_data.append(line) 

    # Similarity Matrix
    sim_a = similarity_original(text_data)
    sim_d = similarity_bm(text_data)

    # Normalization
    sim_a = sim_a / sim_a.max()    
    sim_d = sim_d / sim_d.max()    
    
    # Combination
    similarity_matrix = (sim_a + sim_d)
    
    # Page Rank
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph, max_iter = 600)

    # Best sentences
    top_sentence = {sentence:scores[index] for index,sentence in enumerate(sentences)}
    number = int(len(sentence_tokens)*(ratio))
    top = dict(sorted(top_sentence.items(), key=lambda x: x[1], reverse=True)[:number])
    text_list = []

    for sent in sentences:
        if sent in top.keys():
            text_list.append(sent)

    summary = "\n".join(text_list)
    return summary

## Main

In [10]:
def main():
    text = take_input()
    if (text == -1):
        print("Wrong Input")
    else:
        summary = summarize(text)
        print("\nSummary :\n", summary)

In [11]:
if __name__ == "__main__":
    main()

Enter the method for text input :
 1. From URL 
 2. From text file 
 3. Direct input 
 Enter choice :

Reading the file: trial_run/french_revolution.txt

Summary :
 Faced with the heavy expenditure that the wars of the 18th century entailed, the rulers of Europe sought to raise money by taxing the nobles and clergy, who in most countries had hitherto been exempt, To justify this, the rulers likewise invoked the arguments of advanced thinkers by adopting the role of “enlightened despots.” This provoked reaction throughout Europe from the privileged bodies, diets.
Although scholarly debate continues about the exact causes of the Revolution, the following reasons are commonly adduced: (1) the bourgeoisie resented its exclusion from political power and positions of honour; (2) the peasants were acutely aware of their situation and were less and less willing to support the anachronistic and burdensome feudal system; (3) the philosophes had been read more widely in France than anywhere else;