#### Extractive text summary

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import re

In [2]:
# Load the news dataset into a DataFrame and preview its first rows.
# NOTE(review): the location below is an absolute, machine-specific path;
# point it at your own copy of data_news.csv before running elsewhere.
NEWS_CSV_PATH = 'C:/Users/User/Desktop/Text Summary/data/data_news.csv'
data = pd.read_csv(NEWS_CSV_PATH)
data.head(n=4)

Unnamed: 0,category,summary,text
0,entertainment,Nigel McCune from the Musicians' Union said Br...,Musicians to tackle US red tape Musicians' gr...
1,entertainment,But they still want more.They have to want to ...,"U2's desire to be number one U2, who have won..."
2,entertainment,"Babyshambles, which he formed after his acrimo...",Rocker Doherty in on-stage fight Rock singer ...
3,entertainment,A Series of Unfortunate Events also stars Scot...,Snicket tops US box office chart The film ada...


In [3]:
def build_similarity_matrix(sentences, stop_words):
    """Return the pairwise similarity matrix for ``sentences``.

    Entry ``[i][j]`` holds ``sentence_similarity(sentences[i], sentences[j],
    stop_words)`` for every ordered pair with ``i != j``. The diagonal is left
    at zero so a sentence is never scored against itself.

    Parameters
    ----------
    sentences : sequence
        Sentences to compare; each item is handed unchanged to
        ``sentence_similarity``.
    stop_words : iterable of str
        Forwarded to ``sentence_similarity`` as its stop-word argument.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(sentences), len(sentences))``.
    """
    size = len(sentences)
    matrix = np.zeros((size, size))

    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            if row != col:  # keep the diagonal at zero
                matrix[row][col] = sentence_similarity(first, second, stop_words)

    return matrix

In [4]:
def sentence_similarity(sent1, sent2, stopwords=None):
    """Cosine similarity between the word-count vectors of two sentences.

    Parameters
    ----------
    sent1, sent2 : str or iterable of str
        Either a raw sentence string, which is lower-cased and split into
        word tokens here, or an already tokenized sequence of words, which is
        only lower-cased.
    stopwords : iterable of str, optional
        Words to leave out of the comparison. They are matched against the
        lower-cased tokens. ``None`` means no words are ignored.

    Returns
    -------
    float
        Cosine of the angle between the two count vectors (1.0 for identical
        bags of words, 0.0 for no shared words). Returns 0.0 when either
        sentence has no words left after stop-word removal, instead of the
        NaN a 0/0 division would give.
    """
    ignored = set(stopwords) if stopwords is not None else set()

    def to_words(sentence):
        # A raw string has to be split into words first: iterating over it
        # directly yields single characters, which would compare letter
        # frequencies rather than vocabulary.
        if isinstance(sentence, str):
            return re.findall(r"\w+(?:'\w+)*", sentence.lower())
        return [w.lower() for w in sentence]

    words1 = [w for w in to_words(sent1) if w not in ignored]
    words2 = [w for w in to_words(sent2) if w not in ignored]

    # Map each distinct word to a vector position; the dict lookup replaces
    # a linear list.index() scan per word.
    position = {}
    for w in words1 + words2:
        position.setdefault(w, len(position))

    vector1 = np.zeros(len(position))
    vector2 = np.zeros(len(position))
    for w in words1:
        vector1[position[w]] += 1
    for w in words2:
        vector2[position[w]] += 1

    # Guard the degenerate case: an all-zero vector has no direction, so the
    # cosine is undefined; treat it as "no similarity".
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        return 0.0

    return float(np.dot(vector1, vector2) / norm_product)

In [5]:
def generate_summary(text, top_n=5):
    """Build an extractive summary of ``text`` from its top-ranked sentences.

    The text is split into sentences with ``sent_tokenize``, a pairwise
    similarity matrix is built with ``build_similarity_matrix`` (using NLTK's
    English stop-word list), the matrix is turned into a graph and scored
    with ``nx.pagerank``, and the best-scoring sentences are joined together.

    Parameters
    ----------
    text : str
        Document to summarise.
    top_n : int, default 5
        Maximum number of sentences to keep. When the text has fewer
        sentences, all of them are returned; a non-positive value yields an
        empty summary.

    Returns
    -------
    str
        The selected sentences in descending score order, separated by single
        spaces. An empty string when ``text`` contains no sentences.
    """
    stop_words = stopwords.words('english')

    # Step 1 - Split the text into sentences
    sentences = sent_tokenize(text)
    if not sentences:
        # Nothing to rank; avoids indexing into an empty ranking below.
        return ""

    # Step 2 - Generate the similarity matrix across sentences
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences: each sentence is a node, each similarity an edge weight
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort by score (ties fall back to the sentence text) and pick the top ones
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Clamp so short texts do not raise IndexError and negative top_n selects nothing.
    limit = max(0, min(top_n, len(ranked_sentence)))

    # Step 5 - Output the summary text
    return " ".join(sentence for _, sentence in ranked_sentence[:limit])

In [6]:
# Copy the article bodies and their reference summaries out of the DataFrame
# into plain Python lists (row order preserved) for positional indexing.
all_news_text = list(data.text.values)
all_summary_text = list(data.summary.values)

In [8]:
import nltk

# Fetch the NLTK resources generate_summary() relies on:
#   'stopwords' - read by stopwords.words('english')
#   'punkt'     - sentence tokenizer data used by sent_tokenize()
# Without the stopwords corpus a fresh environment fails with LookupError.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [9]:
# Demo: show one article next to its reference summary and the summary
# produced by generate_summary() (five sentences requested).

n = 15

article_text = all_news_text[n]
reference_summary = all_summary_text[n]
generated_summary = generate_summary(article_text, 5)

print(f"""
      Original Text: 
      
      {article_text}
      
      Original Summary: 
      
      {reference_summary}
      
      Generated Summary: 
      
      {generated_summary}
      
      """)


      Original Text: 
      
      Rapper Snoop Dogg sued for 'rape'  US rapper Snoop Dogg has been sued for $25m (£13m) by a make-up artist who claimed he and his entourage drugged and raped her two years ago.  The woman said she was assaulted after a recording of the Jimmy Kimmel Live TV show on the ABC network in 2003. The rapper's spokesman said the allegations were "untrue" and the woman was "misusing the legal system as a means of extracting financial gain". ABC said the claims had "no merit". The star has not been charged by police.  The lawsuit, filed in Los Angeles on Friday, says the woman's champagne was spiked and she was then assaulted. The rapper's spokesperson said: "Snoop will have the opportunity to prove in a court of law that [the alleged victim] is opportunistic and deceitful. "We are confident that in this case, [the alleged victim's] claims against Snoop Dogg will be rejected." The lawsuit names Snoop Dogg - real name Calvin Broadus - plus three associates, The W