In [1]:
import nltk
nltk.download('punkt') # one time execution
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer

from scipy.spatial.distance import cosine as cosine_diference

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import re
import numpy as np
import pandas as pd
import networkx as nx

[nltk_data] Downloading package punkt to /Users/morgnic/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
class TextSummarizer():
    def __init__(self, article):
        def generate_clean_sentences():
            """
            Purpose:
                1) Use nltk.sent_tokenize to split article into sentences
                2) Remove punctuation with nltk.RegExpTokenizer
                3) Remove sentences with less than 3 words
            """
            self.sentences = pd.Series(sent_tokenize(self.article))
            #not to remove numbers, nor periods & percentage signs, otherwise 1.5C would become 15c and 1% chance becomes 1 chance        
            tokenizer = RegexpTokenizer(r'\w+')
            clean_sentences = self.sentences.apply(
                lambda x: ' '.join(tokenizer.tokenize(x)))
            
            # remove short sentences
            msk = clean_sentences.str.len() >= 3
            self.clean_sentences = clean_sentences[msk]
            
        def rank_sentences():
            """
            Purpose:
                1) Use cosine similarity to rank sentence similarity
                2) Use PageRank algorithm via networkx to rank most important sentences
            """
            cv = CountVectorizer(stop_words='english')
            sentenceVectors = cv.fit_transform(self.clean_sentences)

            self.similarityMatrix = cosine_similarity(sentenceVectors,sentenceVectors)

            nx_graph = nx.from_numpy_array(self.similarityMatrix)
            scoreDict = nx.pagerank(nx_graph)
            self.scores = np.array(list(scoreDict.values()))
            
        
        self.article = article
        generate_clean_sentences()
        rank_sentences()
        
    def return_top_sentences(self, n):
        """
        Returns the top n sentences according to PageRank score on cosine similarity matrix
        """
        idx = self.scores.argsort()[:n]
        return [self.sentences[i] for i in idx]

        

In [3]:
def summarize_article(article, headline, n_sentences):
    ts = TextSummarizer(article)
    top = ts.return_top_sentences(n_sentences)
    print(headline,'\n')
    for sentence in top:
        print('\n\n\t', sentence)

In [4]:
df = pd.read_csv('data/articles1.csv', encoding='utf-8')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [6]:
idx = 0
article = df.loc[idx, 'content']
headline = df.loc[idx, 'title']


summarize_article(article, headline, 3)

House Republicans Fret About Winning Their Health Care Suit - The New York Times 



	 Insurers that receive the subsidies in exchange for paying    costs such as deductibles and   for eligible consumers could race to drop coverage since they would be losing money.


	 The Justice Department, confident that Judge Collyer’s decision would be reversed, quickly appealed, and the subsidies have remained in place during the appeal.


	 ” Republican leadership officials in the House acknowledge the possibility of “cascading effects” if the   payments, which have totaled an estimated $13 billion, are suddenly stopped.


In [7]:
idx = 5

article = df.loc[idx, 'content']
headline = df.loc[idx, 'title']


summarize_article(article, headline, 3)

Sick With a Cold, Queen Elizabeth Misses New Year’s Service - The New York Times 



	 The queen, who ascended to the throne in 1952, became the world’s   monarch following the death of King Bhumibol Adulyadej of Thailand in October.


	 She is also Britain’s   monarch, having last year surpassed Queen Victoria’s   reign.


	 The queen’s husband, Prince Philip, who had also been ill, was well enough to attend both services, in the church at Sandringham, which is in Norfolk, on the east coast of England.


In [8]:
idx = 77

article = df.loc[idx, 'content']
headline = df.loc[idx, 'title']


summarize_article(article, headline, 3)

Enough With the Tweets, China’s State Media Tells Trump - The New York Times 



	 And while Chinese politicians love slogans, they prefer to communicate with foreign leaders through long, tranquilizing disquisitions.


	 Open sarcasm is rare.


	 The service has been banned in China since 2009, though residents find ways to poke through the firewall of censorship.


In [9]:
idx = 97

article = df.loc[idx, 'content']
headline = df.loc[idx, 'title']


summarize_article(article, headline, 3)

L.I.R.R. Train That Crashed Was Going Over Twice Speed Limit, Inquiry Finds - The New York Times 



	 Officials at the authority are working to meet a 2018 deadline to install the technology.


	 The train rammed into a bumping block during the morning rush, striking a room beyond the track and causing the first two cars to derail.


	 They also intend to interview two railroad employees who witnessed the crash.


In [10]:
idx = 170

article = df.loc[idx, 'content']
headline = df.loc[idx, 'title']


summarize_article(article, headline, 3)

Death of Iran’s Rafsanjani Removes Influential Voice Against Hard-Liners - The New York Times 



	 “Many worked with him because of that support.


	 Who would now warn publicly against “Islamic fascism,” when the   sought to influence elections?


	 His death also reflects the dwindling number of leaders from the generation that overthrew the shah nearly four decades ago.
