In [1]:
import sys, os
ON_COLAB = 'google.colab' in sys.modules

if ON_COLAB:
    GIT_ROOT = 'https://github.com/furyhawk/text_summarization/raw/master'
    os.system(f'wget {GIT_ROOT}/notebooks/setup.py')

%run -i setup.py

You are working on a local system.
Files will be searched relative to "..".


In [2]:
%run "$BASE_DIR/settings.py"

%reload_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'png'

# to print output of all statements and not just the last
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# otherwise text between $ signs will be interpreted as formula and printed in italic
pd.set_option('display.html.use_mathjax', False)

# path to import blueprints packages
sys.path.append(BASE_DIR + '/packages')

In [3]:
# adjust matplotlib resolution
matplotlib.rcParams.update({'figure.dpi': 200 })

In [4]:
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
import matplotlib.pyplot as plt
import html
import re
import random
import rouge_score
import wikipediaapi

In [5]:
import requests
from bs4 import BeautifulSoup
import os.path
from dateutil import parser
import pandas as pd
import numpy as np

In [6]:
pd.options.display.max_colwidth = 150 ###
file = "un-general-debates-blueprint.csv"
file = f"{BASE_DIR}/data/un-general-debates/un-general-debates.csv.zip" ### real location
df = pd.read_csv(file)
df.sample(2, random_state=53)

Unnamed: 0,session,year,country,text
3871,42,1987,HND,"﻿On behalf of the Government of Honduras I extend to you, Sir, sincere congratulations on your election as President of the forty-second session o..."
4697,35,1980,THA,﻿My delegation takes pleasure in extending to Mr. Riidiger von Wechmar its sincere congratulations on his unanimous election to the presidency of ...


# Summarizing text using topic representation
## Identifying important words with TF-IDF values


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
nltk.download('punkt')


sample_text = df['text'].iloc[1]
print("\nSample:", sample_text)
sentences = tokenize.sent_tokenize(sample_text)
tfidfVectorizer = TfidfVectorizer()
words_tfidf = tfidfVectorizer.fit_transform(sentences)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\furyx\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True


Sample: ﻿
May I begin by congratulating you. Sir, on your election to the presidency of the General Assembly at its forty-fourth session. Your wide experience in the United Nations makes you especially qualified to guide the Assembly and its deliberations. You have my best wishes for the demanding task that lies ahead of you. May I also express my thanks to the outgoing President, Mr. Dante Caputo, who so ably fulfilled his role on behalf of the Assembly. It gives me particular pleasure to pay tribute to the Secretary-General. We owe a great deal to his authority and leadership.
As we are assembled here for the annual general debate, every Member state needs to reflect on certain fundamental questions. Is the United Nations doing everything possible in order to meet its challenges? Are we making progress? Can we say that the United Nations enters the last decade of this century with determination and justified self-confidence? Is my own country doing everything it can in order to help

In [8]:
# Parameter to specify number of summary sentences required
num_summary_sentence = 3

# Sort the sentences in descending order by the sum of TF-IDF values
sent_sum = words_tfidf.sum(axis=1)
important_sent = np.argsort(sent_sum, axis=0)[::-1]

# Print three most important sentences in the order they appear in the article
for i in range(0, len(sentences)):
    if i in important_sent[:num_summary_sentence]:
        print (sentences[i])

Both industrialized and developing countries are directly affected by the depletion of the ozone layer through increasing emissions of carbon dioxides, which produce the so-called greenhouse effect in the climate of the Earth, and by the increasing presence of substances in the atmosphere that cause acid rain and direct dangers to health.
At the Conference the representatives of the more than 80 States present agreed unanimously to the Helsinki Declaration on the Protection of  the Ozone Layer, which contains an ambitious undertaking to phase out completely the use of ozone-depleting chlorofluorocarbons by the year 2000, or sooner if possible, and to phase out other ozone-depleting substances as soon as possible.
It is still too early to draw conclusions from the whole process of transition to independence, but we believe that the programme has been brought back on track thanks to the persistent efforts of the Secretary-General and the competent performance of the UNTAG personnel.


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize

def tfidf_summary(text, num_summary_sentence):
    summary_sentence = []
    sentences = tokenize.sent_tokenize(text)
    tfidfVectorizer = TfidfVectorizer()
    words_tfidf = tfidfVectorizer.fit_transform(sentences)
    sentence_sum = words_tfidf.sum(axis=1)
    important_sentences = np.argsort(sentence_sum, axis=0)[::-1]
    for i in range(0, len(sentences)):
        if i in important_sentences[:num_summary_sentence]:
            summary_sentence.append(sentences[i])
    return summary_sentence

In [16]:
from preparation import clean

# Parameter to specify number of summary sentences required
num_summary_sentence = 5

where = df['country'].isin(['SGP'])
sgp_df = data=df[where]
sgp_df.head()

sample_text = clean(df.query("year==2014 and country=='SGP'")['text'].values[0])
test_text = tfidf_summary(sample_text, num_summary_sentence)

print("\ntest text: ", test_text)

Unnamed: 0,session,year,country,text
64,44,1989,SGP,"﻿First of all, let me congratulate you on your election to the presidency of the General Assembly at its forty-fourth session. Your unique and ill..."
160,25,1970,SGP,"Mr. President, instead of offering you my congratulations as preceding speakers have done, I wish to thank you on behalf of my delegation. We tha..."
240,68,2013,SGP,I warmly \ncongratulate Mr. John Ashe on his election as President \nat the sixty-eighth session of the General Assembly and \nwish him success du...
449,40,1985,SGP,Allow me to congratulate the President on his assumption of this high office. There are few men in this hall who have been associated with the Uni...
661,63,2008,SGP,The collapse of the recent \nDoha talks is deeply troubling. The Doha Development \nRound was the first round of talks to be launched after \nthe ...



test text:  ['Security Council resolution 2178 (2014), on foreign terrorist fighters, which Singapore co-sponsored, is an important step to combat global terrorism and will be critical to cutting off financial and material support for ISIS and preventing the movement of foreign terrorist fighters.', 'Regarding the principles for sustainable development, Singapore is participating actively in the discussions for the post-2015 period, focusing on those areas where we can contribute — key principles that were critical to our development path and might be relevant to other developing countries.', 'As noted on page 17 of the report of the High-level Panel of Eminent Persons on the Post-2015 Development Agenda, cities are where the battle for sustainable development will be won or lost.', 'This year, we will observe that Day together with relevant United Nations agencies and civil society organizations by focusing on the serious problems faced by women and girls who lack basic sanitation ac

In [34]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

from sumy.summarizers.lsa import LsaSummarizer

LANGUAGE = "english"
stemmer = Stemmer(LANGUAGE)

parser = PlaintextParser.from_string(article1['text'], Tokenizer(LANGUAGE))
summarizer = LsaSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, num_summary_sentence):
    print (str(sentence))

LONDON/SAN FRANCISCO (Reuters) - U.S. President Donald Trump has blocked microchip maker Broadcom Ltd’s (AVGO.O) $117 billion takeover of rival Qualcomm (QCOM.O) amid concerns that it would give China the upper hand in the next generation of mobile communications, or 5G.
Moving to new networks promises to enable new mobile services and even whole new business models, but could pose challenges for countries and industries unprepared to invest in the transition.
The concern is that a takeover by Singapore-based Broadcom could see the firm cut research and development spending by Qualcomm or hive off strategically important parts of the company to other buyers, including in China, U.S. officials and analysts have said.


In [35]:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.lsa import LsaSummarizer

def lsa_summary(text, num_summary_sentence):
    summary_sentence = []
    LANGUAGE = "english"
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, num_summary_sentence):
        summary_sentence.append(str(sentence))
    return summary_sentence

In [24]:
def download_article(url):
    # check if article already there
    filename = url.split("/")[-1] + ".html"
    filename = f"{BASE_DIR}/notebooks/" + filename
    if not os.path.isfile(filename):
        r = requests.get(url)
        with open(filename, "w+") as f:
            f.write(r.text)
    return filename


In [25]:
def parse_article(article_file):
    with open(article_file, "r") as f:
        html = f.read()
    r = {}
    soup = BeautifulSoup(html, 'html.parser')
    r['id'] = soup.select_one("div.StandardArticle_inner-container")['id']
    r['url'] = soup.find("link", {'rel': 'canonical'})['href']
    r['headline'] = soup.h1.text
    r['section'] = soup.select_one("div.ArticleHeader_channel a").text
    
    r['text'] = soup.select_one("div.StandardArticleBody_body").text
    r['authors'] = [a.text 
                    for a in soup.select("div.BylineBar_first-container.ArticleHeader_byline-bar\
                                          div.BylineBar_byline span")]
    r['time'] = soup.find("meta", { 'property': "og:article:published_time"})['content']
    return r

In [28]:
import reprlib
r = reprlib.Repr()
r.maxstring = 800

url1 = "https://www.reuters.com/article/us-qualcomm-m-a-broadcom-5g/what-is-5g-and-who-are-the-major-players-idUSKCN1GR1IN"
article_name1 = download_article(url1)
article1 = parse_article(article_name1)
print ('Article Published on', r.repr(article1['time']))
print (r.repr(article1['text']))

Article Published on '2018-03-15T11:36:28+0000'
'LONDON/SAN FRANCISCO (Reuters) - U.S. President Donald Trump has blocked microchip maker Broadcom Ltd’s (AVGO.O) $117 billion takeover of rival Qualcomm (QCOM.O) amid concerns that it would give China the upper hand in the next generation of mobile communications, or 5G. A 5G sign is seen at the Mobile World Congress in Barcelona, Spain February 28, 2018. REUTERS/Yves HermanBelow are some facts... 4G wireless and looks set to top the list of patent holders heading into the 5G cycle. Huawei, Nokia, Ericsson and others are also vying to amass 5G patents, which has helped spur complex cross-licensing agreements like the deal struck late last year Nokia and Huawei around handsets. Editing by Kim Miyoung in Singapore and Jason Neely in LondonOur Standards:The Thomson Reuters Trust Principles.'


In [29]:
print(article1['text'])



LONDON/SAN FRANCISCO (Reuters) - U.S. President Donald Trump has blocked microchip maker Broadcom Ltd’s (AVGO.O) $117 billion takeover of rival Qualcomm (QCOM.O) amid concerns that it would give China the upper hand in the next generation of mobile communications, or 5G. A 5G sign is seen at the Mobile World Congress in Barcelona, Spain February 28, 2018. REUTERS/Yves HermanBelow are some facts about 5G and major players. WHAT IS 5G? 5G networks, now in the final testing stage, will rely on denser arrays of small antennas and the cloud to offer data speeds up to 50 or 100 times faster than current 4G networks and serve as critical infrastructure for a range of industries. Deals to start building mass-market 5G networks are still largely a year away, but by 2025, 1.2 billion people are set to have access to 5G networks - a third of them in China, according to the global wireless trade group GSMA. Moving to new networks promises to enable new mobile services and even whole new business m

In [30]:

r.maxstring = 800
url2 = "https://www.reuters.com/article/us-usa-economy-watchlist-graphic/predicting-the-next-u-s-recession-idUSKCN1V31JE"
article_name2 = download_article(url2)
article2 = parse_article(article_name2)
print ('Article Published', r.repr(article1['time']))
print (r.repr(article2['text']))


Article Published '2018-03-15T11:36:28+0000'
'NEW YORK A protracted trade war between China and the United States, the world’s largest economies, and a deteriorating global growth outlook has left investors apprehensive about the end to the longest expansion in American history. FILE PHOTO: Ships and shipping containers are pictured at the port of Long Beach in Long Beach, California, U.S., January 30, 2019.   REUTERS/Mike BlakeThe recent ...hton wrote in the June Cass Freight Index report.  12. MISERY INDEX The so-called Misery Index adds together the unemployment rate and the inflation rate. It typically rises during recessions and sometimes prior to downturns. It has slipped lower in 2019 and does not look very miserable.  Reporting by Saqib Iqbal Ahmed; Editing by Chizu NomiyamaOur Standards:The Thomson Reuters Trust Principles.'


In [39]:

summary_sentence = tfidf_summary(sample_text, num_summary_sentence)
for sentence in summary_sentence:
    print (sentence)

Security Council resolution 2178 (2014), on foreign terrorist fighters, which Singapore co-sponsored, is an important step to combat global terrorism and will be critical to cutting off financial and material support for ISIS and preventing the movement of foreign terrorist fighters.
Regarding the principles for sustainable development, Singapore is participating actively in the discussions for the post-2015 period, focusing on those areas where we can contribute — key principles that were critical to our development path and might be relevant to other developing countries.
This year, we will observe that Day together with relevant United Nations agencies and civil society organizations by focusing on the serious problems faced by women and girls who lack basic sanitation access.


In [40]:
from sumy.summarizers.text_rank import TextRankSummarizer

parser = PlaintextParser.from_string(sample_text, Tokenizer(LANGUAGE))
summarizer = TextRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, num_summary_sentence):
    print (str(sentence))

Regarding the principles for sustainable development, Singapore is participating actively in the discussions for the post-2015 period, focusing on those areas where we can contribute — key principles that were critical to our development path and might be relevant to other developing countries.
The outcome document of the United Nations Conference on Sustainable Development (resolution 66/288, annex) itself recognizes that well-planned cities promote economically, socially and environmentally sustainable societies.
In 1992, we set up the Singapore Cooperation Programme to share our development experience, the successes and the failures alike, with other developing countries.


In [38]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.text_rank import TextRankSummarizer

def textrank_summary(text, num_summary_sentence):
    summary_sentence = []
    LANGUAGE = "english"
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, num_summary_sentence):
        summary_sentence.append(str(sentence))
    return summary_sentence

In [42]:
parser = PlaintextParser.from_string(article1['text'], Tokenizer(LANGUAGE))
summarizer = TextRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, num_summary_sentence):
    print (str(sentence))

Acquiring Qualcomm would represent the jewel in the crown of Broadcom’s portfolio of communications chips, which supply wi-fi, power management, video and other features in smartphones alongside Qualcomm’s core baseband chips - radio modems that wirelessly connect phones to networks.
Qualcomm (QCOM.O) is the dominant player in smartphone communications chips, making half of all core baseband radio chips in smartphones.
Slideshow (2 Images)The standards are set by a global body to ensure all phones work across different mobile networks, and whoever’s essential patents end up making it into the standard stands to reap huge royalty licensing revenue streams.


# Measuring the performance of Text Summarization methods

In [44]:
def print_rouge_score(rouge_score):
    for k,v in rouge_score.items():
        print (k, 'Precision:', "{:.2f}".format(v.precision), 'Recall:', "{:.2f}".format(v.recall), 'fmeasure:', "{:.2f}".format(v.fmeasure))

In [45]:
num_summary_sentence = 3 ##
gold_standard = article2['headline']
summary = ""

summary = ''.join(textrank_summary(article2['text'], num_summary_sentence))
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
scores = scorer.score(gold_standard, summary)
print_rouge_score(scores)

rouge1 Precision: 0.06 Recall: 0.83 fmeasure: 0.11


In [64]:
# 

type_of_article = 'business'  # entertainment, politices, sport, tech
root_path = f'../data/BBC News Summary'
num_of_article = len(os.listdir(f"{root_path}/News Articles/{type_of_article}"))

print(f'"Reading{type_of_article} articles"')
df = pd.DataFrame(columns=['title','article','summary'])

for i in tqdm(range(num_of_article)):
    with open(f'{root_path}/News Articles/{type_of_article}/{(i+1):03d}.txt', 'r') as f:
        article = f.read() #.splitlines()
    with open(f'{root_path}/Summaries/{type_of_article}/{(i+1):03d}.txt', 'r') as f:
        summary = f.read() #.splitlines()
    if len(summary) >=2:
        print(i)
    df.loc[i] = [article[0],article[2:],summary]

"Readingbusiness articles"


  0%|          | 2/510 [00:00<00:27, 18.78it/s]

0
1
2


  1%|          | 4/510 [00:00<00:34, 14.88it/s]

3
4


  2%|▏         | 8/510 [00:00<00:38, 12.90it/s]

5
6
7


  2%|▏         | 10/510 [00:00<00:38, 12.93it/s]

8
9
10
11


  3%|▎         | 14/510 [00:00<00:32, 15.07it/s]

12
13
14
15


  4%|▎         | 19/510 [00:01<00:36, 13.46it/s]

16
17
18


  4%|▍         | 21/510 [00:01<00:37, 13.12it/s]

19
20
21


  5%|▍         | 25/510 [00:01<00:35, 13.81it/s]

22
23
24
25


  5%|▌         | 27/510 [00:02<00:37, 12.81it/s]

26
27
28


  6%|▋         | 33/510 [00:02<00:32, 14.46it/s]

29
30
31
32


  7%|▋         | 35/510 [00:02<00:37, 12.52it/s]

33
34


  7%|▋         | 37/510 [00:02<00:39, 11.83it/s]

35
36
37


  8%|▊         | 41/510 [00:03<00:37, 12.46it/s]

38
39
40
41


  9%|▊         | 44/510 [00:03<00:31, 14.99it/s]

42
43
44


  9%|▉         | 46/510 [00:03<00:33, 13.67it/s]

45
46
47


 10%|▉         | 50/510 [00:03<00:36, 12.50it/s]

48
49
50
51

 10%|█         | 52/510 [00:03<00:33, 13.84it/s]


52
53


 11%|█         | 56/510 [00:04<00:36, 12.52it/s]

54
55
56


 12%|█▏        | 60/510 [00:04<00:37, 12.04it/s]

57
58
59


 12%|█▏        | 62/510 [00:04<00:39, 11.22it/s]

60
61
62


 13%|█▎        | 64/510 [00:04<00:37, 12.04it/s]

63
64


 13%|█▎        | 66/510 [00:05<00:45,  9.71it/s]

65
66
67


 14%|█▎        | 70/510 [00:05<00:41, 10.63it/s]

68
69


 14%|█▍        | 72/510 [00:05<00:43,  9.99it/s]

70
71


 15%|█▍        | 74/510 [00:06<00:42, 10.32it/s]

72
73


 15%|█▍        | 76/510 [00:06<00:49,  8.71it/s]

74
75
76
77


 16%|█▌        | 80/510 [00:06<00:37, 11.43it/s]

78
79
80


 16%|█▌        | 82/510 [00:06<00:37, 11.57it/s]

81
82


 17%|█▋        | 86/510 [00:07<00:39, 10.80it/s]

83
84
85


 17%|█▋        | 88/510 [00:07<00:39, 10.63it/s]

86
87
88


 18%|█▊        | 92/510 [00:07<00:36, 11.50it/s]

89
90
91
92


 18%|█▊        | 94/510 [00:07<00:34, 12.14it/s]

93
94
95


 19%|█▉        | 98/510 [00:08<00:38, 10.76it/s]

96
97


 20%|█▉        | 100/510 [00:08<00:37, 10.96it/s]

98
99
100
101


 20%|██        | 104/510 [00:08<00:40,  9.94it/s]

102
103


 21%|██        | 106/510 [00:09<00:40,  9.93it/s]

104
105
106
107


 21%|██▏       | 109/510 [00:09<00:33, 12.08it/s]

108
109
110


 22%|██▏       | 113/510 [00:09<00:37, 10.65it/s]

111
112
113

 23%|██▎       | 115/510 [00:09<00:34, 11.41it/s]


114
115
116


 24%|██▎       | 121/510 [00:10<00:26, 14.72it/s]

117
118
119
120


 24%|██▍       | 123/510 [00:10<00:28, 13.66it/s]

121
122
123

 25%|██▍       | 125/510 [00:10<00:30, 12.48it/s]


124
125

 25%|██▍       | 127/510 [00:10<00:35, 10.89it/s]


126
127

 25%|██▌       | 129/510 [00:10<00:34, 10.92it/s]


128
129


 26%|██▌       | 133/510 [00:11<00:31, 12.13it/s]

130
131
132
133


 26%|██▋       | 135/510 [00:11<00:30, 12.37it/s]

134
135
136


 27%|██▋       | 139/510 [00:11<00:31, 11.63it/s]

137
138
139


 28%|██▊       | 141/510 [00:11<00:34, 10.78it/s]

140
141
142


 28%|██▊       | 145/510 [00:12<00:33, 10.99it/s]

143
144
145


 29%|██▉       | 149/510 [00:12<00:29, 12.23it/s]

146
147
148


 30%|██▉       | 151/510 [00:12<00:27, 13.24it/s]

149
150
151
152


 30%|███       | 155/510 [00:12<00:27, 12.83it/s]

153
154
155


 31%|███       | 159/510 [00:13<00:26, 13.29it/s]

156
157
158


 32%|███▏      | 163/510 [00:13<00:25, 13.58it/s]

159
160
161
162
163


 32%|███▏      | 165/510 [00:13<00:27, 12.55it/s]

164
165
166


 33%|███▎      | 169/510 [00:14<00:27, 12.53it/s]

167
168
169
170


 34%|███▍      | 173/510 [00:14<00:25, 13.46it/s]

171
172


 34%|███▍      | 175/510 [00:14<00:24, 13.81it/s]

173
174
175


 35%|███▌      | 179/510 [00:14<00:27, 12.24it/s]

176
177
178


 35%|███▌      | 181/510 [00:15<00:27, 12.07it/s]

179
180
181


 36%|███▌      | 183/510 [00:15<00:26, 12.52it/s]

182
183
184

 37%|███▋      | 187/510 [00:15<00:24, 12.95it/s]


185
186
187


 37%|███▋      | 189/510 [00:15<00:23, 13.45it/s]

188
189


 37%|███▋      | 191/510 [00:15<00:28, 11.17it/s]

190
191
192


 38%|███▊      | 194/510 [00:16<00:27, 11.66it/s]

193
194


 39%|███▉      | 198/510 [00:16<00:25, 12.27it/s]

195
196
197
198


 40%|███▉      | 203/510 [00:16<00:20, 15.27it/s]

199
200
201
202
203


 40%|████      | 206/510 [00:16<00:18, 16.72it/s]

204
205
206
207
208


 42%|████▏     | 212/510 [00:17<00:15, 18.76it/s]

209
210
211
212
213


 43%|████▎     | 218/510 [00:17<00:13, 21.12it/s]

214
215
216
217
218


 43%|████▎     | 221/510 [00:17<00:14, 20.31it/s]

219
220
221
222


 44%|████▍     | 224/510 [00:17<00:14, 19.93it/s]

223
224
225
226


 45%|████▌     | 230/510 [00:18<00:14, 19.06it/s]

227
228
229
230
231


 46%|████▌     | 235/510 [00:18<00:14, 19.44it/s]

232
233
234
235


 47%|████▋     | 240/510 [00:18<00:12, 21.21it/s]

236
237
238
239
240
241


 48%|████▊     | 246/510 [00:18<00:13, 19.64it/s]

242
243
244
245


 49%|████▉     | 249/510 [00:19<00:13, 18.91it/s]

246
247
248
249


 50%|████▉     | 254/510 [00:19<00:12, 20.15it/s]

250
251
252
253
254


 51%|█████     | 260/510 [00:19<00:11, 21.75it/s]

255
256
257
258
259
260


 52%|█████▏    | 266/510 [00:19<00:10, 22.71it/s]

261
262
263
264
265


 53%|█████▎    | 269/510 [00:19<00:10, 22.28it/s]

266
267
268
269
270


 54%|█████▍    | 275/510 [00:20<00:10, 22.55it/s]

271
272
273
274
275


 55%|█████▌    | 281/510 [00:20<00:10, 21.97it/s]

276
277
278
279
280


 56%|█████▌    | 284/510 [00:20<00:11, 20.51it/s]

281
282
283
284

 56%|█████▋    | 287/510 [00:20<00:12, 18.46it/s]


285
286
287


 57%|█████▋    | 290/510 [00:20<00:11, 19.41it/s]

288
289
290
291


 58%|█████▊    | 294/510 [00:21<00:13, 15.76it/s]

292
293
294


 58%|█████▊    | 298/510 [00:21<00:14, 14.60it/s]

295
296
297
298


 59%|█████▉    | 302/510 [00:21<00:15, 13.75it/s]

299
300
301


 60%|█████▉    | 304/510 [00:22<00:15, 13.11it/s]

302
303
304


 60%|██████    | 308/510 [00:22<00:14, 13.62it/s]

305
306
307
308
309


 61%|██████▏   | 313/510 [00:22<00:14, 13.32it/s]

310
311
312
313


 62%|██████▏   | 315/510 [00:23<00:20,  9.72it/s]

314
315


 63%|██████▎   | 319/510 [00:23<00:18, 10.11it/s]

316
317
318
319


 63%|██████▎   | 323/510 [00:23<00:16, 11.44it/s]

320
321
322


 64%|██████▎   | 325/510 [00:23<00:16, 11.01it/s]

323
324


 64%|██████▍   | 327/510 [00:24<00:17, 10.22it/s]

325
326
327


 65%|██████▍   | 329/510 [00:24<00:17, 10.06it/s]

328
329
330


 65%|██████▌   | 333/510 [00:24<00:16, 10.61it/s]

331
332
333


 66%|██████▌   | 337/510 [00:25<00:14, 12.26it/s]

334
335
336
337


 67%|██████▋   | 341/510 [00:25<00:12, 13.56it/s]

338
339
340
341


 68%|██████▊   | 345/510 [00:25<00:14, 11.22it/s]

342
343
344


 68%|██████▊   | 347/510 [00:25<00:14, 11.49it/s]

345
346


 68%|██████▊   | 349/510 [00:26<00:15, 10.48it/s]

347
348


 69%|██████▉   | 351/510 [00:26<00:17,  9.32it/s]

349
350
351
352


 70%|██████▉   | 356/510 [00:26<00:12, 12.09it/s]

353
354
355


 71%|███████   | 360/510 [00:27<00:11, 12.56it/s]

356
357
358
359


 71%|███████   | 362/510 [00:27<00:11, 12.43it/s]

360
361
362


 72%|███████▏  | 366/510 [00:27<00:10, 14.01it/s]

363
364
365
366
367


 73%|███████▎  | 372/510 [00:27<00:08, 16.73it/s]

368
369
370
371


 73%|███████▎  | 374/510 [00:27<00:07, 17.34it/s]

372
373
374


 74%|███████▍  | 378/510 [00:28<00:09, 14.29it/s]

375
376
377


 75%|███████▍  | 380/510 [00:28<00:09, 13.69it/s]

378
379
380


 75%|███████▍  | 382/510 [00:28<00:09, 13.08it/s]

381
382
383


 76%|███████▌  | 386/510 [00:28<00:09, 12.85it/s]

384
385
386
387


 77%|███████▋  | 391/510 [00:29<00:08, 14.45it/s]

388
389
390


 77%|███████▋  | 393/510 [00:29<00:08, 13.00it/s]

391
392
393


 78%|███████▊  | 397/510 [00:29<00:08, 13.06it/s]

394
395
396


 78%|███████▊  | 399/510 [00:29<00:08, 12.88it/s]

397
398
399


 79%|███████▉  | 403/510 [00:30<00:07, 14.09it/s]

400
401
402
403


 80%|███████▉  | 407/510 [00:30<00:06, 15.22it/s]

404
405
406
407
408


 80%|████████  | 410/510 [00:30<00:06, 16.15it/s]

409
410
411


 81%|████████  | 414/510 [00:30<00:07, 12.62it/s]

412
413
414
415


 82%|████████▏ | 418/510 [00:31<00:07, 12.83it/s]

416
417
418
419


 83%|████████▎ | 422/510 [00:31<00:06, 13.08it/s]

420
421
422
423


 84%|████████▎ | 426/510 [00:31<00:06, 12.50it/s]

424
425


 84%|████████▍ | 428/510 [00:31<00:06, 12.39it/s]

426
427
428


 85%|████████▍ | 432/510 [00:32<00:06, 11.43it/s]

429
430
431


 85%|████████▌ | 434/510 [00:32<00:06, 12.38it/s]

432
433
434


 85%|████████▌ | 436/510 [00:32<00:06, 12.23it/s]

435
436
437


 86%|████████▋ | 440/510 [00:32<00:05, 13.43it/s]

438
439
440


 87%|████████▋ | 444/510 [00:33<00:05, 12.41it/s]

441
442
443


 87%|████████▋ | 446/510 [00:33<00:05, 11.78it/s]

444
445


 88%|████████▊ | 448/510 [00:33<00:05, 10.84it/s]

446
447
448


 89%|████████▊ | 452/510 [00:33<00:04, 12.99it/s]

449
450
451
452


 89%|████████▉ | 455/510 [00:34<00:03, 14.11it/s]

453
454
455


 90%|█████████ | 459/510 [00:34<00:03, 13.77it/s]

456
457
458


 90%|█████████ | 461/510 [00:34<00:03, 13.88it/s]

459
460
461


 91%|█████████ | 465/510 [00:34<00:03, 12.39it/s]

462
463
464


 92%|█████████▏| 468/510 [00:35<00:02, 14.08it/s]

465
466
467
468

 92%|█████████▏| 470/510 [00:35<00:02, 13.75it/s]


469
470
471


 93%|█████████▎| 474/510 [00:35<00:02, 14.56it/s]

472
473
474


 94%|█████████▎| 478/510 [00:35<00:02, 13.44it/s]

475
476
477
478


 95%|█████████▍| 483/510 [00:36<00:01, 16.26it/s]

479
480
481
482


 95%|█████████▌| 485/510 [00:36<00:01, 15.43it/s]

483
484
485
486


 96%|█████████▌| 489/510 [00:36<00:01, 15.81it/s]

487
488
489
490


 97%|█████████▋| 493/510 [00:36<00:01, 14.56it/s]

491
492
493
494


 97%|█████████▋| 497/510 [00:37<00:00, 13.63it/s]

495
496
497


 98%|█████████▊| 501/510 [00:37<00:00, 14.28it/s]

498
499
500
501


 99%|█████████▉| 505/510 [00:37<00:00, 16.59it/s]

502
503
504
505
506


100%|██████████| 510/510 [00:37<00:00, 13.49it/s]


507
508
509


In [60]:
df.head()

Unnamed: 0,title,article,summary
0,A,"sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December,...",T
1,D,llar gains on Greenspan speech\n\nThe dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said...,T
2,Y,kos unit buyer faces loan claim\n\nThe owners of embattled Russian oil giant Yukos are to ask the buyer of its former production unit to pay back ...,Y
3,H,gh fuel prices hit BA's profits\n\nBritish Airways has blamed high fuel prices for a 40% drop in profits.\n\nReporting its results for the three m...,R
4,P,rnod takeover talk lifts Domecq\n\nShares in UK drinks and food firm Allied Domecq have risen on speculation that it could be the target of a take...,P


In [70]:
def print_article(article):
    for item in article:
        print(item)

sample_text = clean(df['article'][0])
print( sample_text)

sales boost Time Warner profit Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier. The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL. Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's ex

In [66]:
sample_summary = clean(df['summary'][0])
print(sample_summary)

TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn.Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues.Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins.It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters.Time Warner's fourth quarter profits were slightly better than analysts' expectations.


In [72]:
num_summary_sentence = 6 ##
gold_standard = sample_summary
summary = ""

summary = ''.join(textrank_summary(sample_text, num_summary_sentence))
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
scores = scorer.score(gold_standard, summary)
print_rouge_score(scores)
print(summary)

rouge1 Precision: 0.82 Recall: 0.70 fmeasure: 0.75
sales boost Time Warner profit Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues.It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband.Time Warner's fourth quarter profits were slightly better than analysts' expectations.For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins.
