In [None]:
## Automatic Summarization

The Reuters Corpus contains nearly 11,000 news articles covering a wide variety of topics.
If you have run the <code>nltk.download()</code> command as previously recommended, you can
import and explore the Reuters Corpus as follows:

In [None]:
from nltk.corpus import reuters

# Show the first 500 characters of the first article in the corpus.
first_fileid = reuters.fileids()[0]
article_preview = reuters.raw(first_fileid)[:500]
print ('** BEGIN ARTICLE: ** \"' + article_preview + ' [...]\"')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Freeze the file-id order once. Later cells look up an article with
# reuters.fileids()[i] and read its weights from row i of the matrix, so the
# documents must be fed to the vectorizer in exactly this order rather than
# in whatever order the dict happens to iterate.
corpus_fileids = reuters.fileids()

# Raw text of every article, keyed by its Reuters file id.
token_dict = {}
for article in corpus_fileids:
    token_dict[article] = reuters.raw(article)

# Learn the vocabulary / IDF weights and build the document-term matrix in a
# single pass over the corpus (equivalent to fit() followed by transform()).
tfidf = TfidfVectorizer(stop_words='english')
tdm = tfidf.fit_transform([token_dict[article] for article in corpus_fileids])

In [None]:
from random import randint

# scikit-learn deprecated get_feature_names() in 1.0 and removed it in 1.2 in
# favour of get_feature_names_out(). Support both, and keep the result a plain
# list because later cells rely on list methods such as .index().
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

print ('TDM contains ' + str(len(feature_names)) + ' terms and ' + str(tdm.shape[0]) + ' documents')

print ('first term: ' + feature_names[0])
print ('last term: ' + feature_names[-1])

# Sample four terms from the interior of the vocabulary; the first and last
# terms are excluded because they were already printed above.
for i in range(0, 4):
    print ('random term: ' + feature_names[randint(1, len(feature_names) - 2)])

In [None]:
# A __future__ import must be the first statement of the cell; placing it
# after another import is a SyntaxError.
from __future__ import division
import math

# Pick one article at random. article_id is used both as a position in
# reuters.fileids() and as a row index into the document-term matrix.
article_id = randint(0, tdm.shape[0] - 1)
article_text = reuters.raw(reuters.fileids()[article_id])
print ('\n*** ORIGINAL ***')
print (article_text)

In [None]:
# Only `reuters` was imported from nltk earlier, so the top-level package
# name has to be brought into scope before calling the tokenizers.
import nltk

# Term -> column index mapping learned by the fitted vectorizer. A dict lookup
# replaces the linear scans over the feature-name list for every token.
term_columns = tfidf.vocabulary_

# Score every sentence by the average TF-IDF weight of its tokens.
# sent_scores collects (average_score, sentence_text) pairs.
sent_scores = []
for sentence in nltk.sent_tokenize(article_text):
    # `sentence` is already a single sentence, so tokenize it into words directly.
    sent_tokens = nltk.word_tokenize(sentence)
    if not sent_tokens:
        # Nothing to average over; skip to avoid dividing by zero.
        continue

    score = 0
    for token in sent_tokens:
        # TfidfVectorizer lower-cases text by default, so look tokens up in
        # lower case; otherwise capitalized words would never match.
        column = term_columns.get(token.lower())
        if column is not None:
            score += tdm[article_id, column]

    # Normalize by sentence length so long sentences are not favoured.
    sent_scores.append((score / len(sent_tokens), sentence))


In [None]:
# Number of sentences to keep: ceil(len(sent_scores) / 5).
sentence_count = len(sent_scores)
summary_length = int(math.ceil(sentence_count / 5))

# Rank the (score, sentence) pairs in place, highest score first.
sent_scores.sort(key=lambda scored: scored[0], reverse=True)

print ('*** SUMMARY ***')
top_sentences = sent_scores[:summary_length]
for sentence_score, sentence_text in top_sentences:
    print (sentence_text)