In [1]:
from nltk import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()
from nltk.corpus import stopwords
stop = stopwords.words('english')

from bs4 import BeautifulSoup
from urllib.request import urlopen

from gensim.models import Phrases
from gensim.models.phrases import Phraser

import os
from collections import Counter
import string
punctuations = list(string.punctuation)
#Add some more punctuation, as the list doesn't cover all cases.
punctuations.extend(['”', '–', '``', "''"])
stop = stop + punctuations

In [24]:
url = urlopen('http://www.iflscience.com/technology/new-device-generate-electricity-anywhere-natural-temperature-changes/')
soup = BeautifulSoup(url.read().decode('utf8'), "lxml")
text = '\n\n'.join(map(lambda p: p.text, soup.find_all('p')))

#text = text[text.find('enumerated'):]
title = soup.find('h1').text.strip()
print(title, '\n', '_' * 60, '\n', text)

New Device Can Generate Electricity Anywhere Using Natural Temperature Changes 
 ____________________________________________________________ 
 Shutterstock / Ase

Renewables are on the rise, coal is bleeding out, and natural gas is frustratingly cheap. While the energy sector fights it out for the future of the planet, researchers around the world are sneakily coming up with proof-of-concept technologies that may provide our gizmos with new sources of power further down the line.

Enter left stage, MIT’s thermal resonator. This rather magical-sounding device doesn’t need anything other than the ambient environment to generate electricity, which it does so by “harvesting” lingering thermal energy. This isn’t witchcraft, dear readers: this is the bleeding edge of engineering, and although we wouldn’t expect to own one anytime soon, it’s hard to argue that this isn’t extremely clever stuff.

Have you heard of the thermoelectric effect? It’s a neat quirk of physics that allows for the gen

In [25]:
def intersection(sent1, sent2):
    s1 = [i for i in word_tokenize(sent1) if i not in punctuations and i not in stop]
    s2 = [i for i in word_tokenize(sent2) if i not in punctuations and i not in stop]
    intersection = [i for i in s1 if i in s2]
    return len(intersection) / ((len(s1) + len(s2)) / 2)

def get_summary(text, limit=3):
    sentences = sent_tokenize(text)
    matrix = [[intersection(sentences[i], sentences[j]) for i in range(0,len(sentences))] for j in range(0,len(sentences))]
    scores = {sentences[i]: sum(matrix[i]) for i in range(len(matrix))}
    sents = sorted(scores, key=scores.__getitem__, reverse=True)[:limit]
    best_sents = [i[0] for i in sorted([(i, text.find(i)) for i in sents], key=lambda x: x[0])]
    return best_sents

def summarize(text, limit=3):
    summary = get_summary(text, limit)
    print(title)
    print()
    print(' '.join(summary))

In [27]:
summarize(text,2)

New Device Can Generate Electricity Anywhere Using Natural Temperature Changes

They’ve concocted the world's first thermal resonator, a device that they note can generate electricity based on small temperature fluctuations in the surrounding environment. This rather magical-sounding device doesn’t need anything other than the ambient environment to generate electricity, which it does so by “harvesting” lingering thermal energy.


In [7]:
def intersection(sent1, sent2):
    #As sentences are lists of tokens, there is no need to split them.
    intersection = [i for i in sent1 if i in sent2]
    return len(intersection) / ((len(sent1) + len(sent2)) / 2)

def split_sentences(sents):
    sentence_stream = [[i for i in word_tokenize(sent) if i not in stop] for sent in sents]
    bigram = Phrases(sentence_stream, min_count=2, threshold=2, delimiter=b'_')
    bigram_phraser = Phraser(bigram)
    bigram_tokens = bigram_phraser[sentence_stream]
    trigram = Phrases(bigram_tokens,min_count=2, threshold=2, delimiter=b'_')
    trigram_phraser = Phraser(trigram)
    trigram_tokens = trigram_phraser[bigram_tokens]
    return [i for i in trigram_tokens]

def get_summary(text, limit=3):
    sents = sent_tokenize(text)
    sentences = split_sentences(sents)
    matrix = [[intersection(sentences[i], sentences[j]) for i in range(0,len(sentences))] for j in range(0,len(sentences))]
    scores = {sents[i]: sum(matrix[i]) for i in range(len(matrix))}
    sents = sorted(scores, key=scores.__getitem__, reverse=True)[:limit]
    best_sents = [i[0] for i in sorted([(i, text.find(i)) for i in sents], key=lambda x: x[0])]
    return best_sents

In [9]:
summarize(text, 5)

Snap election to be held in March after Northern Ireland government collapses

"The botched renewable energy scheme is being blamed for the collapse of the devolved government but it was just the tip of the iceberg." An early election will be held in Northern Ireland on 2 March after the collapse of its government, it has been announced. Announcing the dissolution of the Northern Ireland Assembly, Mr Brokenshire urged both parties "to conduct this election with a view to...re-establishing a partnership government at the earliest opportunity after that poll." He added that the collapse of the power-sharing government was the "greatest challenge to face the Northern Ireland peace process in a decade". The break-up of the power-sharing government comes amid a dispute between Sinn Fein and the DUP over a botched renewable energy scheme that could have cost the taxpayer £500m.


### Word Frequency

In [28]:
def score_sentences(words, sentences):
    #Return scores for sentences.
    scores = Counter()
    #Words - list of words and their scores, first element is the word, second - its score.
    for word in words:
        for i in range(0, len(sentences)):
            #If word is also in title, then add double score to the sentence.
            if word[0] in sentences[i] and word[0] in title:
                scores[i] += 2 * word[1]
            elif word[0] in sentences[i]:
                scores[i] += word[1]
    sentence_scores = sorted(scores.items(), key=scores.__getitem__, reverse=True)
    return sentence_scores

def split_sentences(sents):

    sentence_stream = [[i for i in word_tokenize(sent) if i not in stop] for sent in sents]
    bigram = Phrases(sentence_stream, min_count=2, threshold=2, delimiter=b'_')
    bigram_phraser = Phraser(bigram)
    bigram_tokens = bigram_phraser[sentence_stream]
    trigram = Phrases(bigram_tokens,min_count=2, threshold=2, delimiter=b'_')
    trigram_phraser = Phraser(trigram)
    trigram_tokens = trigram_phraser[bigram_tokens]
    
    all_words = [i for j in trigram_tokens for i in j]
    frequent_words = [i for i in Counter(all_words).most_common() if i[1] > 1]
    sentences = [i for i in trigram_tokens]
    
    return frequent_words, sentences

def get_summary(text, limit=3):
    sents = sent_tokenize(text)
    frequent_words, sentences = split_sentences(sents)
    sentence_scores = score_sentences(frequent_words, sentences)
    
    limited_sents = [sents[num] for num, count in sentence_scores[:limit]]
    best_sents = [i[0] for i in sorted([(i, text.find(i)) for i in limited_sents], key=lambda x: x[0])]
    return best_sents

def summarize(text, limit=3):
    summary = get_summary(text, limit)
    print(title)
    print()
    print(' '.join(summary))

In [29]:
summarize(text, 2)

New Device Can Generate Electricity Anywhere Using Natural Temperature Changes

Enter left stage, MIT’s thermal resonator. This rather magical-sounding device doesn’t need anything other than the ambient environment to generate electricity, which it does so by “harvesting” lingering thermal energy.
