In [2]:
import sys
import nltk
import math
import urllib.request
from bs4 import BeautifulSoup
from nltk import FreqDist
from nltk.collocations import *
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords


In [3]:
def getArticle(url,tag):
    try:
        response = urllib.request.urlopen(url)
        html = response.read().decode('utf-8')
        soup = BeautifulSoup(html,"html.parser")
    except:
        print ("Unexpected error:", sys.exc_info()[1])
        return (None,None)
    
    if (soup is None):
        return (None, None)
    
    article = ""
    
    # Finding the tag within the HTML
    if (soup.find_all(tag) is not None):
        article = ''.join(map(lambda p: p.text, soup.find_all(tag)))
        # Finding the paragraphs <p> tag
        #soup_p = BeautifulSoup(article, "html.parser")
        #if (soup_p.find_all('p') is not None):
        #    article = ''.join(map(lambda p: p.text, soup_p.find_all('p')))
            
    return article, soup.title.text

## Defining all NLP functions Needed

In [4]:
def remove_punctuation(corpus):
    punctuations = ".,\"-\\/#!?$%\^&\*;:{}=\-_'~()"    
    filtered_corpus = [token for token in corpus if (not token in punctuations)]
    return filtered_corpus

def apply_stopwording(corpus, min_len):
    filtered_corpus = [token for token in corpus if (not token in stopwords.words('english') and len(token)>min_len)]
    return filtered_corpus

def apply_stemming(corpus):
    stemmer = nltk.PorterStemmer()
    normalized_corpus = [stemmer.stem(token) for token in corpus]
    return normalized_corpus

def apply_lemmatization(corpus):
    lemmatizer = nltk.WordNetLemmatizer()
    normalized_corpus = [lemmatizer.lemmatize(token) for token in corpus]
    return normalized_corpus

def getCollocations(text, min_freq):
    bigrams = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(text)
    finder.apply_freq_filter(min_freq)
    collocations = finder.nbest(bigrams.pmi, 20)
    return collocations

def getSummary(article, min_freq):
    sentences = sent_tokenize(article)
    #tokens = [word_tokenize(sentence.lower()) for sentence in sentences]
    tokens = nltk.word_tokenize(article.lower())
    doc = nltk.Text(tokens)
    doc_clean = nltk.Text(apply_lemmatization(apply_stopwording(remove_punctuation(doc), 3)))
    collocations = getCollocations(doc_clean,min_freq)

    summary = []
    for c in collocations:
        for sentence in sentences:
            term1 = c[0]
            term2 = c[1]
            term = c[0]+' '+c[1]
            if (term in sentence):
                #Found the sentence. Add only if not already in the summary
                if (sentence not in summary):
                    summary.append(sentence)
                
    return summary

In [5]:
url = "https://www.washingtonpost.com/opinions/on-gun-violence-we-are-a-failed-state/2018/02/18/88ecf09a-137a-11e8-9065-e55346f6de81_story.html"
tag ="article"
article, title = getArticle(url,tag)

In [5]:
summary = getSummary(article,2)
print ("# of sentences: "+str(len(summary)))
index = 1
for sentence in summary:
    print ("%s - %s" % (index,sentence))
    index+=1
    

# of sentences: 11
1 - On gun violence, the United States has become a corrupt failed state.
2 - In corrupt failed states, politics is about lying and misdirection.
3 - At the heart of our political system’s failure to address the epidemic of violence is the Republican Party’s decision to become a paid agent of the gun manufacturers’ lobby.
4 - No one wants our political system to fail more than Russian President Vladimir Putin does, and our powerlessness on guns hardly enhances our democracy’s image to the world.
5 - In no other country are the words “mental health” so empty.
6 - He told us: “We are committed to working with state and local leaders to help secure our schools and tackle the difficult issue of mental health.”  Trump’s speech, as Vox’s German Lopez observed, was “one giant lie by omission.” Those 17 people were killed by an AR-15 rifle, not by a knife or a sword or a bomb.
7 - Yes, and if Trump cared so much about mental health, he wouldn’t be proposing a $250 billion cu

In [None]:
print (article)