In [1]:
import bs4 as bs
import urllib.request

### Pulling in a Wikipedia article to analyse

In [2]:
# Download the raw HTML of the article. The response is used as a context
# manager so the underlying connection is closed as soon as the body has been
# read, rather than staying open until the object is garbage-collected.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Natural_language_processing') as response:
    source = response.read()

### Parsing the data / creating a BeautifulSoup object

In [4]:
# Build the parse tree from the downloaded HTML, using the lxml parser.
soup = bs.BeautifulSoup(source, features='lxml')

### Preprocessing the data

In [7]:
from functools import reduce
import re

In [8]:
# Concatenate the text of every <p> element. str.join assembles the result in
# one pass, whereas folding with `+` re-copies the growing string for each
# paragraph.
text = ''.join(paragraph.text for paragraph in soup.find_all('p'))

text = re.sub(r'\[[0-9]*\]', ' ', text)  # removes bracketed numbers such as [12]
text = re.sub(r'\s+', ' ', text)  # collapses each whitespace run into one space

We need a new variable, `clean_text`, which we will use to build the word histogram.

In [13]:
# Lower-case the text, then replace with a single space, in this order:
# every non-word character (punctuation), every digit, and finally every run
# of whitespace.
clean_text = reduce(
    lambda cleaned, pattern: re.sub(pattern, ' ', cleaned),
    (r'\W', r'\d', r'\s+'),
    text.lower()
)

### Tokenizing sentences and loading the stopword list

In [15]:
import nltk

In [16]:
# NOTE: both calls rely on NLTK data packages (sentence tokenizer models and
# the stopwords corpus) having been installed beforehand, e.g. via nltk.download.
sentences = nltk.sent_tokenize(text)
# Stored as a set because the stop words are only used for membership tests
# while counting words; a set lookup avoids scanning the whole list each time.
stop_words = set(nltk.corpus.stopwords.words('english'))

### Word counts

In [22]:
def wordCount(word2count, word, stop_words):
    """Record one occurrence of `word` in `word2count` unless it is a stop word.

    The dictionary is updated in place and the same object is returned, so the
    function can be used as the step of a fold over a sequence of tokens.

    Args:
        word2count: dict mapping word -> number of occurrences seen so far.
        word: the token to record.
        stop_words: container of words that must not be counted.

    Returns:
        `word2count` itself, after the update (unchanged for stop words).
    """
    if word in stop_words:
        return word2count

    # Missing words start from zero, so first and repeat sightings share a path.
    word2count[word] = word2count.get(word, 0) + 1
    return word2count

In [23]:
# Walk the cleaned text token by token and let wordCount accumulate the
# occurrences of every non-stop-word into one dictionary.
word2count = {}
for token in nltk.word_tokenize(clean_text):
    word2count = wordCount(word2count, token, stop_words)

### Converting counts to weights

In [26]:
# Divide every count by the highest count to turn counts into relative weights.
# The maximum is computed once up front: calling max() inside the comprehension
# rescans all values for every word. `default=1` keeps the cell working when
# no words were counted at all.
max_count = max(word2count.values(), default=1)
word2count = { word: value/max_count for word, value in word2count.items() }

### Scoring sentences

In [46]:
sentenceLenThreshold = 25

def calculateScore(sentence, word2count, lenThreshold=None):
    """Score a sentence by summing the weights of the distinct words it contains.

    Args:
        sentence: the sentence text to score.
        word2count: dict mapping word -> weight.
        lenThreshold: a sentence with this many or more space-separated words
            scores 0. When omitted, the module-level `sentenceLenThreshold`
            is used.

    Returns:
        The sum of `word2count[word]` over every key found among the
        sentence's lower-cased tokens. Each word contributes once, however
        often it appears. Returns 0 when the sentence is too long or no word
        matches.
    """
    if lenThreshold is None:
        lenThreshold = sentenceLenThreshold

    # Over-long sentences always score 0, so do not spend time tokenizing them.
    if len(sentence.split(' ')) >= lenThreshold:
        return 0

    # A set gives constant-time membership tests; with a list the tokens would
    # be rescanned once for every word in the vocabulary.
    tokenizedWords = set(nltk.word_tokenize(sentence.lower()))

    # Accumulate in dictionary order so the floating-point additions happen in
    # the same sequence regardless of how the tokens are stored.
    score = 0
    for word, weight in word2count.items():
        if word in tokenizedWords:
            score += weight
    return score

In [47]:
# Pair every sentence with its score; a repeated sentence keeps a single entry.
sent2score = dict(zip(sentences, (calculateScore(sentence, word2count) for sentence in sentences)))

### Getting the best 2 sentences

In [49]:
import heapq

In [50]:
# Select the two sentences with the highest scores, best first.
best_sentences = heapq.nlargest(2, sent2score.keys(), key=lambda sentence: sent2score[sentence])

In [62]:
# Number whichever sentences were selected instead of indexing positions 0 and
# 1 directly, so the cell does not raise IndexError when fewer than two
# sentences are available. Entries are separated by a blank line.
print('\n\n'.join(
    str(rank) + '. ' + sentence
    for rank, sentence in enumerate(best_sentences, start=1)
))

1. Many different classes of machine learning algorithms have been applied to natural language processing tasks.

2. Since the so-called "statistical revolution" in the late 1980s and mid 1990s, much natural language processing research has relied heavily on machine learning.
