In [None]:
# Imports
import requests
from bs4 import BeautifulSoup

# Retrieve page text
url = 'https://hackaday.com/2021/04/20/a-look-at-the-most-aerodynamic-cars-ever-built/'
# Without a timeout the cell can hang indefinitely on an unresponsive server.
page = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) rather than parsing an error page below.
page.raise_for_status()

# Turn page into BeautifulSoup object to access HTML tags
soup = BeautifulSoup(page.text, 'html5lib')
# find() returns None when nothing matches; check explicitly so a change in the
# page layout produces a clear message instead of an AttributeError.
article_body = soup.find('div', {'itemprop': 'articleBody'})
if article_body is None:
    raise ValueError("no <div itemprop='articleBody'> element found at " + url)
TEXT = article_body.get_text()

import pickle
# Cache the extracted article text on disk so it can be reloaded from
# 'article.pkl' without fetching the page again.
with open('article.pkl', 'wb') as cache_file:
    pickle.dump(TEXT, cache_file)

In [None]:
import pickle
# Restore the article text from the local pickle cache.
# NOTE: pickle.load can execute arbitrary code; only load a file this
# notebook wrote itself.
with open('article.pkl', 'rb') as cache_file:
    TEXT = pickle.load(cache_file)

In [None]:
import spacy
# Load the 'en_core_web_sm' pipeline and run it over the article text.
nlp = spacy.load('en_core_web_sm')
article = nlp(TEXT)
# Collect the sentence spans of the parsed document into a list.
sents = [sentence for sentence in article.sents]

In [None]:
# Text of every token in the parsed article, in document order.
tokens = [tok.text for tok in article]

def we_care_about(t):
    """Return True when the token text `t` is worth counting.

    The text is looked up in the vocabulary of the module-level `nlp`
    pipeline; it is rejected if the resulting lexeme is flagged as
    whitespace, punctuation, or a stop word.
    """
    entry = nlp.vocab[t]
    return not (entry.is_space or entry.is_punct or entry.is_stop)

# Keep only the token texts that pass the we_care_about() filter.
cool_tokens = list(filter(we_care_about, tokens))

In [None]:
from collections import Counter

# Count occurrences of each kept token, ignoring case.
freqs = Counter(tok.lower() for tok in cool_tokens)
N = 5
# Keep just the words from the N most frequent (word, count) pairs.
interesting_words = [word for word, _count in freqs.most_common(N)]
print(interesting_words)

In [None]:
# Scale raw counts by the highest count so the most frequent word gets
# weight 1.0 and every other interesting word gets a fraction of that.
_top_word, most_common_freq = freqs.most_common(1)[0]
weight_mult = 1 / most_common_freq
word_weights = {w: freqs[w] * weight_mult for w in interesting_words}

In [None]:
def score_sent(sent, weights):
    """Return the average word weight of a sentence.

    Args:
        sent: A sized iterable of tokens, each exposing a `.text` string.
        weights: Mapping from lower-cased word to its weight; words that
            are not in the mapping contribute 0.0.

    Returns:
        The sum of the token weights divided by the number of tokens in
        `sent`, or 0.0 when `sent` contains no tokens.
    """
    n_tokens = len(sent)
    # An empty sentence has no words to score; avoid dividing by zero.
    if n_tokens == 0:
        return 0.0
    total = sum((weights.get(word.text.lower(), 0.0) for word in sent), 0.0)
    # Dividing by the token count stops long sentences from winning just
    # because they contain more words.
    return total / n_tokens

In [None]:
# Pair every sentence with its score under the current word weights.
sent_and_scores = [
    (sentence, score_sent(sentence, word_weights)) for sentence in sents
]

In [None]:
# Pull the score out of each (sentence, score) pair. Unpacking inside the
# generator keeps `_` out of the notebook namespace (IPython uses `_` for the
# last cell output) and, unlike zip(*...), does not fail on an empty list.
scores = tuple(score for _, score in sent_and_scores)

import matplotlib.pyplot as plt
plt.hist(scores)
# Label the figure so it can be read on its own when skimming the notebook.
plt.xlabel('Sentence score')
plt.ylabel('Number of sentences')
plt.title('Distribution of sentence scores')
plt.show()

In [None]:
SCORE_THRESH = 0.05
# Materialise the selection as a list: a bare filter() object would be
# exhausted by the loop below, leaving `summary` empty for any later use.
summary = [pair for pair in sent_and_scores if pair[1] >= SCORE_THRESH]
# Print the sentences that met the threshold, in their original order.
for sent, score in summary:
    print(sent.text.strip())


