In [1]:
# Necessary imports
import requests
from bs4 import BeautifulSoup as bs
import re
import nltk
import heapq

In [2]:
# Necessary downloads
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dizquierdo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dizquierdo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Article requests which returns a dictionary of url and headline
def get_articles(query):
    soup = bs(requests.get('https://www.sciencenews.org/?s='+re.sub('/s','+',query)).text,'html.parser')
    articles = {}
    for h in soup.find_all('h3'):
        a = h.find('a')
        if a:
            articles[a.get('href')]=a.text.strip()
    return articles

In [4]:
# Get a list of articles to request
articles = get_articles('climate change')

In [5]:
# Summarization pipeline which takes in a single url
def produce_summary(url):
    # Request given article
    article = bs(requests.get(url).text, 'html.parser')
    # Get a list of all sentences in the article
    sentences = [p.text for p in article.find('article').find_all('p')]
    # Find just the headline
    headline = sentences[0:3:2]
    # Find the likely end of the article
    idx = [idx for idx,sentence in enumerate(sentences) if re.search('Questions or comments?',sentence)][0]
    # Take just the text of the article and join it
    text = sentences[4:idx]
    joined_sentences = ' '.join(text)
    # Preprocess full text
    joined_sentences = re.sub(r'\(SN: [\s\d\,\.p/]+\)',' ', joined_sentences)
    formatted_text = re.sub(r'\s+', ' ', joined_sentences)
    formatted_text = re.sub('[^a-zA-Z]', ' ', formatted_text )
    formatted_text = re.sub(r'\s+', ' ', formatted_text)
    # Retokenize processed sentences and load standard stopwords
    sentence_list = nltk.sent_tokenize(joined_sentences)
    stopwords = nltk.corpus.stopwords.words('english')

    # Determine word frequencies for full article
    word_frequencies = {}
    for word in nltk.word_tokenize(formatted_text):
        if word not in stopwords:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
    maximum_frequncy = max(word_frequencies.values())

    # Scale frequencies to most common non-stop word
    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
        
    # Get a score for each sentence based on the word frequencies that comprise it
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]
    # Return sqrt(n) highest scoring sentences in descending order of weight 
    summary_sentences = heapq.nlargest(int(len(sentence_list)**(1/2)), sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    return summary

In [6]:
# Print returns
print('\n\n'.join([headline+'\n'+re.sub('\n',' ',produce_summary(url)) for url,headline in articles.items()]))

Aye-ayes just got weirder with the discovery of a tiny, sixth ‘finger’
Instead, the little lemurs’ hands may have become too specialized, with thin, elongated fingers, including an especially long third digit that has a ball-and-socket joint. Giant pandas may have acquired that extra digit after the rest of their fingers became less specialized so that the bears could better walk. Then the primates bite the wood, puncturing a hole, and again use their long third finger for fishing out bugs and grubs found inside. The tiny lemurs of Madagascar, known for their large cartoonish ears and continuously growing incisor teeth, also have a sixth “finger” on each hand.

Light from outside the skull can turn on nerve cells in monkey brains
Controlling nerve cell behavior with light, a method called optogenetics, often requires thin optical fibers to be implanted in the brain  . CHICAGO — Light pulses from outside a monkey’s brain can activate nerve cells deep within. This external control, descr