In [1]:
# Necessary imports
import requests
from bs4 import BeautifulSoup as bs
import re
import nltk
import heapq

In [2]:
# Necessary downloads
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dizquierdo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dizquierdo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Article requests which returns a dictionary of url and headline
def get_articles(query, amt=0):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    soup = bs(requests.get('https://www.sciencenews.org/?s='+re.sub('\s','+',query)+
                           '&topic=&start-date=&end-date=&orderby=date', headers=headers).text,'html.parser')
    articles = {}
    for count, h in enumerate(soup.find_all('h3')):
        if not amt or count-1<amt:
            a = h.find('a')
            if a:
                articles[a.get('href')]=a.text.strip()
    summarize(articles) if amt else summarize(articles)

In [5]:
# Summarization pipeline which takes in a single url
def produce_summary(url):
    # Request given article
    article = bs(requests.get(url).text, 'html.parser')
    
    # Get a list of all sentences in the article
    sentences = [p.text for p in article.find('article').find_all('p')]
  
    # Find just the date
    for count,sentence in enumerate(sentences):
        if re.search(r'.* (am|pm)',sentence) or re.search(r'\d.*ago',sentence):
            date = sentence
            break
    
    # Find the likely end of the article
    idx = [idx for idx,sentence in enumerate(sentences) if re.search('Questions or comments?',sentence)][0]
    
    # Take just the text of the article and join it
    sents = sentences[count+1:idx]
    text = [re.sub(r'Updated .* (am|pm)','',re.sub(r'.*[A-Z]{2,}\s\—','',sent)) for sent in sents]
    joined_sentences = ' '.join(text)
    
    # Preprocess full text
    joined_sentences = re.sub(r'\(SN: [\s\d\,\.p/]+\)',' ', joined_sentences)
    formatted_text = re.sub(r'\s+', ' ', joined_sentences)
    formatted_text = re.sub('[^a-zA-Z]', ' ', formatted_text )
    formatted_text = re.sub(r'\s+', ' ', formatted_text)
    
    # Retokenize processed sentences and load standard stopwords
    sentence_list = nltk.sent_tokenize(joined_sentences)
    stopwords = nltk.corpus.stopwords.words('english')

    # Determine word frequencies for full article
    word_frequencies = {}
    for word in nltk.word_tokenize(formatted_text):
        if word not in stopwords:
            if word not in word_frequencies.keys():
                word_frequencies[word.lower()] = 1
            else:
                word_frequencies[word.lower()] += 1
    maximum_frequncy = max(word_frequencies.values())

    # Scale frequencies to most common non-stop word
    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
        
    # Get a score for each sentence based on the word frequencies that comprise it
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]
    # Return sqrt(n) highest scoring sentences in descending order of weight 
    summary_sentences = heapq.nlargest(int(len(sentence_list)**(1/2)), sentence_scores, key=sentence_scores.get)
    summary_sentences.sort(key=sentence_list.index)
    summary = ' '.join(summary_sentences)
    return date, summary

In [25]:
# Article requests which returns a dictionary of url and headline
def get_articles_2(query, amt=0):
#     headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    soup = bs(requests.get('https://www.scirp.org/journal/Articles.aspx?searchCode='+re.sub('\s','+',query)+
                           '&searchField=All&page=1&SKID=58821535').text,'html.parser')
    articles = []
    for count, h in enumerate(soup.find_all('div', class_='reviewpaper')):
        article = {}
        breakpoint()
        if not amt or count-1<amt:
            link = h.next_sibling().find('a')
            if link:
                article['href']=link.get('href')
            authors = h.next_sibling().next_sibling()
            cite = {}
            for author in authors:
                cite[author.text]=author.get('href')
            article['authors']=cite
        articles.append(article)
    summarize(articles[:amt]) if amt else summarize(articles)

In [21]:
# Summarization pipeline which takes in a single url
def produce_summary_2(url):
    response={}
    # Request given article
    article = bs(requests.get(url).text, 'html.parser')
    
    # Get a list of all sentences in the article
    response['date'] = re.search(r', (.)*',article.find('div', {'id':'JournalInfor_div_nav_journal'}).find('div').find('a').text)[1]
    container = article.find('div', {'id':'JournalInfor_div_paper'})
    response['headline'] = container.find('div').find('div').next_sibling().text
    article = container.find('div',{'id':'htmlContent'})
    paragraphs = article.findAll('p', attrs={'class': None})
    
    sentences = [paragraph.text for paragraph in paragraphs]
        
    # Take just the text of the article and join it
    response['joined_sentences'] = ' '.join(sentences)
    
    # Preprocess full text
    formatted_text = re.sub(r'\s+', ' ', response['joined_sentences'])
    formatted_text = re.sub('[^a-zA-Z]', ' ', formatted_text )
    formatted_text = re.sub(r'\s+', ' ', formatted_text)
    
    # Retokenize processed sentences and load standard stopwords
    sentence_list = nltk.sent_tokenize(response['joined_sentences'])
    stopwords = nltk.corpus.stopwords.words('english')

    # Determine word frequencies for full article
    word_frequencies = {}
    for word in nltk.word_tokenize(formatted_text):
        if word not in stopwords:
            if word not in word_frequencies.keys():
                word_frequencies[word.lower()] = 1
            else:
                word_frequencies[word.lower()] += 1
    maximum_frequncy = max(word_frequencies.values())

    # Scale frequencies to most common non-stop word
    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
        
    # Get a score for each sentence based on the word frequencies that comprise it
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]
    # Return sqrt(n) highest scoring sentences in descending order of weight 
    summary_sentences = heapq.nlargest(int(len(sentence_list)**(1/2)), sentence_scores, key=sentence_scores.get)
    summary_sentences.sort(key=sentence_list.index)
    response['summary'] = ' '.join(summary_sentences)
    return response

In [27]:
def summarize_2(articles):
    for article in articles:
        response = produce_summary_2(article['href'])

        print('\n'.join([response['headline'],', '.join([articles['authors'].keys(), response['date']]),re.sub(r'\n',' ',response['summary']).strip()])+'\n\n')
        

In [26]:
get_articles_2('climate change',1)

> <ipython-input-25-616f9f21c7f7>(10)get_articles_2()
-> if not amt or count-1<amt:
(Pdb) type(h)
<class 'bs4.element.Tag'>
(Pdb) l
  5  	                           '&searchField=All&page=1&SKID=58821535').text,'html.parser')
  6  	    articles = []
  7  	    for count, h in enumerate(soup.find_all('div', class_='reviewpaper')):
  8  	        article = {}
  9  	        breakpoint()
 10  ->	        if not amt or count-1<amt:
 11  	            link = h.next_sibling().find('a')
 12  	            if link:
 13  	                article['href']=link.get('href')
 14  	            authors = h.next_sibling().next_sibling()
 15  	            cite = {}
(Pdb) n
> <ipython-input-25-616f9f21c7f7>(11)get_articles_2()
-> link = h.next_sibling().find('a')
(Pdb) h

Documented commands (type help <topic>):
EOF    c          d        h         list      q        rv       undisplay
a      cl         debug    help      ll        quit     s        unt      
alias  clear      disable  ignore    longlist  r   

BdbQuit: 

In [6]:
def summarize(articles):
    for url,headline in articles.items():
        date, summary = produce_summary(url)

        print('\n'.join([headline,date,re.sub(r'\n',' ',summary).strip()])+'\n\n')

In [18]:
get_articles('women', 1)

Lab-grown organoids are more stressed-out than actual brain cells
2 hours ago
Cells in these clumps have ambiguous identities and make more stress molecules than cells taken directly from human brains, researchers reported October 22 at the annual meeting of the Society for Neuroscience. These cellular clumps are grown using stem cells made from skin or blood, which under the right conditions can be coaxed into forming three-dimensional clusters of brain cells. These clusters, a type of organoid, are thought to re-create some aspects of early human brain development, a period that is otherwise difficult to study (SN: 2/20/18). What’s more, these organoid cells didn’t fit into the neat categories of cells in actual brain tissue.




In [113]:
summarize(articles)

Algae inside blood vessels could act as oxygen factories
6 hours ago
It’s a strange mash-up, but it works: Algae living inside tadpoles’ blood vessels can pump out oxygen for nearby oxygen-starved nerve cells. I think it has great potential.” Even more futuristic possibilities include using algae in the veins of astronauts on long-haul space missions, says neurobiologist Hans Straka. “I wouldn’t call it crazy, but unconventional, let’s say.”  The researchers injected either green algae (Chlamydomonas reinhardtii) or cyanobacteria (Synechocystis) into tadpoles’ blood vessels, creating an eerie greenish animal. The algae might also be able to supply nerve cells with glucose, or even molecules that influence nerve cell behavior, he says.


Aye-ayes just got weirder with the discovery of a tiny, sixth ‘finger’
October 22, 2019 at 4:25 pm
The tiny lemurs of Madagascar, known for their large cartoonish ears and continuously growing incisor teeth, also have a sixth “finger” on each hand. Gian

Physicists have found quasiparticles that mimic hypothetical dark matter axions
October 15, 2019 at 7:00 am
Lurking within a solid crystal is a phenomenon that is mathematically similar to proposed subatomic particles called axions, physicist Johannes Gooth and colleagues report online October 7 in Nature. If axions exist as fundamental particles, they could constitute a hidden form of matter in the cosmos, dark matter. The axions analogs within the crystal are a type of quasiparticle, a disturbance in a material that can mimic fundamental particles like axions. That current grew quickly as the researchers ramped up the electric field’s strength, in a way that is a fingerprint of axion quasiparticles.


A new cooling technique relies on untwisting coiled fibers
October 10, 2019 at 2:00 pm
Called twistocaloric cooling, the method involves unwinding tightly twisted strands of various materials. The technique was used to chill water by several degrees Celsius, scientists report in the Oct

Nepal is reeling from an unprecedented dengue outbreak
As Nepal records at least 9,000 cases of dengue amid an unprecedented outbreak of the disease, workers are fumigating areas of Kathmandu against the mosquitoes that carry the disease.
The country had its first-ever dengue outbreak in 2006, but only a handful of people were affected that year from lowland districts along the southern border with India. An estimated 390 million people worldwide get dengue infections every year, with about a quarter developing symptoms, researchers said in a 2013 paper in Nature. But studies show that atmospheric temperatures are the most important drivers for dengue distribution and risk, followed by rainfall patterns, according to a 2016 review paper in Environmental Research. With climate change, “warmer temperatures can affect both the mosquito and the virus,” says coauthor Kristie Ebi, a public health expert at the University of Washington in Seattle. A hotter climate helps mosquito larvae develo