# Scrape article summaries and store them in MongoDB

### Imports

In [38]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient

### Connect to MongoDB

In [39]:
client = MongoClient()
sci = client.metis_p4_db.science

In [29]:
base = 'http://science.sciencemag.org/content/by/year/'

In [20]:
def get_issue_links(base, year_start, year_end):
    """
    Searches a site for every year and 
    """
    issues = []
    for year in range(year_start, year_end + 1):
        url = 'http://science.sciencemag.org/content/by/year/' + str(year)
        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page,"html5lib")
        link_search = soup.find_all(class_ = "highlight-image-linked")
        for link in link_search:
            ref = link.get('href')
            issues.append([year, ref])
    return issues

# Have 2001-2016

In [21]:
links = get_issue_links(base, 1999, 2000)

In [22]:
def get_article_links(links):
    article = []
    path = 'http://science.sciencemag.org'
    for link in links:
        response = requests.get(path + link[1])
        page = response.text
        soup = BeautifulSoup(page,"html5lib")
        classes = ['abstract first', 'editor-summary first', 'summary first']
        for entry in classes:
            search = soup.find_all(class_ = entry)
            for item in search:
                article.append([link[0], item.find('a').get('href')])
    return article

In [23]:
articles = get_article_links(links)

In [24]:
len(articles)

4232

In [40]:
def get_article_info(articles):
    content_classes = ['section editor-summary', 'section summary', 'section abstract']
    path = 'http://science.sciencemag.org'
    i = 1
    for article in articles[404:]:
        article_info = {}
        try:
            response = requests.get(path + article[1], timeout=3)
            page = response.text
            soup = BeautifulSoup(page,"html5lib")
            topic = soup.find(class_ = 'overline').text
            topics_to_skip = ['Corrections and Clarifications', 'Technical Comments']
            title = soup.find(class_ = 'highwire-cite-title').text
            if topic in topics_to_skip:
                continue
            else:
                for entry in content_classes:
                    a = soup.find(class_ = entry)
                    if a:
                        info = a.find('p').text
            article_info['year'] = article[0]
            article_info['title'] = title
            article_info['description'] = info
            time.sleep(1)
            print(i, title)
            i += 1
            sci.insert_one(article_info)
        except:
            print('Error')

In [26]:
data = get_article_info(articles)

In [None]:
5429

# Lets get 2017

In [30]:
year = 2017
issues = []
url = 'http://science.sciencemag.org/content/by/year/' + str(year)
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page,"html5lib")
link_search = soup.find_all(class_ = "highlight-image-linked")
for link in link_search:
    ref = link.get('href')
    issues.append([year, ref])

In [32]:
print(issues[0], len(issues))

[2017, '/content/355/6320'] 44


In [33]:
articles = get_article_links(issues)

In [34]:
len(articles)

1592

In [41]:
data = get_article_info(articles)

1 Self-assembly of genetically encoded DNA-protein hybrid nanoscale shapes
2 Vertically extensive and unstable magmatic systems: A unified view of igneous processes
3 [C ii] 158-μm emission from the host galaxies of damped Lyman-alpha systems
4 Extremely efficient internal exciton dissociation through edge states in layered 2D perovskites
5 Grain boundary stability governs hardening and softening in extremely fine nanograined metals
6 Active sites for CO2 hydrogenation to methanol on Cu/ZnO catalysts
7 How “you” makes meaning
8 Dengue diversity across spatial and temporal scales: Local structure and the effect of host population size
9 Lysosomal cholesterol activates mTORC1 via an SLC38A9–Niemann-Pick C1 signaling complex
10 A conserved NAD+ binding pocket that regulates protein-protein interactions during aging
11 A macrophage relay for long-distance signaling during postembryonic tissue remodeling
12 Notch-Jagged complex structure implicates a catch bond in tuning ligand sensitivity


140 Stromal Gli2 activity coordinates a niche signaling program for mammary epithelial stem cells
141 Single-cell RNA-seq reveals new types of human blood dendritic cells, monocytes, and progenitors
142 Greater role for Atlantic inflows on sea-ice loss in the Eurasian Basin of the Arctic Ocean
143 iPTF16geu: A multiply imaged, gravitationally lensed type Ia supernova
144 A parity-breaking electronic nematic phase transition in the spin-orbit coupled metal Cd2Re2O7
145 Low-temperature activation of methane on the IrO2(110) surface
146 Quantitative 3D evolution of colloidal nanoparticle oxidation in solution
147 Fructose-driven glycolysis supports anoxia resistance in the naked mole-rat
148 Biased partitioning of the multidrug efflux pump AcrAB-TolC underlies long-lived phenotypic heterogeneity
149 Neonatal acquisition of Clostridia species protects against colonization by bacterial pathogens
150 Transgenerational transmission of environmental information in C. elegans
151 Control of mus

279 21st-century rise in anthropogenic nitrogen deposition on a remote coral reef
280 ATP as a biological hydrotrope
281 A placental growth factor is silenced in mouse embryos by the zinc finger protein ZFP568
282 Busting myths of origin
283 The pain of exile
284 Battling bias
285 Restless minds
286 Migration—the choices we face
287 New Products
288 News at a glance
289 Plot to redefine the kilogram nears climax
290 Sea trash traps face doubts
291 Paolo Macchiarini's academic afterlife in Russia ends
292 Genome writing project confronts technology hurdles
293 Island extinctions weren't inevitable
294 The next energy economy
295 Chew on this
296 Nitrogen pollution knows no bounds
297 ATP controls the crowd
298 Repulsive behavior in germinal centers
299 Multiscale measurements for materials modeling
300 Rightsizing carbon dioxide removal
301 Defining the topography of a planetary body
302 A subcellular map of the human proteome
303 Jupiter’s interior and deep atmosphere: The initial pole

423 Spooky action achieved at record distance
424 In a major shift, cancer drugs go ‘tissue-agnostic’
425 Supply of promising T cell therapy is strained
426 Chimps in waiting
427 There's more to a meal
428 Unlikely allies
429 Linking job loss, inequality, mental health, and education
430 A composite window into human history
431 Growing anisotropic crystals at the nanoscale
432 Glycophorin alleles link to malaria protection
433 Deciphering microglial diversity in Alzheimer's disease
434 Scaling pain threshold with microRNAs
435 Tracking the dynamics of electron expulsion
436 Chemical transformation of xenobiotics by the human gut microbiota
437 An environment-dependent transcriptional network specifies human microglia identity
438 A global brain state underlies C. elegans sleep behavior
439 Avian egg shape: Form, function, and evolution
440 Quantum and isotope effects in lithium metal
441 Breaking Lorentz reciprocity to overcome the time-bandwidth limit in physics and engineering
442 Q

566 A push for low-carbon fuels pays off in California
567 Surviving the cure
568 The Sun spotters
569 The elegant law that governs us all
570 When early adopters don't adopt
571 A raven's memories are for the future
572 The importance of being modular
573 Immunology taught by rats
574 Of sizzling steaks and DNA repair
575 Plasmonic imaging is gaining momentum
576 How do miniproteins fold?
577 Epigenetic plasticity and the hallmarks of cancer
578 Cash for carbon: A randomized trial of payments for ecosystem services to reduce deforestation
579 Ratchet-like polypeptide translocation mechanism of the AAA+ disaggregase Hsp104
580 Highly elastic binders integrating polyrotaxanes for silicon microparticle anodes in lithium ion batteries
581 Photoinduced decarboxylative borylation of carboxylic acids
582 Bismuthene on a SiC substrate: A candidate for a high-temperature quantum spin Hall material
583 Remobilization of crustal carbon may dominate volcanic arc emissions
584 Chiral Majorana ferm

711 A central neural circuit for itch sensation
Error
712 Vinculin forms a directionally asymmetric catch bond with F-actin
713 ELABELA deficiency promotes preeclampsia and cardiovascular malformations in mice
714 In situ architecture, function, and evolution of a contractile injection system
715 Elimination of the male reproductive tract in the female embryo is promoted by COUP-TFII in mice
716 Revisit NIH biosafety guidelines
717 New Products
718 News at a glance
719 2.7-million-year-old ice opens window on past
720 Where has all the Zika gone?
721 Australia to ax support for long-term ecology sites
722 Astrophysics missions vie for NASA money
723 U.S.-Mexico water pact aims for a greener Colorado delta
724 ‘Safe spaces’ may save the European mink
725 On the trail of yellow fever
726 Beyond the museum's mandate
Error
727 What do revised U.S. rules mean for human research?
728 Playing marble run to make methane
729 Circulating peptide prevents preeclampsia
730 Vortex generation reache

Error
Error
854 New Products
855 News at a glance
Error
856 A legacy of discovery
857 Russia heightens defenses against climate change
Error
Error
Error
Error
858 Embryo edit makes human ‘knockout’
859 China's childhood experiment
860 The legacy of the Spanish flu
861 Sleight of hand
Error
862 Toward pesticidovigilance
863 RNA localization feeds translation
864 The social origins of persistence
865 Advances in organ transplant from pigs
866 Angular momentum can slow down photoemission
Error
867 Advances in thermoelectric materials research: Looking back and moving forward
Error
Error
868 Electronic crystal growth
Error
869 Spin-imbalance in a 2D Fermi-Hubbard system
870 Frequency combs enable rapid and high-resolution multidimensional coherent spectroscopy
871 Nanophotonic rare-earth quantum memory with optically controlled retrieval
872 Changes in the microbiota cause genetically modified Anopheles to spread in a population
Error
873 Tsunami-driven rafting: Transoceanic species disper

1000 What constitutes the prefrontal cortex?
1001 Space and time in the brain
1002 What is consciousness, and could machines have it?
Error
Error
Error
Error
1003 Size effect in ion transport through angstrom-scale slits
Error
Error
Error
1004 Nε-Fatty acylation of Rho GTPases by a MARTX toxin effector
1005 Second messenger–mediated tactile response by a bacterial rotary motor
1006 Obstruction of pilus retraction stimulates bacterial surface sensing
1007 Nip misinformation in the bud
1008 New Products
Error
Error
1009 Neandertals gave ‘lost’ African DNA back to moderns
1010 ‘Base editors’ open new way to fix mutations
1011 NASA weighs trimming WFIRST to hold down costs
1012 Revamp animal research rules, report urges
1013 The electron is still round—for now
1014 Medicine's future?
Error
1015 Small beginnings
1016 Endurance: A Year in Space, A Lifetime of Discovery
1017 Assembling the brain from deep within
1018 Overriding sleep
1019 Sleep on it
1020 Ethics of maternal vaccination
1021 R

In [None]:
403