## Find abstract, title, DOI, and references

In [1]:
from bs4 import BeautifulSoup

In [2]:
import requests

In [3]:
import pandas as pd
import re

In [4]:
from functions import log, isInRef

In [5]:
from urllib.parse import urlparse

In [6]:
# Load the tab-separated list of article links into a DataFrame.
df = pd.read_csv('links.4.csv', sep='\t')

In [7]:
# Each raw "link" cell is split on '|': piece 0 is kept as the article link,
# piece 1 becomes the journal domain. The split is done once and reused.
link_parts = df["link"].str.split('|')
links = link_parts.map(lambda parts: parts[0])
journals = link_parts.map(lambda parts: parts[1])
df["link"], df["journal_domain"] = links, journals

In [8]:
# Journal domains that are kept when the article list is filtered.
known_journals = [
    "journals.plos.org",
]

In [9]:
# Remove articles from unknown journals.
# .copy() turns the filtered slice into an independent DataFrame, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
df = df[df["journal_domain"].isin(known_journals)].copy()
df

Unnamed: 0,link,journal_domain
11,https://doi.org/10.1371/journal.pone.0174944,journals.plos.org
14,https://doi.org/10.1371/journal.pone.0181142,journals.plos.org
110,https://doi.org/10.1371/journal.pone.0164270,journals.plos.org
174,https://doi.org/10.1371/journal.pone.0144916,journals.plos.org
191,https://doi.org/10.1371/journal.pone.0089157,journals.plos.org
265,https://doi.org/10.1371/journal.pone.0010271,journals.plos.org
270,https://doi.org/10.1371/journal.pcbi.1005993,journals.plos.org
271,https://doi.org/10.1371/journal.pone.0192011,journals.plos.org
285,https://doi.org/10.1371/journal.pone.0118723,journals.plos.org
287,https://doi.org/10.1371/journal.pone.0124414,journals.plos.org


In [10]:
# Lookup table from journal domain to journal name, read from a
# tab-separated file with "domain" and "journal" columns.
journals_df = pd.read_csv('domain_to_journal.csv', delimiter='\t')
# journal_ref[domain] -> journal name
journal_ref = journals_df.set_index('domain')['journal'].to_dict()

In [11]:
# HTTP headers sent with every page request: a desktop browser User-Agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    ),
}

In [12]:
# Accumulators filled by main(); each one is a separate, initially empty list.
titles = list()
abstracts = list()
dois = list()
journal_names = list()
all_references = list()  # nested: one list of reference links per article

In [13]:
# make sure reference link exists 
def check_ref_plos(ref):
    """Return the href of the link at ``ref.ul.li.a``, or None if unusable.

    None is returned when any step of the ``ul -> li -> a`` chain is missing
    (the attribute lookup fails) or when the href is the placeholder ``'#'``.
    """
    try:
        href = ref.ul.li.a.get('href')
    except AttributeError:
        # some element along the chain does not exist, so there is no link
        return None
    # '#' marks a fake link; treat it the same as no link at all
    return None if href == '#' else href
    
# make sure reference link exists 
def check_ref_lhub(ref):
    """Return the href found at ``ref.ul.li.a``, or None when there is none.

    A missing element anywhere along ``ul -> li -> a`` and the placeholder
    href ``'#'`` both yield None.

    NOTE(review): this performs exactly the same lookup as check_ref_plos.
    """
    href = None
    try:
        href = ref.ul.li.a.get('href')
    except AttributeError:
        # no link element present; href stays None
        pass
    if href == '#':
        # placeholder link, not a real reference
        href = None
    return href

In [20]:
def _extract_plos(soup):
    """Return (title, abstract, doi, reference links) from a parsed PLOS article page."""
    title = soup.find('h1', id='artTitle').get_text()
    # only the first <p> inside the abstract container is used
    abstract = soup.find('div', 'abstract-content').p.get_text()
    doi = soup.find('li', id="artDoi").a.get_text()

    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    ref_items = soup.find('ol', 'references').find_all('li', id=re.compile(r'^ref\d+'))
    candidate_links = (check_ref_plos(ref) for ref in ref_items)
    references = [href for href in candidate_links if href is not None]
    return title, abstract, doi, references


def _extract_elsevier(soup):
    """Return (title, abstract, doi, reference links) from a parsed Elsevier article page."""
    title = soup.find('h1', id='screen-reader-main-title').span.get_text()
    # the abstract is the concatenated text of every paragraph in the abstracts block
    abstract = ''.join(p.get_text() for p in soup.find('div', id='abstracts').find_all('p'))
    doi = soup.find('div', id='article-identifier-links').find('a', 'doi').get_text()

    references = []
    for ref in soup.find('dl', id='reference-links-bibs005').find_all('dd', 'reference'):
        links_div = ref.find('div', 'ReferenceLinks u-font-sans')
        crossref = links_div.find('a', string='CrossRef') if links_div is not None else None
        # Skip references without a CrossRef link instead of failing on None,
        # mirroring how the PLOS branch skips references with no usable link.
        if crossref is not None:
            references.append(crossref.get('href'))
    return title, abstract, doi, references


def main(timeout=30):
    """Scrape title, abstract, DOI and reference links for every row of ``df``.

    Results are stored in the module-level lists ``titles``, ``abstracts``,
    ``dois``, ``all_references`` and ``journal_names``. The lists are emptied
    in place first, so running main() again replaces the previous results
    rather than appending a second copy. The total number of reference links
    is logged and printed.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each HTTP request before requests raises a
        timeout error. Defaults to 30.

    Raises
    ------
    AttributeError
        If an expected element is missing from a fetched page.
    requests.exceptions.RequestException
        If a page cannot be fetched.
    """
    # Start from a clean slate so a re-run does not duplicate entries.
    for collected in (abstracts, all_references, dois, journal_names, titles):
        collected.clear()

    extractors = {
        "journals.plos.org": _extract_plos,
        "linkinghub.elsevier.com": _extract_elsevier,
    }

    for _, row in df.iterrows():
        link = row["link"]
        domain = urlparse(link).netloc

        # NOTE(review): journal_names only grows when isInRef accepts the
        # link's domain, so it can be shorter than the other result lists.
        if isInRef(domain):
            journal_names.append(journal_ref[domain])

        page = requests.get(link, headers=headers, timeout=timeout).content
        soup = BeautifulSoup(page, 'html.parser')

        extract = extractors.get(row["journal_domain"])
        if extract is None:
            # no scraper for this journal: nothing is recorded for the row
            continue

        # Extract all four fields before appending any, so the result lists
        # stay the same length even if parsing one page fails part-way.
        title, abstract, doi, local_references = extract(soup)
        titles.append(title)
        abstracts.append(abstract)
        dois.append(doi)
        all_references.append(local_references)

    # count references
    count = sum(len(refs) for refs in all_references)

    log(f'Ran get_contents.2.ipynb succesfully, finding {count} references')
    print(count)
        


In [21]:
# Run the scraper over every row of df; the number printed below is the
# total count of reference links collected.
main()

276


In [16]:
# Attach the scraped lists to df as new columns, one list per column,
# in the same order as before: title, abstract, references, doi.
scraped_columns = {
    "title": titles,
    "abstract": abstracts,
    "references": all_references,
    "doi": dois,
}
for column_name, column_values in scraped_columns.items():
    df[column_name] = column_values

In [17]:
# Display the frame with the newly attached title/abstract/references/doi columns.
df

Unnamed: 0,link,journal_domain,title,abstract,references,doi
11,https://doi.org/10.1371/journal.pone.0174944,journals.plos.org,Can machine-learning improve cardiovascular ri...,Current approaches to predict cardiovascular r...,"[https://doi.org/10.1136/bmj.39609.449676.25, ...",https://doi.org/10.1371/journal.pone.0174944
14,https://doi.org/10.1371/journal.pone.0181142,journals.plos.org,"""What is relevant in a text document?"": An int...",Text documents can be described by a number of...,"[https://doi.org/10.1108/eb026526, https://doi...",https://doi.org/10.1371/journal.pone.0181142
110,https://doi.org/10.1371/journal.pone.0164270,journals.plos.org,Reversible Cryopreservation of Living Cells Us...,Rapid cooling of aqueous solutions is a useful...,"[https://doi.org/10.1038/164666a0, https://doi...",https://doi.org/10.1371/journal.pone.0164270
174,https://doi.org/10.1371/journal.pone.0144916,journals.plos.org,Improving Cycling Performance: Transcranial Di...,The central nervous system seems to have an im...,[https://doi.org/10.1152/japplphysiol.91324.20...,https://doi.org/10.1371/journal.pone.0144916
191,https://doi.org/10.1371/journal.pone.0089157,journals.plos.org,Changes in Voluntary Activation Assessed by Tr...,Maximal central motor drive is known to decrea...,[],https://doi.org/10.1371/journal.pone.0089157
265,https://doi.org/10.1371/journal.pone.0010271,journals.plos.org,Do Pressures to Publish Increase Scientists' B...,The growing competition and “publish or perish...,[],https://doi.org/10.1371/journal.pone.0010271
270,https://doi.org/10.1371/journal.pcbi.1005993,journals.plos.org,Automated plant species identification—Trends ...,Current rates of species loss triggered numero...,"[https://doi.org/10.1371/journal.pbio.1001127,...",https://doi.org/10.1371/journal.pcbi.1005993
271,https://doi.org/10.1371/journal.pone.0192011,journals.plos.org,Ant genera identification using an ensemble of...,Works requiring taxonomic knowledge face sever...,"[https://doi.org/10.1098/rstb.2003.1440, https...",https://doi.org/10.1371/journal.pone.0192011
285,https://doi.org/10.1371/journal.pone.0118723,journals.plos.org,Step Detection and Activity Recognition Accura...,The aim of this study was to compare the seven...,"[https://doi.org/10.1249/MSS.0b013e3182399bc8,...",https://doi.org/10.1371/journal.pone.0118723
287,https://doi.org/10.1371/journal.pone.0124414,journals.plos.org,Feature Selection for Wearable Smartphone-Base...,"Human activity recognition (HAR), using wearab...","[https://doi.org/10.1007/s12668-013-0088-3, ht...",https://doi.org/10.1371/journal.pone.0124414


In [18]:
# Write the enriched table as a tab-separated file, leaving out the index.
df.to_csv(path_or_buf='results.2.csv', index=False, sep='\t')

In [19]:
# Number of per-article reference lists collected (one entry per scraped article).
len(all_references)

10