## Find abstract, title, DOI, and references

In [1]:
from bs4 import BeautifulSoup

In [2]:
import requests

In [3]:
import pandas as pd
import re

In [4]:
from functions import log, isInRef

In [5]:
from urllib.parse import urlparse

In [6]:
# Table of article links to scrape (tab-separated; main() iterates over its `link` column).
df = pd.read_csv("links.1.csv", sep="\t")

In [7]:
# Lookup table used to turn a link's domain into a journal name:
#     journal_ref[domain] -> journal name
journals_df = pd.read_csv("domain_to_journal.csv", sep="\t")
journal_ref = pd.Series(
    data=journals_df["journal"].values,
    index=journals_df["domain"],
).to_dict()

In [8]:
# HTTP headers sent with every page request; the User-Agent imitates a desktop browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    ),
}

In [9]:
# Result accumulators that main() appends to while scraping.
# all_references is nested: it receives one list of reference links per article.
abstracts, all_references, dois, journal_names, titles = ([] for _ in range(5))

In [10]:
# make sure reference link exists 
def check_ref(ref):
    """Return the usable link of one reference entry, or None.

    The link is read from the ``href`` of the element reached through
    ``ref.ul.li.a``. None is returned when any step of that chain is
    missing or when the href is the placeholder ``'#'``.
    """
    try:
        href = ref.ul.li.a.get('href')
    except AttributeError:
        # Some element along ul > li > a does not exist for this reference.
        return None
    # A bare '#' is a dummy anchor, not a real target.
    return None if href == '#' else href

In [11]:
def main(request_timeout=30):
    """Scrape title, abstract, DOI, journal name and reference links for every link in ``df.link``.

    Results are stored in the module-level lists ``titles``, ``abstracts``,
    ``dois``, ``journal_names`` and ``all_references``. Each list receives
    exactly one entry per successfully processed link, so they stay aligned
    with each other. The total number of reference links is logged and printed.

    Args:
        request_timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.HTTPError: If a page answers with an HTTP error status.
        AttributeError: If a page lacks one of the expected elements (title,
            abstract, DOI or reference list), since the lookups are chained
            without a None check.
    """
    # Raw string so that \d reaches the regex engine as written instead of
    # relying on Python tolerating an invalid string escape. Compiled once
    # because it does not change between articles.
    ref_id_pattern = re.compile(r'^ref\d+')

    # Start from empty accumulators: re-running this cell would otherwise
    # append a second copy of every article and the lists would no longer
    # match the number of rows in df.
    for accumulator in (abstracts, all_references, dois, journal_names, titles):
        accumulator.clear()

    for link in df.link:
        domain = urlparse(link).netloc

        # Record a journal entry for every link (None when the domain is not
        # recognised) so journal_names cannot end up shorter than the other lists.
        # NOTE(review): isInRef presumably tests membership in the
        # domain-to-journal table - confirm in functions.py.
        journal = journal_ref[domain] if isInRef(domain) else None

        # Without a timeout a stalled server would hang the notebook forever.
        response = requests.get(link, headers=headers, timeout=request_timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        title = soup.find('h1', id='artTitle').get_text()
        abstract = soup.find('div', 'abstract-content').p.get_text()
        doi = soup.find('li', id="artDoi").a.get_text()

        # Keep only the references for which check_ref found a real link.
        reference_items = soup.find('ol', 'references').find_all('li', id=ref_id_pattern)
        local_references = [
            href for href in map(check_ref, reference_items) if href is not None
        ]

        # Append only after everything was extracted, so a failure part-way
        # through an article cannot leave the lists with different lengths.
        titles.append(title)
        abstracts.append(abstract)
        dois.append(doi)
        journal_names.append(journal)
        all_references.append(local_references)

    # count references
    count = sum(len(refs) for refs in all_references)

    log(f'Ran get_contents.2.ipynb succesfully, finding {count} references')
    print(count)
        


In [12]:
# Run the scrape; the number printed below is the total count of reference links found.
main()

382


In [13]:
# Attach the scraped values to the links table, one new column per result list.
scraped_columns = {
    "title": titles,
    "abstract": abstracts,
    "references": all_references,
    "journal": journal_names,
    "doi": dois,
}
for column_name, column_values in scraped_columns.items():
    df[column_name] = column_values

In [14]:
# Display the links table together with the newly added scraped columns.
df

Unnamed: 0,link,title,abstract,references,journal,doi
0,https://journals.plos.org/plosone/article?id=1...,SinGAN-Seg: Synthetic training data generation...,Analyzing medical data to find abnormalities i...,"[https://doi.org/10.1136/svn-2017-000101, http...",PLOS,https://doi.org/10.1371/journal.pone.0267976
1,https://journals.plos.org/plosone/article?id=1...,Modeling transport of extended interacting obj...,We study a deterministic framework for importa...,"[https://doi.org/10.1016/j.bpj.2009.01.015, ht...",PLOS,https://doi.org/10.1371/journal.pone.0267858
2,https://journals.plos.org/plosone/article?id=1...,Using deep transfer learning to detect scolios...,Recent years have witnessed wider prevalence o...,"[https://doi.org/10.1007/s12652-019-01312-3, h...",PLOS,https://doi.org/10.1371/journal.pone.0267851
3,https://journals.plos.org/plosone/article?id=1...,Pressure vessel-oriented visual inspection met...,The detection of surface parameters of pressur...,"[https://doi.org/10.1109/ias.1995.531101, http...",PLOS,https://doi.org/10.1371/journal.pone.0267743
4,https://journals.plos.org/plosone/article?id=1...,Comparison of public discussions of gene editi...,The world’s first gene-edited babies event has...,"[https://doi.org/10.1038/d41586-020-02765-9, h...",PLOS,https://doi.org/10.1371/journal.pone.0267406
5,https://journals.plos.org/plosone/article?id=1...,Software reliability model of open source soft...,Open source software (OSS) has become one of t...,"[https://doi.org/10.1002/9781119821779.ch5, ht...",PLOS,https://doi.org/10.1371/journal.pone.0267171
6,https://journals.plos.org/plosone/article?id=1...,Study of Asian indexes by a newly derived dyna...,We take the stock prices as a dynamic system a...,"[https://doi.org/10.2307/1907042, https://doi....",PLOS,https://doi.org/10.1371/journal.pone.0266600
7,https://journals.plos.org/plosone/article?id=1...,"Awareness, perception and perpetration of cybe...","The modern online society requires everyone, e...",[https://doi.org/10.1111/j.1746-1561.2008.0033...,PLOS,https://doi.org/10.1371/journal.pone.0267702
8,https://journals.plos.org/plosone/article?id=1...,Validation pipeline for machine learning algor...,A standardized objective evaluation method is ...,"[https://doi.org/10.1109/TMI.2014.2377694, htt...",PLOS,https://doi.org/10.1371/journal.pone.0267213
9,https://journals.plos.org/plosone/article?id=1...,Identifying luminal and basal mammary cell spe...,Mammary gland is present in all mammals and us...,"[https://doi.org/10.1002/wdev.35, https://doi....",PLOS,https://doi.org/10.1371/journal.pone.0267211


In [15]:
# Persist the enriched table as a tab-separated file, without the row index.
df.to_csv("results.2.csv", sep="\t", index=False)

In [16]:
# Number of per-article reference lists collected (main() appends one per link).
# NOTE(review): the recorded value (13) exceeds the 10 rows displayed above, which
# suggests stale kernel state or a repeated main() run - confirm with Restart & Run All.
len(all_references)

13