# Get Article Pages from the Scientific Data Journal

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

from datetime import datetime
import time
from tqdm import tqdm

from IPython.display import display, HTML, Markdown


In [2]:
# Test on a single paper: fetch one article page and show the parsed HTML.
doi = "10.1038/s41597-020-00638-4" # https://www.nature.com/articles/s41597-020-00638-4

# The article URL is built from the DOI suffix (the part after the first "/").
url = f"https://www.nature.com/articles/{doi.split('/')[1]}"

# A timeout keeps a stalled connection from blocking the kernel forever.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the article.
r.raise_for_status()
soup = bs(r.text, 'html.parser')
display(soup)

<!DOCTYPE html>

<html class="grade-c" lang="en">
<head>
<title>AiiDA 1.0, a scalable computational infrastructure for automated reproducible workflows and data provenance | Scientific Data</title>
<link href="https://www.nature.com/sdata.rss" rel="alternate" type="application/rss+xml"/>
<link crossorigin="" href="https://cmp.nature.com" rel="preconnect"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="pc,mobile" name="applicable-device"/>
<meta content="width=device-width,initial-scale=1.0,maximum-scale=5,user-scalable=yes" name="viewport"/>
<meta content="5a2dc4ab3fcb9b0393241ffbbb490480" name="360-site-verification">
<script data-test="dataLayer">
    window.dataLayer = [{"content":{"category":{"contentType":"article","legacy":{"webtrendsPrimaryArticleType":"research","webtrendsSubjectTerms":"computational-methods;research-management","webtrendsContentCategory":null,"webtrendsContentCollection":null,"webtrendsContentGroup":"Scientific Data","webtrendsContentGr

In [3]:
# Create the placeholder CSV files (one row per DOI) that the parsing cell fills in.
# Set FIRST_TIME to a truthy value only for the very first run: the branch below
# rewrites all four files with "-" / 0 placeholders.
FIRST_TIME = 0

if not FIRST_TIME:
    print("Skipping file initilization")
else:
    # Only the DOI column of the article list is needed as the key of every file.
    doi_column = pd.read_csv("../database/nature_sdata.csv")[['DOI']]

    # Metrics file: parsing date placeholder plus three zeroed counters.
    metrics_init = doi_column.copy()
    metrics_init[['ParsingDate', 'Accesses', 'Citations', 'Altmetric']] = ["-", 0, 0, 0]
    metrics_init.to_csv("../database/metrics.csv", index=False)

    # Text files: a single column holding the "-" placeholder until it is parsed.
    text_files = (
        ("../database/abstracts.csv", 'Abstract'),
        ("../database/data_availability.csv", 'DataAvailability'),
        ("../database/code_availability.csv", 'CodeAvailability'),
    )
    for csv_path, column_name in text_files:
        doi_column.assign(**{column_name: "-"}).to_csv(csv_path, index=False)

    print("Files initialized now")

Skipping file initilization


In [5]:
# First time parsing
#
# Scrape every article page that still has a "-" placeholder in any of the four
# CSV files, and write the results back to disk after each article so that an
# interrupted run can be resumed simply by re-running this cell.

REQUEST_TIMEOUT_S = 30  # abort a stalled request instead of hanging the whole run
REQUEST_DELAY_S = 1     # pause between consecutive requests
MISSING = "MISSING"     # stored when a page has no such section
DEBUG = False           # True: print the first parsed article and stop

METRICS_CSV = "../database/metrics.csv"
ABSTRACTS_CSV = "../database/abstracts.csv"
DATA_AVAILABILITY_CSV = "../database/data_availability.csv"
CODE_AVAILABILITY_CSV = "../database/code_availability.csv"


def _parse_count(text):
    """Convert a displayed counter such as '4567', '1,234', '12k' or '1.5k' to an int.

    Returns 0 when the text is empty or its first token is not a number.
    """
    tokens = text.split()
    if not tokens:
        return 0
    token = tokens[0].lower().replace(",", "")
    multiplier = 1
    if token.endswith("k"):
        multiplier, token = 1_000, token[:-1]
    elif token.endswith("m"):
        # Defensive: same abbreviation scheme one order of magnitude up.
        multiplier, token = 1_000_000, token[:-1]
    try:
        # Going through float keeps decimal abbreviations ("1.5k") parseable.
        return int(round(float(token) * multiplier))
    except ValueError:
        return 0


def _metric_value(soup, attrs):
    """Return the counter found in the <li> matching attrs, or 0 if it is absent."""
    item = soup.find('li', attrs)
    label = item.find('p') if item is not None else None
    return _parse_count(label.text) if label is not None else 0


def _section_text(soup, element_id):
    """Return the text of the <div> with the given id, or MISSING if it is absent."""
    section = soup.find('div', {'id': element_id})
    if section is None:
        return MISSING
    return section.get_text(separator=' ', strip=True)


# Load each file once, indexed by DOI; the frames are updated in memory and
# written back after every article (no need to re-read them on each iteration).
df_metrics = pd.read_csv(METRICS_CSV, index_col='DOI')
df_abs = pd.read_csv(ABSTRACTS_CSV, index_col='DOI')
df_da = pd.read_csv(DATA_AVAILABILITY_CSV, index_col='DOI')
df_ca = pd.read_csv(CODE_AVAILABILITY_CSV, index_col='DOI')

# get all the DOIs that were not retrieved yet
doi_metrics = df_metrics.query("ParsingDate == '-'").index.tolist()
doi_abs = df_abs.query("Abstract == '-'").index.tolist()
doi_da = df_da.query("DataAvailability == '-'").index.tolist()
doi_ca = df_ca.query("CodeAvailability == '-'").index.tolist()
# dict.fromkeys de-duplicates while keeping first-seen order, so runs are deterministic
unique_dois = list(dict.fromkeys(doi_metrics + doi_abs + doi_da + doi_ca))
print(f"Number of unique DOIs to parse: {len(unique_dois)}")

# attributes identifying the <li> element that holds each counter on the page
metrics = {
    'Accesses': {'data-test': 'access-count'},
    'Citations': {'data-test': 'citation-count'},
    'Altmetric': {'data-test': 'altmetric-score'}
}

# for each page add paper_info to metrics.csv and abstract to abstracts.csv
for doi in tqdm(unique_dois):
    url = f"https://www.nature.com/articles/{doi.split('/')[1]}"

    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT_S)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Leave the "-" placeholders untouched so this DOI is retried on the next
        # run, instead of recording zeros/MISSING scraped from an error page.
        tqdm.write(f"Skipping {doi} ({url}): {exc}")
        time.sleep(REQUEST_DELAY_S)
        continue

    soup = bs(r.text, 'html.parser')

    paper_info = {
        'DOI': doi,
        'ParsingDate': datetime.now().strftime("%Y-%m-%d")
    }
    for key, value in metrics.items():
        paper_info[key] = _metric_value(soup, value)

    abstract_text = _section_text(soup, 'Abs1-content')

    # TODO: this can be improved: the references, where the data/code links may be present, could also be parsed instead of keeping only the reference number
    # TODO: in older articles this has a different structure, but I'm not sure that "Data Records" is the same thing
    code_availability_text = _section_text(soup, 'code-availability-content')
    data_availability_text = _section_text(soup, 'data-availability-content')

    # Update the in-memory frames and checkpoint them to disk right away.
    df_metrics.loc[doi] = paper_info
    df_metrics.to_csv(METRICS_CSV)

    df_abs.loc[doi, "Abstract"] = abstract_text
    df_abs.to_csv(ABSTRACTS_CSV)

    df_da.loc[doi, "DataAvailability"] = data_availability_text
    df_da.to_csv(DATA_AVAILABILITY_CSV)

    df_ca.loc[doi, "CodeAvailability"] = code_availability_text
    df_ca.to_csv(CODE_AVAILABILITY_CSV)

    if DEBUG:
        print(f"DOI: {doi}")
        print(f"URL: {url}")
        print(f"Metrics: {paper_info}")
        display(HTML(f"<div style='background-color: black; color: white; width: 1000px;'>Abstract: {abstract_text}</div>"))
        display(HTML(f"<div style='background-color: black; color: white; width: 1000px;'>Data Availability: {data_availability_text}</div>"))
        display(HTML(f"<div style='background-color: black; color: white; width: 1000px;'>Code Availability: {code_availability_text}</div>"))
        break

    time.sleep(REQUEST_DELAY_S)

Number of unique DOIs to parse: 4255


100%|██████████| 4255/4255 [2:43:39<00:00,  2.31s/it]  
