# Get papers from Scientific Data journal

https://www.nature.com/sdata/

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from tqdm import tqdm
import time

In [3]:
# Fist time scraping: creating separate csv files for each page
for ipage in tqdm(range(16, 214+1)): # took about 15 minutes with 2 seconds sleep
    url = f"https://www.nature.com/sdata/research-articles?searchType=journalSearch&sort=PubDate&page={ipage}"
    requests.get(url)

    soup = bs(requests.get(url).text, 'html.parser')

    #print(soup.prettify())
    # Find the relevant sections
    articles = soup.find_all('article', class_='u-full-height c-card c-card--flush')

    # List to store the extracted information
    data = []

    # Extract information for each article
    for article in articles:
        # Extract DOI
        doi_tag = article.find('a', class_='c-card__link u-link-inherit')
        doi = "10.1038/" + doi_tag['href'].split('/')[-1] if doi_tag else 'N/A'

        # Extract date
        date_tag = article.find('time')
        date = date_tag['datetime'] if date_tag else 'N/A'

        # Extract kind
        kind_tag = article.find('span', class_='c-meta__type')
        kind = kind_tag.text.strip() if kind_tag else 'N/A'

        # Extract title
        title_tag = article.find('h3', class_='c-card__title')
        title = title_tag.text.strip() if title_tag else 'N/A'

        # Extract authors
        authors_tag = article.find('ul', class_='c-author-list')
        authors = ' | '.join([author.text.strip() for author in authors_tag.find_all('li')]) if authors_tag else 'N/A'

        # Extract image URL
        image_div = article.find('div', class_='c-card__image')
        image_tag = image_div.find('img') if image_div else None
        image_url = image_tag['src'] if image_tag else 'N/A'

        # Append the extracted information to the list
        data.append({
            'DOI': doi,
            'Date': date,
            'Kind': kind,
            'Title': title,
            'Authors': authors,
            'Image URL': image_url
        })

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(data)
    df.to_csv(f'../database/page_{ipage:03d}.csv', index=False)
    time.sleep(2)
    

100%|██████████| 199/199 [11:49<00:00,  3.56s/it]


In [8]:
# merge of all csv files (that I later manually deleted)
df = pd.concat([pd.read_csv(f'../database/page_{ipage:03d}.csv') for ipage in range(1, 214+1)], ignore_index=True)
df = df.sort_values(by='Date', ascending=True).reset_index(drop=True)
print("Number of duplicates:", df.duplicated().sum())  
df = df.drop_duplicates()
df.to_csv('../database/nature_sdata.csv', index=False)
display(df)

Unnamed: 0,DOI,Date,Kind,Title,Authors,Image URL
0,10.1038/sdata20142,2014-03-11,Data Descriptor,The systematic identification of cytoskeletal ...,Alexander D Perkins | Michael J J Lee | Guy Ta...,https://media.springernature.com/w290h158/spri...
1,10.1038/sdata20141,2014-03-11,Data Descriptor,Global integrated drought monitoring and predi...,Zengchao Hao | Amir AghaKouchak | Alireza Fara...,https://media.springernature.com/w290h158/spri...
2,10.1038/sdata20142,2014-03-11,Data Descriptor,The systematic identification of cytoskeletal ...,Alexander D Perkins | Michael J J Lee | Guy Ta...,https://media.springernature.com/w290h158/spri...
3,10.1038/sdata20141,2014-03-11,Data Descriptor,Global integrated drought monitoring and predi...,Zengchao Hao | Amir AghaKouchak | Alireza Fara...,https://media.springernature.com/w290h158/spri...
4,10.1038/sdata20146,2014-05-27,Data Descriptor,microclim: Global estimates of hourly microcli...,Michael R Kearney | Andrew P Isaac | Warren P ...,https://media.springernature.com/w290h158/spri...
...,...,...,...,...,...,...
4261,10.1038/s41597-024-03657-7,2024-08-22,Data Descriptor,A high-quality chromosome-level genome assembl...,Huali Zhao | Di-an Fang | Dongpo Xu,https://media.springernature.com/w290h158/spri...
4262,10.1038/s41597-024-03786-z,2024-08-23,Data Descriptor,UK Reproducibility Network open and transparen...,Lukas Hughes-Noehrer | Noémie Aubert Bonn | An...,https://media.springernature.com/w290h158/spri...
4263,10.1038/s41597-024-03739-6,2024-08-23,Data Descriptor,MAPLES-DR: MESSIDOR Anatomical and Pathologica...,Gabriel Lepetit-Aimon | Clément Playout | Fari...,https://media.springernature.com/w290h158/spri...
4264,10.1038/s41597-024-03750-x,2024-08-23,Data Descriptor,Generating high-resolution land use and land c...,Améline Vallet | Stéphane Dupuy | Raffaele Gae...,https://media.springernature.com/w290h158/spri...


In [13]:
# download images
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    url = row['Image URL']
    if pd.isna(url):
        continue
    name = url.split('/')[-1]
    if url != 'N/A':
        with open(f'../database/images/{name}', 'wb') as f:
            f.write(requests.get(url).content)
    time.sleep(0)

  0%|          | 0/4266 [00:00<?, ?it/s]

100%|██████████| 4266/4266 [21:34<00:00,  3.30it/s]
