In [1]:
from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
import json
import pandas as pd

INFO:paperscraper.load_dumps:Loaded biorxiv dump with 367505 entries
INFO:paperscraper.load_dumps:Loaded medrxiv dump with 78791 entries


In [None]:
# Run paperscraper's medrxiv() routine (imported from paperscraper.get_dumps above).
# NOTE(review): presumably this is what produces the medrxiv_*.jsonl dump that a
# later cell reads from the package's server_dumps folder — confirm.
medrxiv()

In [None]:
# Run paperscraper's biorxiv() routine (imported from paperscraper.get_dumps above).
# NOTE(review): presumably this is what produces the biorxiv_*.jsonl dump that a
# later cell reads from the package's server_dumps folder — confirm.
biorxiv()

In [16]:
import json
import pandas as pd

# NOTE(review): absolute, user-specific path into a conda environment's site-packages;
# the notebook will not run elsewhere until this is made configurable.
med_file_path = '/nfs/users/nfs_b/bc13/anaconda3/envs/juniper/lib/python3.10/site-packages/paperscraper/server_dumps/medrxiv_2025-02-02.jsonl'

# Parse the JSON-lines dump: one JSON object per non-empty line.
# The encoding is pinned to UTF-8 so parsing does not depend on the machine's locale,
# and blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
medpapers = []
with open(med_file_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        meddata = json.loads(line)
        medpapers.append(meddata)

med_df = pd.DataFrame(medpapers)
# Keep only the first occurrence of each DOI (assignment instead of inplace=True so
# the cell reads as a plain transformation of the freshly built frame).
med_df = med_df.drop_duplicates(subset=['doi'])

print(med_df.head())
print(len(med_df))

                                               title               doi  \
0  Molecular profiling of neonatal dried blood sp...  10.1101/19000109   
5  Crohns disease and ulcerative colitis patient ...  10.1101/19000273   
6  Updating Insights into Rosiglitazone and Cardi...  10.1101/19000463   
7  Predicting epileptic seizures using nonnegativ...  10.1101/19000430   
8  Prospective and External Evaluation of a Machi...  10.1101/19000133   

                                             authors  \
0  Daniel Costa; Nuria Bonet; Amanda Sole; Jose M...   
5  Orna G Ehrlich; James Testaverde; Caren Heller...   
6  Joshua D Wallach; Kun Wang; Audrey D Zhang; De...   
7                    Olivera Stojanovic; Gordon Pipa   
8  Nathan Brajer; Brian Cozzi; Michael Gao; Mike ...   

                                            abstract        date journal  
0  The fetal inflammatory response (FIR) increase...  2019-06-25          
5  BackgroundClinical trial recruitment is often ...  2019-06-25    

In [8]:
# NOTE(review): absolute, user-specific path into a conda environment's site-packages;
# the notebook will not run elsewhere until this is made configurable.
bio_file_path = '/nfs/users/nfs_b/bc13/anaconda3/envs/juniper/lib/python3.10/site-packages/paperscraper/server_dumps/biorxiv_2025-02-02.jsonl'

# Parse the JSON-lines dump: one JSON object per non-empty line.
# The encoding is pinned to UTF-8 so parsing does not depend on the machine's locale,
# and blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
biopapers = []
with open(bio_file_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        biodata = json.loads(line)
        biopapers.append(biodata)

bio_df = pd.DataFrame(biopapers)
# Keep only the first occurrence of each DOI (assignment instead of inplace=True so
# the cell reads as a plain transformation of the freshly built frame).
bio_df = bio_df.drop_duplicates(subset=['doi'])

print(bio_df.head())
print(len(bio_df))

                                               title             doi  \
0  Population genomics of Saccharomyces cerevisia...  10.1101/001891   
1  Estimating seed bank accumulation and dynamics...  10.1101/001867   
2  How and where to look for tRNAs in Metazoan mi...  10.1101/001875   
4  Tracking global changes induced in the CD4 T c...  10.1101/001883   
5  The shrinking human protein coding complement:...  10.1101/001909   

                                             authors  \
0  Carlotta De Filippo;Monica Di Paola;Irene Stef...   
1    Meaghan E. Jenkins;David Morrison;Tony D. Auld;   
2                                    David Morrison;   
4  Niclas Thomas;Katharine Best;Mattia Cinelli;Sh...   
5  Iakes Ezkurdia;David Juan;Jose Manuel Rodrigue...   

                                            abstract        date journal  
0  The quest for the ecological niches of Sacchar...  2014-01-17          
1  The seed bank dynamics of the three co-occurri...  2014-01-17          
2  Th

In [9]:
# Keep the leading rows of the bioRxiv frame (DataFrame.head() with its default row count)
# under their own name.
# NOTE(review): bio_head is not referenced by any other cell visible in this notebook.
bio_head = bio_df.head()

In [12]:
import requests
import json
from urllib.parse import quote
import time
from tqdm import tqdm
import pandas as pd

def fetch_reference_count(doi, timeout=30, session=None):
    """
    Fetches the 'is-referenced-by-count' for a given DOI from the Crossref API.

    All failures are reported with a printed message and turned into a ``None``
    return value, so a long batch run is never aborted by a single bad record.

    Args:
        doi (str): The DOI string to look up.
        timeout (float or tuple): Timeout in seconds handed to the HTTP call, so a
            stalled connection cannot hang the caller indefinitely.
        session: Optional object exposing a requests-compatible ``get`` method
            (e.g. a ``requests.Session``) to reuse connections across many calls.
            When None, the module-level ``requests.get`` is used.

    Returns:
        int or None: The references count if successful, None if there was an error
        (invalid DOI value, HTTP/network failure, malformed or unexpected response).
    """
    # Missing values (None / NaN coming out of pandas) and blank strings cannot be
    # looked up; without this guard they would raise an uncaught TypeError further down.
    if not isinstance(doi, str) or not doi.strip():
        print(f"Warning: skipping invalid DOI value: {doi!r}")
        return None

    base_url = "https://api.crossref.org/works/"
    # URL encode the DOI to handle special characters ('/' is left as-is by quote's default)
    encoded_doi = quote(doi)
    api_url = base_url + encoded_doi

    headers = {
        'User-Agent': 'DOIReferenceCounter/1.0 (https://example.org/DOIReferenceCounter; mailto:your-email@example.org)' # Replace with your info
    }

    http = requests if session is None else session

    try:
        response = http.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)

        data = response.json()

        if data['status'] == 'ok':
            message = data['message']
            if 'is-referenced-by-count' in message:
                return message['is-referenced-by-count']
            else:
                print(f"Warning: 'is-referenced-by-count' not found in the response for DOI: {doi}")
                return None
        else:
            print(f"Error: API request was not 'ok' for DOI: {doi}. Status: {data['status']}")
            return None

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred for DOI: {doi}: {http_err}")
        # `response` is bound here: HTTPError is raised by raise_for_status() above.
        if response.status_code == 404:
            print(f"  DOI not found: {doi}") # Specific message for 404
        elif response.status_code == 429:
            print(f"  Rate limit exceeded (429) for DOI: {doi}. You might be making requests too fast.")
        return None
    except json.JSONDecodeError as json_err:
        # Checked before RequestException: recent requests releases raise a JSON error
        # that derives from both classes, which would otherwise hide this specific message.
        print(f"JSON decode error for DOI: {doi}: {json_err}. Response text: {response.text}")
        return None
    except requests.exceptions.RequestException as req_err:
        print(f"Request error occurred for DOI: {doi}: {req_err}")
        return None
    except KeyError as key_err:
        print(f"Key error accessing JSON for DOI: {doi}: {key_err}.  Check the API response structure.")
        return None


def add_citation_counts_to_df(df):
    """
    Fetches citation counts for DOIs in the input dataframe and adds them as a new column.
    Operates on a copy of the input DataFrame, so the caller's frame is not modified.

    Each distinct DOI is looked up exactly once via ``fetch_reference_count`` and the
    result is then broadcast to every row carrying that DOI. Rows whose DOI is missing
    are not looked up and receive ``None``.

    Args:
        df (pd.DataFrame): DataFrame containing a 'doi' column.

    Returns:
        pd.DataFrame: A *copy* of the DataFrame with an added 'citation_number' column
        (object dtype) holding the fetched count, or None where no count was obtained.
    """
    df_copy = df.copy() # Create a copy of the input DataFrame

    # Distinct, non-missing DOIs in order of first appearance: one request per DOI.
    doi_list = df_copy["doi"].dropna().unique().tolist()

    print("Fetching reference counts for DOIs and adding to DataFrame (aiming for ~50 requests/second):")

    counts_by_doi = {}
    request_count = 0
    start_time = time.time()

    for doi in tqdm(doi_list, total=len(doi_list), desc="Processing DOIs"): # Wrap with tqdm
        counts_by_doi[doi] = fetch_reference_count(doi)

        request_count += 1
        elapsed_time = time.time() - start_time

        # Simple windowed throttle: after every 50 requests, wait out the remainder
        # of the current one-second window (if any) before starting a new window.
        if request_count >= 50:
            if elapsed_time < 1:
                time.sleep(1 - elapsed_time)
            start_time = time.time()
            request_count = 0

    # Assign positionally (row order) rather than through index labels, so frames with
    # duplicated index labels still get exactly one value per row. The object dtype
    # keeps integer counts and None side by side without coercing counts to float.
    citation_numbers = [counts_by_doi.get(doi) for doi in df_copy["doi"]]
    df_copy['citation_number'] = pd.Series(citation_numbers, dtype=object).to_numpy()

    return df_copy

In [17]:
# Attach Crossref citation counts to the de-duplicated medRxiv frame, then write the
# enriched table to disk without the index column.
final_med_df = add_citation_counts_to_df(med_df)
final_med_df.to_csv(path_or_buf="medrxiv_citations.csv", index=False)

print("medRxiv DataFrame with citation counts saved as CSV.")

Fetching reference counts for DOIs and adding to DataFrame (aiming for ~50 requests/second):


Processing DOIs:   8%|▊         | 4881/63018 [25:36<5:04:22,  3.18it/s] 

HTTP error occurred for DOI: 10.1101/2020.05.09.20096354: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2020.05.09.20096354
  DOI not found: 10.1101/2020.05.09.20096354


Processing DOIs:   8%|▊         | 5267/63018 [27:37<4:58:26,  3.23it/s]

HTTP error occurred for DOI: 10.1101/2020.05.14.20099234: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2020.05.14.20099234
  DOI not found: 10.1101/2020.05.14.20099234


Processing DOIs:   8%|▊         | 5268/63018 [27:37<4:56:29,  3.25it/s]

HTTP error occurred for DOI: 10.1101/2020.05.13.20101113: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2020.05.13.20101113
  DOI not found: 10.1101/2020.05.13.20101113


Processing DOIs:  23%|██▎       | 14514/63018 [1:15:42<4:06:01,  3.29it/s]

HTTP error occurred for DOI: 10.1101/2020.12.11.20245035: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2020.12.11.20245035
  DOI not found: 10.1101/2020.12.11.20245035


Processing DOIs:  23%|██▎       | 14577/63018 [1:16:01<5:00:15,  2.69it/s]

HTTP error occurred for DOI: 10.1101/2020.12.15.20248167: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2020.12.15.20248167
  DOI not found: 10.1101/2020.12.15.20248167


Processing DOIs:  36%|███▌      | 22563/63018 [1:57:34<3:24:55,  3.29it/s]

HTTP error occurred for DOI: 10.1101/2021.07.21.21260881: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2021.07.21.21260881
  DOI not found: 10.1101/2021.07.21.21260881


Processing DOIs:  36%|███▌      | 22564/63018 [1:57:34<3:23:16,  3.32it/s]

HTTP error occurred for DOI: 10.1101/2021.07.18.21260738: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2021.07.18.21260738
  DOI not found: 10.1101/2021.07.18.21260738


Processing DOIs:  36%|███▌      | 22570/63018 [1:57:36<3:24:41,  3.29it/s]

HTTP error occurred for DOI: 10.1101/2021.07.21.21260103: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2021.07.21.21260103
  DOI not found: 10.1101/2021.07.21.21260103


Processing DOIs:  36%|███▌      | 22575/63018 [1:57:37<3:25:11,  3.29it/s]

HTTP error occurred for DOI: 10.1101/2021.07.17.21260620: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2021.07.17.21260620
  DOI not found: 10.1101/2021.07.17.21260620


Processing DOIs:  36%|███▌      | 22585/63018 [1:57:40<3:26:16,  3.27it/s]

HTTP error occurred for DOI: 10.1101/2021.07.20.21260850: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2021.07.20.21260850
  DOI not found: 10.1101/2021.07.20.21260850


Processing DOIs:  41%|████      | 25532/63018 [2:12:58<3:11:23,  3.26it/s]

HTTP error occurred for DOI: 10.1101/2021.10.18.21264728: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2021.10.18.21264728
  DOI not found: 10.1101/2021.10.18.21264728


Processing DOIs:  43%|████▎     | 26979/63018 [2:20:30<3:03:42,  3.27it/s]

HTTP error occurred for DOI: 10.1101/2021.12.10.21267608: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2021.12.10.21267608
  DOI not found: 10.1101/2021.12.10.21267608


Processing DOIs:  48%|████▊     | 30266/63018 [2:37:36<2:47:11,  3.26it/s]

HTTP error occurred for DOI: 10.1101/2022.03.17.22272451: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2022.03.17.22272451
  DOI not found: 10.1101/2022.03.17.22272451


Processing DOIs:  50%|████▉     | 31236/63018 [2:42:40<2:42:37,  3.26it/s]

HTTP error occurred for DOI: 10.1101/2022.04.18.22273977: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2022.04.18.22273977
  DOI not found: 10.1101/2022.04.18.22273977


Processing DOIs:  50%|████▉     | 31386/63018 [2:43:27<2:52:51,  3.05it/s]

HTTP error occurred for DOI: 10.1101/2022.04.22.22274171: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2022.04.22.22274171
  DOI not found: 10.1101/2022.04.22.22274171


Processing DOIs:  50%|████▉     | 31391/63018 [2:43:29<2:42:03,  3.25it/s]

HTTP error occurred for DOI: 10.1101/2022.04.22.22274153: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2022.04.22.22274153
  DOI not found: 10.1101/2022.04.22.22274153


Processing DOIs:  50%|█████     | 31787/63018 [2:45:31<2:39:22,  3.27it/s]

HTTP error occurred for DOI: 10.1101/2022.05.10.22274894: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2022.05.10.22274894
  DOI not found: 10.1101/2022.05.10.22274894


Processing DOIs:  51%|█████     | 31974/63018 [2:46:29<2:37:40,  3.28it/s]

HTTP error occurred for DOI: 10.1101/2022.05.10.22274920: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2022.05.10.22274920
  DOI not found: 10.1101/2022.05.10.22274920


Processing DOIs:  51%|█████     | 32132/63018 [2:47:18<2:36:15,  3.29it/s]

HTTP error occurred for DOI: 10.1101/2022.05.18.22275239: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2022.05.18.22275239
  DOI not found: 10.1101/2022.05.18.22275239


Processing DOIs:  52%|█████▏    | 32701/63018 [2:50:15<2:33:30,  3.29it/s]

HTTP error occurred for DOI: 10.1101/2022.06.07.22276118: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2022.06.07.22276118
  DOI not found: 10.1101/2022.06.07.22276118


Processing DOIs:  53%|█████▎    | 33400/63018 [2:53:54<2:31:12,  3.26it/s]

HTTP error occurred for DOI: 10.1101/2022.06.30.22277095: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2022.06.30.22277095
  DOI not found: 10.1101/2022.06.30.22277095


Processing DOIs:  54%|█████▎    | 33733/63018 [2:55:37<2:27:30,  3.31it/s]

HTTP error occurred for DOI: 10.1101/2022.07.12.22277543: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2022.07.12.22277543
  DOI not found: 10.1101/2022.07.12.22277543


Processing DOIs:  61%|██████    | 38486/63018 [3:20:48<2:10:40,  3.13it/s]

HTTP error occurred for DOI: 10.1101/2023.01.17.23284661: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.01.17.23284661
  DOI not found: 10.1101/2023.01.17.23284661


Processing DOIs:  64%|██████▎   | 40091/63018 [3:29:33<1:57:22,  3.26it/s]

HTTP error occurred for DOI: 10.1101/2023.03.09.23287075: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.03.09.23287075
  DOI not found: 10.1101/2023.03.09.23287075


Processing DOIs:  64%|██████▎   | 40123/63018 [3:29:44<2:01:58,  3.13it/s]

HTTP error occurred for DOI: 10.1101/2023.03.13.23287212: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.03.13.23287212
  DOI not found: 10.1101/2023.03.13.23287212


Processing DOIs:  64%|██████▎   | 40136/63018 [3:29:48<2:02:53,  3.10it/s]

HTTP error occurred for DOI: 10.1101/2023.03.13.23287203: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.03.13.23287203
  DOI not found: 10.1101/2023.03.13.23287203


Processing DOIs:  64%|██████▍   | 40347/63018 [3:30:58<2:04:41,  3.03it/s]

HTTP error occurred for DOI: 10.1101/2023.03.17.23287218: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.03.17.23287218
  DOI not found: 10.1101/2023.03.17.23287218


Processing DOIs:  64%|██████▍   | 40357/63018 [3:31:01<2:06:11,  2.99it/s]

HTTP error occurred for DOI: 10.1101/2023.03.19.23287358: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.03.19.23287358
  DOI not found: 10.1101/2023.03.19.23287358


Processing DOIs:  64%|██████▍   | 40378/63018 [3:31:08<2:00:01,  3.14it/s]

HTTP error occurred for DOI: 10.1101/2023.03.21.23287536: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.03.21.23287536
  DOI not found: 10.1101/2023.03.21.23287536


Processing DOIs:  64%|██████▍   | 40404/63018 [3:31:17<2:21:08,  2.67it/s]

HTTP error occurred for DOI: 10.1101/2023.03.22.23287587: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.03.22.23287587
  DOI not found: 10.1101/2023.03.22.23287587


Processing DOIs:  64%|██████▍   | 40431/63018 [3:31:26<1:57:57,  3.19it/s]

HTTP error occurred for DOI: 10.1101/2023.03.23.23287637: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.03.23.23287637
  DOI not found: 10.1101/2023.03.23.23287637


Processing DOIs:  64%|██████▍   | 40447/63018 [3:31:31<2:01:03,  3.11it/s]

HTTP error occurred for DOI: 10.1101/2023.03.23.23287628: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.03.23.23287628
  DOI not found: 10.1101/2023.03.23.23287628


Processing DOIs:  64%|██████▍   | 40493/63018 [3:31:46<2:03:44,  3.03it/s]

HTTP error occurred for DOI: 10.1101/2023.03.27.23287778: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.03.27.23287778
  DOI not found: 10.1101/2023.03.27.23287778


Processing DOIs:  72%|███████▏  | 45119/63018 [3:56:06<1:30:37,  3.29it/s]

HTTP error occurred for DOI: 10.1101/2023.08.26.23294679: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.08.26.23294679
  DOI not found: 10.1101/2023.08.26.23294679


Processing DOIs:  76%|███████▌  | 47843/63018 [4:10:20<1:22:52,  3.05it/s]

HTTP error occurred for DOI: 10.1101/2023.11.18.23298724: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.11.18.23298724
  DOI not found: 10.1101/2023.11.18.23298724


Processing DOIs:  78%|███████▊  | 49056/63018 [4:16:40<1:12:53,  3.19it/s]

HTTP error occurred for DOI: 10.1101/2023.12.28.23300552: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2023.12.28.23300552
  DOI not found: 10.1101/2023.12.28.23300552


Processing DOIs:  82%|████████▏ | 51960/63018 [4:31:47<56:30,  3.26it/s]  

HTTP error occurred for DOI: 10.1101/2024.03.22.24304763: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.03.22.24304763
  DOI not found: 10.1101/2024.03.22.24304763


Processing DOIs:  82%|████████▏ | 51986/63018 [4:31:55<59:04,  3.11it/s]

HTTP error occurred for DOI: 10.1101/2024.03.26.24304926: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.03.26.24304926
  DOI not found: 10.1101/2024.03.26.24304926


Processing DOIs:  83%|████████▎ | 52609/63018 [4:35:10<53:15,  3.26it/s]  

HTTP error occurred for DOI: 10.1101/2024.04.15.24305820: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.04.15.24305820
  DOI not found: 10.1101/2024.04.15.24305820


Processing DOIs:  86%|████████▌ | 53983/63018 [4:42:21<49:29,  3.04it/s]  

HTTP error occurred for DOI: 10.1101/2024.05.23.24307808: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.05.23.24307808
  DOI not found: 10.1101/2024.05.23.24307808


Processing DOIs:  86%|████████▌ | 54106/63018 [4:43:00<45:29,  3.27it/s]

HTTP error occurred for DOI: 10.1101/2024.05.24.24307909: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.05.24.24307909
  DOI not found: 10.1101/2024.05.24.24307909


Processing DOIs:  88%|████████▊ | 55620/63018 [4:50:52<37:53,  3.25it/s]  

HTTP error occurred for DOI: 10.1101/2024.07.10.24310215: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.07.10.24310215
  DOI not found: 10.1101/2024.07.10.24310215


Processing DOIs:  90%|█████████ | 57002/63018 [4:58:06<30:57,  3.24it/s]  

HTTP error occurred for DOI: 10.1101/2024.08.20.24312307: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.08.20.24312307
  DOI not found: 10.1101/2024.08.20.24312307


Processing DOIs:  90%|█████████ | 57018/63018 [4:58:11<30:53,  3.24it/s]

HTTP error occurred for DOI: 10.1101/2024.08.21.24312350: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.08.21.24312350
  DOI not found: 10.1101/2024.08.21.24312350


Processing DOIs:  91%|█████████ | 57034/63018 [4:58:16<31:05,  3.21it/s]

HTTP error occurred for DOI: 10.1101/2024.08.20.24312284: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.08.20.24312284
  DOI not found: 10.1101/2024.08.20.24312284


Processing DOIs:  92%|█████████▏| 57960/63018 [5:03:05<25:54,  3.25it/s]  

HTTP error occurred for DOI: 10.1101/2024.09.15.24313716: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.09.15.24313716
  DOI not found: 10.1101/2024.09.15.24313716


Processing DOIs:  92%|█████████▏| 57971/63018 [5:03:08<25:52,  3.25it/s]

HTTP error occurred for DOI: 10.1101/2024.09.16.24313749: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.09.16.24313749
  DOI not found: 10.1101/2024.09.16.24313749


Processing DOIs:  92%|█████████▏| 57977/63018 [5:03:10<25:51,  3.25it/s]

HTTP error occurred for DOI: 10.1101/2024.09.16.24312391: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.09.16.24312391
  DOI not found: 10.1101/2024.09.16.24312391


Processing DOIs:  93%|█████████▎| 58865/63018 [5:07:47<21:09,  3.27it/s]

HTTP error occurred for DOI: 10.1101/2024.10.07.24314689: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.10.07.24314689
  DOI not found: 10.1101/2024.10.07.24314689


Processing DOIs:  97%|█████████▋| 61344/63018 [5:20:39<08:27,  3.30it/s]

HTTP error occurred for DOI: 10.1101/2024.12.10.24318751: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.12.10.24318751
  DOI not found: 10.1101/2024.12.10.24318751


Processing DOIs:  98%|█████████▊| 61510/63018 [5:21:31<07:41,  3.26it/s]

HTTP error occurred for DOI: 10.1101/2024.12.16.24319076: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.12.16.24319076
  DOI not found: 10.1101/2024.12.16.24319076


Processing DOIs:  98%|█████████▊| 61587/63018 [5:21:54<07:19,  3.26it/s]

HTTP error occurred for DOI: 10.1101/2024.12.17.24319140: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1101/2024.12.17.24319140
  DOI not found: 10.1101/2024.12.17.24319140


Processing DOIs: 100%|██████████| 63018/63018 [5:29:20<00:00,  3.19it/s]


medRxiv DataFrame with citation counts saved as CSV.


In [None]:
# Attach Crossref citation counts to the de-duplicated bioRxiv frame, then write the
# enriched table to disk without the index column.
final_bio_df = add_citation_counts_to_df(bio_df)
final_bio_df.to_csv(path_or_buf="biorxiv_citations.csv", index=False)

print("bioRxiv DataFrame with citation counts saved as CSV.")

In [2]:
import pandas as pd

# Load the citation table, coerce 'citation_number' to numeric (anything unparseable
# becomes NaN), discard rows left without a number, and store the counts as integers.
df = (
    pd.read_csv("medrxiv_citations.csv")
    .assign(citation_number=lambda frame: pd.to_numeric(frame["citation_number"], errors="coerce"))
    .dropna(subset=["citation_number"])
    .astype({"citation_number": int})
)
df.to_csv("medrxiv_data.csv", index=False)

print("Rows with empty or non-numeric 'citation_number' have been removed.")

Rows with empty or non-numeric 'citation_number' have been removed.


In [4]:
# Load the bioRxiv table, coerce 'citation_number' to numeric (anything unparseable
# becomes NaN), discard rows left without a number, and store the counts as integers.
# NOTE(review): unlike the medRxiv cell (medrxiv_citations.csv -> medrxiv_data.csv),
# this reads and overwrites the same file, biorxiv_data.csv — confirm that is intended
# rather than reading biorxiv_citations.csv.
df = (
    pd.read_csv("biorxiv_data.csv")
    .assign(citation_number=lambda frame: pd.to_numeric(frame["citation_number"], errors="coerce"))
    .dropna(subset=["citation_number"])
    .astype({"citation_number": int})
)
df.to_csv("biorxiv_data.csv", index=False)

print("Rows with empty or non-numeric 'citation_number' have been removed.")

Rows with empty or non-numeric 'citation_number' have been removed.
Rows with empty or non-numeric 'citation_number' have been removed.
Rows with empty or non-numeric 'citation_number' have been removed.
Rows with empty or non-numeric 'citation_number' have been removed.


In [6]:
# Preview the leading rows of whatever frame `df` currently refers to.
df.head()

Unnamed: 0,title,doi,authors,abstract,date,journal,citation_number
0,Cannabidiol administration reduces the express...,10.1101/2023.07.10.548420,"Machado, J. P.; Almeida, V.; Zuardi, A. W.; Ha...","BackgroundCannabidiol (CBD), one of the main c...",2023-07-11,,1.0
1,Comammox bacterial preference for urea influen...,10.1101/2023.07.11.548560,"Vilardi, K. J.; Johnston, J.; Dai, Z.; Cotto, ...",While the co-existence of comammox bacteria wi...,2023-07-11,,0.0
2,The distribution of fitness effects of plasmid...,10.1101/2023.07.11.548518,"Fernandez-Calvet, A.; Toribio-Celestino, L.; A...",Antimicrobial resistance (AMR) in bacteria is ...,2023-07-11,,0.0
3,Emiliania huxleyi virus arrests host calcifica...,10.1101/2023.07.11.548577,"Dikstein, T.; Antler, G.; Pellerin, A.; Sharon...",Blooms of the coccolithophore Emiliania huxley...,2023-07-11,,0.0
4,Amyloid β induces cardiac dysfunction and neur...,10.1101/2023.07.11.548558,"Elia, A.; Parodi-Rullan, R. M.; Vazquez-Torres...",Aims: Alzheimers disease (AD) is a complex neu...,2023-07-11,,0.0


In [7]:
# Row count of the current `df`.
len(df)

66608

In [8]:
# Re-read the cleaned bioRxiv table from disk (rebinding `df`) and show its row count.
df = pd.read_csv("biorxiv_data.csv")
len(df)

266432

In [9]:
# Pull out the 'date' column as its own Series.
# NOTE(review): `dates` is not used by any other cell visible in this notebook.
dates = df["date"]

In [10]:
# Derive the year as the first four characters of the date rendered as text, then
# tally rows per year and list the tallies in ascending year order.
date_text = df['date'].astype(str)
df['year'] = date_text.str.slice(stop=4)
year_tally = df['year'].value_counts()
papers_per_year = year_tally.sort_index()
print(papers_per_year)

2013      109
2014      886
2015     1773
2016     4718
2017    11338
2018    20778
2019    29179
2020    38711
2021    36819
2022    35742
2023    39146
2024    43431
2025     3802
Name: year, dtype: int64


In [11]:
import requests
import pandas as pd
import time

# Draw a fixed random sample of 100 DOIs from `df` to trial the PubMed lookup on.
dois = df['doi'].sample(n=100, random_state=42)  # Use random_state for reproducibility

# API Base URL (NCBI E-utilities esearch endpoint)
API_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

# Function to check DOI in PubMed
def check_doi_in_pubmed(doi, timeout=10):
    """
    Look up a DOI in PubMed through the esearch endpoint stored in ``API_URL``.

    Args:
        doi (str): DOI to search for; sent as the term ``"<doi>[DOI]"``.
        timeout (float or tuple): Timeout in seconds for the HTTP request, so a stalled
            connection cannot block the calling loop indefinitely.

    Returns:
        str or None: The first PMID in the response's ``esearchresult.idlist``, or
        None when the list is empty/absent or the HTTP status is not 200.

    Raises:
        requests.exceptions.RequestException: Network-level failures (including the
            timeout) are not caught here and propagate to the caller.
    """
    params = {
        "db": "pubmed",
        "term": f"{doi}[DOI]",
        "retmode": "json"
    }
    response = requests.get(API_URL, params=params, timeout=timeout)
    if response.status_code == 200:
        data = response.json()
        pmids = data.get("esearchresult", {}).get("idlist", [])
        return pmids[0] if pmids else None
    return None

# Process all DOIs
results = []

# Report progress about ten times over the run. A fixed interval of 1000 never
# triggered for the 100-DOI sample used here, so the interval scales with the input.
progress_every = max(1, len(dois) // 10)

for i, doi in enumerate(dois):
    pmid = check_doi_in_pubmed(doi)
    results.append({"DOI": doi, "PMID": pmid})

    if (i + 1) % progress_every == 0:
        print(f"Checked {i + 1}/{len(dois)} DOIs...")

    # Respect NCBI rate limits
    time.sleep(0.3)  # Adjust if necessary

# Collect the lookups into a DataFrame (nothing is written to disk in this cell)
pmid_df = pd.DataFrame(results)

In [15]:
import pandas as pd

# Load the cleaned per-server tables
df_biorxiv = pd.read_csv('biorxiv_data.csv')
df_medrxiv = pd.read_csv('medrxiv_data.csv')

# Stack bioRxiv on top of medRxiv with a fresh 0..n-1 index and persist the result
df_combined = pd.concat([df_biorxiv, df_medrxiv], ignore_index=True)
df_combined.to_csv('biomedrxiv_data.csv', index=False)
print("Combined CSV file saved as 'biomedrxiv_data.csv'.")

# Parse 'date' into datetimes; values that cannot be parsed become NaT
df_combined['date'] = pd.to_datetime(df_combined['date'], errors='coerce')

# Keep rows dated strictly later than 2020-06-01 (that day itself and NaT rows are dropped).
# NOTE(review): if the intent is "after June 2020", the cutoff would need to be
# 2020-06-30 rather than 2020-06-01 — confirm which is meant.
cutoff_date = pd.Timestamp('2020-06-01')
is_after_cutoff = df_combined['date'].gt(cutoff_date)
df_filtered = df_combined.loc[is_after_cutoff]

# Persist the filtered subset
df_filtered.to_csv('filtered_biomedrxiv_data.csv', index=False)
print("Filtered CSV file saved as 'filtered_biomedrxiv_data.csv'.")

Combined CSV file saved as 'biomedrxiv_data.csv'.
Filtered CSV file saved as 'filtered_biomedrxiv_data.csv'.


In [16]:
# Read the date-filtered bioRxiv + medRxiv table back from disk.
df_biomedrxiv = pd.read_csv('filtered_biomedrxiv_data.csv')

In [18]:
import time
import requests
import pandas as pd
import nest_asyncio
nest_asyncio.apply()  # Patch the running event loop to allow nested use
import aiohttp         # <-- Make sure this is imported

from aiolimiter import AsyncLimiter
import asyncio
from tqdm import tqdm  # regular tqdm for synchronous iteration
from tqdm.asyncio import tqdm_asyncio  # provides asynchronous progress bars

# NCBI E-utilities API key, read from the NCBI_API_KEY environment variable so the
# secret is not stored in the notebook.
# NOTE(review): a literal key used to be hardcoded in this cell; treat that key as
# leaked and rotate it.
# When the variable is unset API_KEY is None; requests leaves None-valued entries out
# of the query string, so lookups still work, just without a key.
import os

API_KEY = os.environ.get("NCBI_API_KEY")

# API Base URL (NCBI E-utilities esearch endpoint)
API_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

# List of DOIs to check
dois = df_biomedrxiv["doi"].tolist()

def check_doi_in_pubmed(doi, retries=3, retry_delay=2):
    """
    Look up a DOI in PubMed via the esearch endpoint in ``API_URL``, retrying on
    rate limiting (HTTP 429) and on request errors.

    Args:
        doi (str): DOI to search for; sent as the term ``"<doi>[DOI]"``.
        retries (int): Maximum number of attempts for this DOI.
        retry_delay (float): Seconds to wait between two attempts.

    Returns:
        tuple: ``(doi, pmid)`` where ``pmid`` is the first entry of the response's
        ``esearchresult.idlist``, or None when nothing was found, a non-retryable
        HTTP status was returned, or every attempt failed.
    """
    params = {
        "db": "pubmed",
        "term": f"{doi}[DOI]",
        "retmode": "json",
        "api_key": API_KEY  # Use API key for faster lookups
    }
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(API_URL, params=params, timeout=5)
            if response.status_code == 200:
                data = response.json()
                pmids = data.get("esearchresult", {}).get("idlist", [])
                return doi, pmids[0] if pmids else None
            if response.status_code != 429:
                # Any other status is treated as final: report it and give up on this DOI.
                print(f"Error {response.status_code} for DOI: {doi}")
                return doi, None
            print(f"Rate limit hit for DOI {doi}, sleeping for {retry_delay} seconds...")
        except requests.exceptions.RequestException as e:
            print(f"Request error for DOI {doi}: {e}")
        except ValueError as e:
            # A 200 response whose body is not valid JSON; retried like a request error
            # instead of escaping and aborting the whole batch.
            print(f"Invalid JSON for DOI {doi}: {e}")
        # Wait exactly once between attempts, and not at all after the last one
        # (previously a 429 slept twice and the final failure still slept).
        if attempt < retries:
            time.sleep(retry_delay)
    return doi, None

# Look up every DOI one after another, collecting one {"DOI", "PMID"} record per lookup.
results = []
start_time = time.time()

for current_doi in tqdm(dois, desc="Processing DOIs"):
    checked_doi, matched_pmid = check_doi_in_pubmed(current_doi)
    results.append(dict(DOI=checked_doi, PMID=matched_pmid))

    # Fixed 0.1 s pause after each lookup, keeping the loop at or below 10 lookups per second.
    time.sleep(0.1)

# Write all collected records to disk in one go
df = pd.DataFrame(results)
df.to_csv("pubmed_results.csv", index=False)

end_time = time.time()
print(f"Finished checking {len(dois)} DOIs in {end_time - start_time:.2f} seconds.")
print("Results saved in 'pubmed_results.csv'.")

# async def check_doi_in_pubmed(session, doi, limiter):
#     params = {
#         "db": "pubmed",
#         "term": f"{doi}[DOI]",
#         "retmode": "json",
#         "api_key": API_KEY
#     }
#     retries = 3
#     while retries > 0:
#         async with limiter:
#             try:
#                 async with session.get(API_URL, params=params, timeout=5) as response:
#                     if response.status == 200:
#                         data = await response.json()
#                         pmids = data.get("esearchresult", {}).get("idlist", [])
#                         return {"DOI": doi, "PMID": pmids[0] if pmids else None}
#                     elif response.status == 429:
#                         print(f"Rate limit hit for DOI {doi}, sleeping for 2 seconds...")
#                         await asyncio.sleep(2)
#                     else:
#                         print(f"Error {response.status} for DOI: {doi}")
#                         return {"DOI": doi, "PMID": None}
#             except Exception as e:
#                 print(f"Request error for DOI {doi}: {e}")
#         retries -= 1
#         await asyncio.sleep(2)
#     return {"DOI": doi, "PMID": None}

# async def main():
#     limiter = AsyncLimiter(max_rate=5, time_period=1)
#     results = []
    
#     async with aiohttp.ClientSession() as session:
#         # Create a list of coroutine tasks—one for each DOI
#         tasks = [check_doi_in_pubmed(session, doi, limiter) for doi in dois]
        
#         # Use a regular for loop with tqdm to show progress.
#         # asyncio.as_completed returns a generator of futures (not an async iterable)
#         for future in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Processing DOIs"):
#             result = await future
#             results.append(result)
    
#     df = pd.DataFrame(results)
#     df.to_csv("pubmed_results.csv", index=False)
#     print("Results saved in 'pubmed_results.csv'.")

# if __name__ == "__main__":
#     asyncio.run(main())

Processing DOIs:   0%|          | 438/238703 [04:05<37:05:00,  1.78it/s]


KeyboardInterrupt: 

In [20]:
# Number of DOIs queued for the PubMed lookup.
len(dois)

238703

In [21]:
# Half of the DOI count reported by the previous cell (238703).
# NOTE(review): scratch arithmetic with a hardcoded number — presumably for splitting the run; confirm or remove.
238703/2

119351.5

In [10]:
import pandas as pd

# Read the iCite results and the filtered preprint table; the preprint table's 'doi'
# column is renamed to 'DOI' on load so both frames share the merge key.
icite_df = pd.read_csv('icite_results.csv', sep=',')
biomedrxiv_df = pd.read_csv('filtered_biomedrxiv_data.csv', sep=',').rename(columns={'doi': 'DOI'})

# Diagnostics: column layout and key dtype of both inputs before merging
print("Columns in icite_df:", icite_df.columns)
print("Columns in biomedrxiv_df:", biomedrxiv_df.columns)
print("Data type of DOI in icite_df:", icite_df['DOI'].dtype)
print("Data type of DOI in biomedrxiv_df:", biomedrxiv_df['DOI'].dtype)

# Left join: every iCite row is kept, preprint columns are attached where the DOI matches
final_df = icite_df.merge(biomedrxiv_df, on='DOI', how='left')

# Persist the merged table
final_df.to_csv('final_data.csv', index=False)

print("Merged data saved to final_data.csv")

Columns in icite_df: Index(['DOI', 'PMID', 'pmid', 'year', 'title', 'authors', 'journal',
       'is_research_article', 'relative_citation_ratio', 'nih_percentile',
       'human', 'animal', 'molecular_cellular', 'apt', 'is_clinical',
       'citation_count', 'citations_per_year', 'expected_citations_per_year',
       'field_citation_rate', 'provisional', 'x_coord', 'y_coord',
       'cited_by_clin', 'cited_by', 'references', 'doi', 'last_modified'],
      dtype='object')
Columns in biomedrxiv_df: Index(['title', 'DOI', 'authors', 'abstract', 'date', 'journal',
       'citation_number'],
      dtype='object')
Data type of DOI in icite_df: object
Data type of DOI in biomedrxiv_df: object
Merged data saved to final_data.csv


In [11]:
# Row count of the merged iCite + preprint table.
len(final_df)

29462

In [12]:
# Drop columns not carried forward: the '_y' copies of title/authors/journal (the
# right-hand, preprint-table side of the merge) plus several iCite columns.
# NOTE(review): 'pmid' and 'doi' look like lowercase duplicates of 'PMID'/'DOI' — confirm.
processed_df = final_df.drop(columns=["pmid", "year", "doi", "last_modified", "title_y", "authors_y", "journal_y", 'provisional', 'x_coord', 'y_coord'])
len(processed_df)

29462

In [13]:
# Keep only rows whose 'is_research_article' value is the string 'Yes', then show the
# remaining row count. Rebinds processed_df, so re-running after the column is dropped
# in the next cell will fail.
processed_df = processed_df[processed_df['is_research_article'] == 'Yes']
len(processed_df)

29459

In [15]:
# Remove the 'cited_by_clin', 'cited_by' and 'references' columns, and the
# 'is_research_article' flag that the previous cell already filtered on; then preview.
processed_df = processed_df.drop(columns=["cited_by_clin", "cited_by", "references", "is_research_article"])
processed_df.head()

Unnamed: 0,DOI,PMID,title_x,authors_x,journal_x,relative_citation_ratio,nih_percentile,human,animal,molecular_cellular,apt,is_clinical,citation_count,citations_per_year,expected_citations_per_year,field_citation_rate,abstract,date,citation_number
0,10.1101/2020.10.19.343954,33106802,"Single cell resolution of SARS-CoV-2 tropism, ...","Jessica K Fiege, Joshua M Thiede, Hezkiel Nand...",bioRxiv,0.03,2.3,0.25,0.0,0.75,0.05,No,1,0.2,6.008877,15.387986,The human airway epithelium is the initial sit...,2020-10-19,2.0
1,10.1101/2020.10.23.344085,33106810,Sterilizing Immunity against SARS-CoV-2 Infect...,"Sonia Jangra, Jana De Vrieze, Angela Choi, Rav...",bioRxiv,0.2,10.1,0.25,0.25,0.5,0.05,No,3,0.6,2.998857,7.35594,The search for vaccines that protect from seve...,2020-10-23,5.0
2,10.1101/2020.10.30.362749,33140043,Mechanism of ligand recognition by human ACE2 ...,"Apurba Bhattarai, Shristi Pawnikar, Yinglong Miao",bioRxiv,0.13,6.3,0.17,0.0,0.83,0.05,No,2,0.4,3.158375,7.781605,Angiotensin converting enzyme 2 (ACE2) plays a...,2020-11-01,2.0
3,10.1101/2020.10.27.358259,33140044,SARS-CoV-2 desensitizes host cells to interfer...,"Da-Yuan Chen, Nazimuddin Khan, Brianna J Close...",bioRxiv,0.61,32.7,0.29,0.0,0.71,0.25,No,12,2.4,3.966928,9.939177,"SARS-CoV-2 can infect multiple organs, includi...",2020-10-28,15.0
4,10.1101/2020.10.26.356048,33140045,ISG15-dependent Activation of the RNA Sensor M...,"GuanQun Liu, Jung-Hyun Lee, Zachary M Parker, ...",bioRxiv,0.73,38.7,0.0,0.0,1.0,0.05,No,14,2.8,3.858002,9.648515,"Activation of the RIG-I-like receptors, RIG-I ...",2020-10-27,15.0


In [16]:
# Row count after the column drops (dropping columns does not change the number of rows).
len(processed_df)

29459

In [17]:
# Keep only rows that have a non-missing 'relative_citation_ratio', then show how many remain.
rcr_df = processed_df[processed_df['relative_citation_ratio'].notna()]
len(rcr_df)

13437

In [18]:
# Write the rows that have a relative citation ratio to disk, without the index column.
rcr_df.to_csv('final_rcr_data.csv', index=False)