### Set NCBI API key (recommended)

In [None]:
# `!export ...` runs in a throwaway subshell, so the variable would be gone
# before the next cell executes. Setting it on the kernel's own environment
# makes it visible to every later `!esearch` / `!efetch` shell command.
import os

os.environ["NCBI_API_KEY"] = " "  # enter your free API key (create one at https://account.ncbi.nlm.nih.gov/settings/)

### Email address for Entrez

In [None]:
# Entrez is imported here because this cell runs before the Imports cell;
# without it, executing the notebook top-to-bottom fails with a NameError.
from Bio import Entrez

Entrez.email = ' '  # enter email for entrez

### Imports

In [None]:
import pandas as pd
import metapub
import matplotlib.pyplot as plt
from metapub import PubMedFetcher
import time
import re
from Bio import Entrez
import pyalex
from pyalex import Works, Topics, config

# PubMed search query:
- "NMR" OR "Nuclear Magnetic Resonance" OR "N.M.R" in the title or abstract text
- no preprints
- English language
- published between 2000/01/01 and 2024/12/31

### Count pubmed articles matching query

In [None]:
!esearch -db pubmed -query '("NMR"[Title/Abstract] OR "nuclear magnetic resonance"[Title/Abstract] OR "N.M.R"[Title/Abstract]) AND (("all"[Filter] NOT "preprint"[Publication Type]) AND "english"[Language] AND 2000/01/01:2024/12/31[Date - Publication])' | xtract -pattern ENTREZ_DIRECT -element Count

### Save PMIDs of all matched entries to a text file (may take a few minutes)

In [None]:
!esearch -db pubmed -query '("NMR"[Title/Abstract] OR "nuclear magnetic resonance"[Title/Abstract] OR "N.M.R"[Title/Abstract]) AND (("all"[Filter] NOT "preprint"[Publication Type]) AND "english"[Language] AND 2000/01/01:2024/12/31[Date - Publication])' | efetch -format uid > pmids.txt

In [None]:
# Load the PMIDs saved in pmids.txt (one id per line).
with open('./pmids.txt', 'r') as file:
    # Strip whitespace and skip blank lines so no empty id is sent to the APIs.
    pmids = [line.strip() for line in file if line.strip()]

print('There are {} pmids'.format(len(pmids)))

In [None]:
def _pubmed_year(pub_date):
    """Return the publication year (int) from a parsed PubMed ``PubDate`` element.

    Uses the ``Year`` field when present; otherwise falls back to the first
    four-digit number found in ``MedlineDate``.

    Raises:
        ValueError: if neither field yields a year.
    """
    year = pub_date.get('Year')
    if year is not None:
        return int(year)
    match = re.search(r'\d{4}', str(pub_date.get('MedlineDate', '')))
    if match is None:
        raise ValueError('no publication year in PubDate: {!r}'.format(pub_date))
    return int(match.group(0))


def _pubmed_record(article):
    """Flatten one parsed ``PubmedArticle`` into the row dict stored per PMID."""
    citation = article['MedlineCitation']
    journal = citation['Article']['Journal']
    return {'pmid': str(citation['PMID']),
            'year_pubmed': _pubmed_year(journal['JournalIssue']['PubDate']),
            'journal_title': journal['Title'],
            'title_pubmed': citation['Article']['ArticleTitle']}


def fetch_pmid_records(pmids, batch_size=200, delay=0.5):
    """Fetch basic PubMed metadata for a list of PMIDs in batches.

    Args:
        pmids: sequence of PMID strings.
        batch_size: number of PMIDs sent per ``Entrez.efetch`` request.
        delay: seconds to sleep after every request (successful or not).

    Returns:
        A list of dicts with keys ``pmid``, ``year_pubmed``, ``journal_title``
        and ``title_pubmed``. Batches whose request fails, and individual
        articles that cannot be parsed, are reported on stdout and skipped
        instead of being dropped silently.
    """
    records = []

    for start in range(0, len(pmids), batch_size):
        print(start)  # progress: offset of the current batch
        batch_pmids = pmids[start:start + batch_size]
        try:
            handle = Entrez.efetch(db="pubmed", id=batch_pmids, retmode="xml")
            try:
                data = Entrez.read(handle)
            finally:
                # Close the handle even when parsing the response fails.
                handle.close()
            articles = data['PubmedArticle']
        except Exception as exc:
            # Best effort: keep going, but say which batch was lost.
            # (KeyboardInterrupt is not caught, so the run can be stopped.)
            print('Batch starting at {} failed and was skipped: {!r}'.format(start, exc))
            time.sleep(delay)
            continue

        for article in articles:
            # Parse article by article so one malformed entry does not discard
            # the rest of its batch.
            try:
                records.append(_pubmed_record(article))
            except (KeyError, TypeError, ValueError) as exc:
                print('Skipped an article in batch starting at {}: {!r}'.format(start, exc))
        time.sleep(delay)  # Respect NCBI's rate limits
    return records

### The line below runs the batched PubMed query for all stored PMIDs (around 200k records); it may take 1-2 h to finish.

In [None]:
pubmed_data = fetch_pmid_records(pmids)

### Convert pubmed data to dataframe

In [None]:
pubmed_df = pd.DataFrame(pubmed_data)

In [None]:
pubmed_df.head()

### Write fetched PubMed data to disk just in case (checkpoint)

In [None]:
pubmed_df.to_pickle('df_pubmed.pkl')

# Update PubMed dataframe with OpenAlex data

In [None]:
pubmed_df = pd.read_pickle('./df_pubmed.pkl')

In [None]:
pubmed_df.shape

In [None]:
pubmed_df.columns

### PyAlex config

In [None]:
# PyAlex HTTP retry settings.
# NOTE(review): with max_retries = 0 the backoff factor and retry status codes
# below presumably never take effect — confirm against the PyAlex docs and
# raise max_retries if transient 429/5xx responses should be retried.
config.max_retries = 0
config.retry_backoff_factor = 0.1
config.retry_http_codes = [429, 500, 503]

In [None]:
def fetch_openalex(pmids, batch_size=100, delay=0.25):
    """Fetch OpenAlex work records for a list of PMIDs in batches.

    Each batch is sent as a single ``ids.pmid`` filter whose values are joined
    with ``|``, and all result pages are collected.

    Args:
        pmids: sequence of PMIDs (converted to ``str`` before being joined).
        batch_size: number of PMIDs per OpenAlex request.
        delay: seconds to sleep after every batch (successful or not).

    Returns:
        A list of the work records returned by OpenAlex, restricted to the
        selected fields. Batches that fail are reported on stdout and skipped;
        pages received before the failure are kept.
    """
    all_data = []
    fields = ['ids', 'cited_by_count', 'authorships', 'open_access', 'referenced_works']

    for start in range(0, len(pmids), batch_size):
        batch_no = start // batch_size
        if batch_no % 100 == 0:
            print(batch_no)  # progress: every 100th batch
        batch_pmids = pmids[start:start + batch_size]
        pmid_filter = {"ids": {"pmid": "|".join(str(p) for p in batch_pmids)}}
        try:
            pages = Works().filter(**pmid_filter)\
                   .select(fields).paginate(per_page=100, n_max=None)
            for page in pages:
                all_data.extend(page)
        except Exception as exc:
            # Best effort: keep going, but say which batch was lost.
            # (KeyboardInterrupt is not caught, so the run can be stopped.)
            print('OpenAlex batch {} (offset {}) failed and was skipped: {!r}'.format(
                batch_no, start, exc))
        time.sleep(delay)  # to not overload openalex
    return all_data

### The line below runs the batched OpenAlex query for all stored PMIDs (around 200k records); it may take 1-2 h to finish.

In [None]:
open_alex_data = fetch_openalex(list(pubmed_df.pmid.values))
df_openalex = pd.DataFrame(open_alex_data)
df_openalex.to_pickle('df_openalex_pmids.pkl') # save complementary open alex data just in case

In [None]:
df_openalex.head()

# Join pubmed and openalex dataframes based on PMID

In [None]:
df_pubmed = pd.read_pickle('./df_pubmed.pkl')
df_openalex = pd.read_pickle('./df_openalex_pmids.pkl')

In [None]:
print('Pubmed dataframe shape: {}'.format(df_pubmed.shape))
print('Pubmed dataframe columns: {}'.format(list(df_pubmed.columns)))
print('OpenAlex dataframe shape: {}'.format(df_openalex.shape))
print('OpenAlex dataframe columns: {}'.format(list(df_openalex.columns)))

### Create pmid column in openalex dataframe

In [None]:
def extr_pmid(ids):
    """Return the numeric PMID string from an OpenAlex ``ids`` mapping.

    The PMID is taken as the trailing run of digits (optionally followed by a
    slash) of ``ids['pmid']``.

    Args:
        ids: mapping expected to hold a ``'pmid'`` string; anything else
            (missing key, ``None``, NaN, non-string value) is tolerated.

    Returns:
        The PMID as a string, or ``None`` when it cannot be extracted.
    """
    pmid_pattern = r'/(\d+)/?$'
    try:
        match = re.search(pmid_pattern, ids['pmid'])
    except (KeyError, TypeError):
        # KeyError: no 'pmid' entry; TypeError: ids is not subscriptable
        # (None/NaN) or the stored value is not a string.
        return None
    return match.group(1) if match else None

In [None]:
# Derive a plain PMID column from each row's OpenAlex `ids` entry.
df_openalex['pmid'] = df_openalex['ids'].apply(extr_pmid)

### Join dataframes

In [None]:
df_pubmed.set_index('pmid', inplace=True)
df_openalex.set_index('pmid', inplace=True)

In [None]:
df = df_pubmed.join(df_openalex, on='pmid')

In [None]:
df.head()

In [None]:
print('Joined dataframe shape: {}'.format(df.shape))
print('Joined dataframe columns: {}'.format(list(df.columns)))

### Drop rows where openalex did not find complementary data

In [None]:
# Rows with a null `ids` value are the ones dropped below.
missing_openalex = df.ids.isnull()
print('numer of rows to drop: {}'.format(sum(missing_openalex)))
df = df.dropna(subset=['ids'])
print('Final dataframe shape: {}'.format(df.shape))

In [None]:
df.reset_index(inplace=True)

In [None]:
df.head()

# Save combined PubMed and OpenAlex data

In [None]:
df.to_pickle('./df_pubmed_openalex_combined.pkl')