In [1]:
from turtle import pu
import requests
import time
import pandas as pd
import random

In [2]:
# Europe PMC REST search endpoint; search() sends every request to this URL.
BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"

In [23]:
def search(query, cursor_mark, page_size=1000, timeout=30):
    """Request one page of results from the Europe PMC search endpoint.

    Args:
        query: Search expression sent as the ``query`` parameter.
        cursor_mark: Value sent as ``cursorMark``. Callers in this notebook
            pass ``'*'`` for the first page and the ``nextCursorMark`` of the
            previous response afterwards.
        page_size: Number of results requested per page (``pageSize``).
            Defaults to 1000, the value that was previously hard-coded.
        timeout: Seconds ``requests`` waits for the server before raising
            ``requests.exceptions.Timeout``. Without it a stalled connection
            would block the paging loop indefinitely.

    Returns:
        The ``requests.Response`` object. The HTTP status is not checked
        here; callers inspect ``status_code`` themselves.
    """
    params = {
        'resultType': 'core',
        'format': 'json',
        'synonym': 'N',           # exact keyword
        'query': query,
        'pageSize': page_size,
        'cursorMark': cursor_mark,
    }

    return requests.get(BASE_URL, params=params, timeout=timeout)

In [24]:
# Despite its name, this is the search expression passed to search() as the
# `query` parameter. It is sent verbatim, including the line breaks.
# NOTE(review): HAS_FT:Y presumably restricts to records with full text and
# SRC:MED / SRC:PMC to those two sources - confirm against the Europe PMC
# search-syntax reference.
url_params = '''
(
   dictyostelium   
) 
AND (
    (HAS_FT:Y) AND 
    (SRC:MED OR SRC:PMC )
)
'''
# Accumulator for the fetch loop: one tuple per article (not just ids).
pubmed_ids = []
# Cursor sent with the first request; the fetch loop replaces it with the
# `nextCursorMark` of each response.
cursor_mark = '*'

In [19]:
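# Alternative query with more filters (publication-date range, extra sources, reviews excluded).
# Uncomment it to use it instead of the query defined above.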
# url_params = '''
# (
#    dictyostelium   
# ) 
# AND (
#     (FIRST_PDATE:[2000 TO 2025]) AND
#     (HAS_FT:Y) AND 
#     (((SRC:MED OR SRC:PMC OR SRC:AGR OR SRC:CBA) NOT (PUB_TYPE:"Review")))
# )
# '''

In [25]:
# Stop paging once at least this many records have been collected.
# Set to None to fetch every hit. The limit is checked between pages, so the
# final count can exceed it by up to one page.
MAX_RECORDS = 1000


def _build_record(result):
    """Flatten one Europe PMC result dict into the tuple stored in `pubmed_ids`.

    The field order is: pmcid, is_open_access, cited_by_count, language,
    pub_type_list, pub_year, journal, title, keywords. Fields that are absent
    from `result` do not raise: text fields become None, the open-access flag
    and citation count become 0, and the publication-type list becomes ''.
    """
    journal = None
    journal_details = (result.get('journalInfo') or {}).get('journal')
    if journal_details:
        # Medline abbreviation if available, otherwise journal title
        journal = journal_details.get(
            'medlineAbbreviation', journal_details.get('title')
        )

    pub_types = (result.get('pubTypeList') or {}).get('pubType') or []

    keyword_list = result.get('keywordList')
    if keyword_list is None:
        keywords = None
    else:
        # Individual keywords can be None; keep their slot as an empty string.
        keywords = ';'.join(
            '' if keyword is None else keyword
            for keyword in (keyword_list.get('keyword') or [])
        )

    return (
        result.get('pmcid'),
        int(result.get('isOpenAccess') == 'Y'),
        int(result.get('citedByCount') or 0),
        result.get('language'),
        ';'.join(pub_types),
        result.get('pubYear'),
        journal,
        result.get('title'),
        keywords,
    )


# `pubmed_ids` and `cursor_mark` are initialised in the previous cell, so
# re-running only this cell continues from the last cursor position.
while True:
    # Check the limit before issuing a request so no page is fetched in vain;
    # `>=` also stops when the count steps past the limit.
    if MAX_RECORDS is not None and len(pubmed_ids) >= MAX_RECORDS:
        break

    response = search(url_params, cursor_mark)

    if response.status_code != 200:
        print(response.text)
        break

    data = response.json()
    results = (data.get('resultList') or {}).get('result') or []
    print(data.get('hitCount'), len(results))

    if not results:
        break

    for result in results:
        if result.get('source') in ('MED', 'PMC'):
            pubmed_ids.append(_build_record(result))

    # Stop when the service returns no further cursor, or one that does not
    # advance (which would otherwise request the same page again).
    next_cursor_mark = data.get('nextCursorMark')
    if next_cursor_mark is None or next_cursor_mark == cursor_mark:
        break

    cursor_mark = next_cursor_mark

    # Random pause between pages to avoid hammering the service.
    time.sleep(random.uniform(0.5, 1.5))

# print(
#     f'Year: {year}, Number of articles saved: {len(pubmed_ids)} out of total {data["hitCount"]}'
# )

16027 1000


In [26]:
# Column names, in the same order as the fields of each tuple in `pubmed_ids`
# ('pmid' is intentionally not part of the record).
record_columns = [
    'pmcid',
    'is_open_access',
    'cited_by_count',
    'language',
    'pub_type_list',
    'pub_year',
    'journal',
    'title',
    'keywords',
]

# One row per collected article.
df = pd.DataFrame(data=pubmed_ids, columns=record_columns)

In [27]:
# Display the collected records as the cell output.
df

Unnamed: 0,pmcid,is_open_access,cited_by_count,language,pub_type_list,pub_year,journal,title,keywords
0,PMC12572271,1,0,eng,research-article;Journal Article,2025,Sci Rep,Dictyostelium exhibits PCB-induced impairment ...,Iron homeostasis;Dictyostelium;Cellular Toxici...
1,PMC12143694,1,0,eng,research-article;Journal Article,2025,Cell Adh Migr,Nhe1 is required for directional sensing in ve...,Chemotaxis;Dictyostelium;cell migration;Nhe1;E...
2,PMC12505270,1,1,eng,research-article;Journal Article,2025,Biol Open,Differential PaxillinB dynamics at Dictyosteli...,Dictyostelium discoideum;cell migration;Adhesi...
3,PMC12547847,1,0,eng,Historical Article;other;Interview,2025,Biol Open,First person - Julio Fierro Morales.,
4,PMC12362907,1,0,eng,Introductory Journal Article;Editorial,2025,BMC Mol Cell Biol,Cell biology of Dictyostelium.,
...,...,...,...,...,...,...,...,...,...
995,PMC11180820,1,5,eng,review-article;Review;Journal Article,2024,Front Cell Dev Biol,Methods and computational tools to study eukar...,Chemotaxis;Trajectory analysis;cell migration;...
996,PMC12415853,1,0,eng,research-article;Journal Article,2025,Synth Biol (Oxf),GoldenBraid2.0 &lt;i&gt;E. coli&lt;/i&gt;: a c...,Genetic engineering;Cloning;Molecular;Gram-neg...
997,PMC7192593,1,4,eng,"Research Support, Non-U.S. Gov't;research-arti...",2020,Nucleic Acids Res,"DIRS retrotransposons amplify via linear, sing...",
998,PMC6594445,1,16,eng,"Research Support, Non-U.S. Gov't;research-arti...",2019,Mol Biol Cell,Ate1-mediated posttranslational arginylation a...,


In [28]:
# Number of articles per publication year. dropna=False keeps a bucket for
# records that have no `pub_year`.
year_counts = df["pub_year"].value_counts(dropna=False)

# Turn the counts into a two-column frame and order it by year; the index
# still reflects the original (by-frequency) order of value_counts().
summary = year_counts.rename_axis("pub_year").reset_index(name="count")
summary = summary.sort_values("pub_year")

summary

Unnamed: 0,pub_year,count
8,2018,1
6,2019,37
5,2020,58
4,2021,111
3,2022,112
2,2023,142
1,2024,235
0,2025,303
7,2026,1


In [1]:
# Persist the article table next to the notebook; the row index is not written.
# (Plain string: the previous f-string had no placeholders.)
df.to_csv('yun-dicty.csv', index=False)

9767 1000


# The code below downloads each article in BioC JSON format and stores it on disk.

Each article is saved as `<pmcid>.json.gz` (gzip-compressed JSON) in the current working directory.

In [29]:
import requests
import time
import json 
import random
import os
import pandas as pd
import gzip


# Number of articles to download; set to None to download every PMC id in `df`.
MAX_DOWNLOADS = 5
# Seconds to wait for the BioC service before giving up on a single article.
REQUEST_TIMEOUT = 60

# The fetch loop stores `result.get('pmcid')`, which is None when a record has
# no PMC id. Such rows (and repeated ids) are dropped before applying the
# limit, so no request is made for the literal id "None".
pmcids = df['pmcid'].dropna().drop_duplicates().tolist()[:MAX_DOWNLOADS]

for pmcid in pmcids:
    url = f'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmcid}/unicode'
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # A network failure on one article should not abort the whole run.
        print(f'Request failed for {pmcid}: {exc}', flush=True)
    else:
        if response.status_code != 200:
            print(f'Error: {response.status_code}', flush=True)
        elif 'No record can be found for the input:' in response.text or 'No result can be found' in response.text:
            # The service reports a missing article in the body of a 200 response.
            print(f'No record for {pmcid}', flush=True)
        else:
            try:
                # Parse before opening the output file so that a malformed
                # payload does not leave an empty .json.gz behind.
                document = response.json()[0]
            except (ValueError, LookupError) as exc:
                print(f'Unexpected response for {pmcid}: {exc}', flush=True)
            else:
                # Explicit encoding keeps the output independent of the locale.
                with gzip.open(f'{pmcid}.json.gz', 'wt', encoding='utf-8') as f:
                    json.dump(document, f, indent=2)
                print(f'{pmcid} saved', flush=True)

    # Pause after every request, including failed or empty ones, so that a run
    # of misses does not hit the service without any delay.
    time.sleep(random.uniform(2, 5))


PMC12572271 saved
PMC12143694 saved
PMC12505270 saved
PMC12547847 saved
PMC12362907 saved
