In [None]:
import json
import os
import re
import warnings

import requests
from Bio import Entrez, Medline
from tqdm import tqdm

In [None]:
# Set up NCBI access.
# Entrez.email is the contact address Biopython sends along with E-utilities
# requests. Prefer the NCBI_EMAIL environment variable so a real address does
# not have to be saved in the notebook; fall back to the placeholder otherwise
# (replace it, or export NCBI_EMAIL, before running the cells below).
Entrez.email = os.environ.get("NCBI_EMAIL", "<NCBI-email>")

In [None]:
# Search PubMed for an arbitrary query and return the matching PMIDs
def search_pubmed(query, retmax=700):
    """Run a PubMed ESearch and return the list of matching PMIDs.

    Args:
        query: PubMed search expression, passed to ``Entrez.esearch`` as ``term``.
        retmax: Maximum number of PMIDs to request. The default of 700 was
            chosen by the original author because their query produced
            roughly 600 hits; ESearch returns only 20 ids when it is omitted.

    Returns:
        The ``IdList`` entry of the parsed ESearch result.

    Warns:
        UserWarning: when the search reports more hits (``Count``) than ids
            were returned, i.e. ``retmax`` truncated the result.
    """
    handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()

    id_list = record["IdList"]

    # "Count" is the total number of hits for the query, independent of retmax.
    total_hits = int(record.get("Count", len(id_list)))
    if total_hits > len(id_list):
        warnings.warn(
            f"PubMed reports {total_hits} hits but only {len(id_list)} ids were "
            f"returned; increase retmax (currently {retmax}) to retrieve them all."
        )
    return id_list

In [None]:
# Run the search and keep the resulting PMIDs for the fetch step.
# The query is single-quoted so the double-quoted phrases need no escaping.
drd2_query = 'DRD2 AND ("pharmacogenetics" OR "therapeutic use" OR "adverse effects" OR "drug response" OR "clinical trials")'
id_list = search_pubmed(drd2_query)
# Original author's note: this query got 628 hits.


In [None]:
# Fetch paper metadata (title, abstract, first author, publication date).
# Note: no PDFs are downloaded here; only MEDLINE records are retrieved.
def _paper_from_medline(record):
    """Turn one parsed MEDLINE record into an output dict, or None if incomplete."""
    # Bio.Medline returns FAU (full author names) as a list, one entry per author.
    authors = record.get("FAU") or []
    paper = {
        "pmid": record.get("PMID"),
        "title": record.get("TI"),
        "abstract": record.get("AB"),
        "first_author": authors[0] if authors else None,
        "date_published": record.get("DP"),
    }
    # Same policy as before: drop records lacking any of the required fields.
    if not all(paper.values()):
        return None
    # NOTE(review): standardize_month_name is not defined in the visible part of
    # this notebook; it must be defined before fetch_papers is called.
    paper["date_published"] = standardize_month_name(paper["date_published"])
    return paper


def fetch_papers(id_list, batch_size=200):
    """Fetch MEDLINE records for the given PMIDs and extract key metadata.

    Records are requested from PubMed in batches and parsed with
    ``Bio.Medline`` instead of regular expressions, so field values are not
    cut short when a title or abstract happens to contain an upper-case word
    followed by " -" (which the previous pattern treated as the next tag).

    Args:
        id_list: Iterable of PubMed ids (e.g. the result of ``search_pubmed``).
        batch_size: Number of PMIDs sent per ``efetch`` request. Values below
            1 are treated as 1.

    Returns:
        A list of dicts with the keys ``pmid``, ``title``, ``abstract``,
        ``first_author`` and ``date_published``. Records missing any of these
        fields are skipped.
    """
    pmids = [str(pmid) for pmid in id_list]
    step = max(1, int(batch_size))
    papers = []

    with tqdm(total=len(pmids), desc="Retrieving documents...") as progress:
        for start in range(0, len(pmids), step):
            batch = pmids[start:start + step]
            handle = Entrez.efetch(
                db="pubmed", id=",".join(batch), rettype="medline", retmode="text"
            )
            try:
                # Medline.parse is lazy: consume it fully before closing the handle.
                records = list(Medline.parse(handle))
            finally:
                handle.close()

            for record in records:
                paper = _paper_from_medline(record)
                if paper is not None:
                    papers.append(paper)
            progress.update(len(batch))

    return papers

In [None]:
# Retrieve the metadata records for every PMID returned by the search.
papers = fetch_papers(id_list) # took about 4 minutes in the original author's run

In [None]:
# Persist the collected paper records to disk as a single JSON document
output_path = 'drd2_papers.json'
serialized_papers = json.dumps(papers)
with open(output_path, 'w') as out_file:
    out_file.write(serialized_papers)