<a href="https://colab.research.google.com/github/espickle1/claude-agents/blob/main/utils/reference_pubmed_id_extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
# PMC Reference Extractor & Abstract Fetcher
# ============================================================
# Input: PMC ID only
# Output: Two markdown files ready for Claude digest
# ============================================================

import requests
import re
import xml.etree.ElementTree as ET
import time
from google.colab import files

# ============================================================
# ONLY INPUT NEEDED - Enter PMC ID here
# ============================================================
pmc_id = "PMC11152113"  # @param {type:"string"}
# ============================================================

# Local import: `html` is only needed in this step (to decode XML character
# references) and is not among the notebook's top-level imports.
import html

# Normalize ID: drop the "PMC" prefix in any letter case, then surrounding
# whitespace, so "PMC123", "pmc123" and " 123 " all yield "123".
pmc_id_clean = re.sub(r'pmc', '', pmc_id, flags=re.IGNORECASE).strip()

print(f"Processing PMC{pmc_id_clean}...\n")

# ----- STEP 1: Fetch PMC XML and extract source paper metadata -----
print("Step 1: Fetching source paper metadata...")
pmc_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id={pmc_id_clean}&rettype=xml"
# A timeout makes a stalled connection raise instead of hanging the cell forever.
pmc_response = requests.get(pmc_url, timeout=30)
pmc_response.raise_for_status()
pmc_xml = pmc_response.text

# Extract source paper metadata.
# The first <article-title> in the response is used as the paper's title.
title_match = re.search(r'<article-title[^>]*>(.*?)</article-title>', pmc_xml, re.DOTALL)
if title_match:
    # Strip inline markup first, THEN decode entities, so an escaped "&lt;"
    # in the title is not mistaken for a tag. Whitespace (including embedded
    # newlines) is collapsed so the title stays on one markdown line.
    title_text = re.sub(r'<[^>]+>', '', title_match.group(1))
    paper_title = " ".join(html.unescape(title_text).split())
else:
    paper_title = "Unknown"

# `[^>]*` on both sides of the pub-id-type attribute tolerates additional
# attributes on <article-id>, in any order.
pmid_match = re.search(
    r'<article-id\b[^>]*\bpub-id-type="pmid"[^>]*>\s*(\d+)\s*</article-id>', pmc_xml
)
paper_pmid = pmid_match.group(1) if pmid_match else "Unknown"

doi_match = re.search(
    r'<article-id\b[^>]*\bpub-id-type="doi"[^>]*>([^<]+)</article-id>', pmc_xml
)
# DOIs may legally contain characters that XML escapes (e.g. "&amp;").
paper_doi = html.unescape(doi_match.group(1)).strip() if doi_match else "Unknown"

print(f"   Title: {paper_title[:60]}...")
print(f"   PMID: {paper_pmid}")
print(f"   DOI: {paper_doi}\n")

# ----- STEP 2: Extract reference PMIDs from <ref-list> only -----
print("Step 2: Extracting reference PMIDs...")
# Greedy match on purpose: the span runs from the first <ref-list ...> to the
# LAST </ref-list>. A non-greedy match would stop at the first closing tag and
# silently drop references when ref-lists are nested or there are several.
# Restricting the search to this span keeps the source paper's own PMID
# (an <article-id> that appears earlier in the document) out of the result.
ref_list_match = re.search(r'<ref-list[^>]*>(.*)</ref-list>', pmc_xml, re.DOTALL)

if ref_list_match:
    ref_section = ref_list_match.group(1)
    # Tolerate extra attributes on <pub-id> and whitespace around the number.
    pmids = re.findall(
        r'<pub-id\b[^>]*\bpub-id-type="pmid"[^>]*>\s*(\d+)\s*</pub-id>',
        ref_section,
    )
    pmids = list(dict.fromkeys(pmids))  # dedupe, preserve order
else:
    pmids = []
    print("   WARNING: No <ref-list> section found")

print(f"   Found {len(pmids)} reference PMIDs\n")

# ----- STEP 3: Fetch abstracts and DOIs from PubMed -----
print(f"Step 3: Fetching {len(pmids)} abstracts from PubMed...")

def fetch_abstracts(pmid_list, batch_size=50):
    """Fetch metadata, abstracts, and DOIs from NCBI in batches.

    Args:
        pmid_list: PMIDs as strings; they are comma-joined into the request.
        batch_size: Maximum number of PMIDs sent per efetch request.

    Returns:
        A list of dicts, one per ``PubmedArticle`` element found in the
        responses, with keys ``"pmid"``, ``"title"``, ``"journal"``,
        ``"year"``, ``"doi"`` (``None`` when the record has no DOI) and
        ``"abstract"`` (truncated to 2000 characters). Returns ``[]`` without
        contacting the network when ``pmid_list`` is empty.

    Raises:
        requests.HTTPError: If a response has an error status.
        requests.Timeout: If a request takes longer than the timeout.
        xml.etree.ElementTree.ParseError: If a response is not well-formed XML.
    """

    def _flat_text(element):
        """Return all text inside *element*, including text in child tags."""
        if element is None:
            return ""
        return "".join(element.itertext()).strip()

    results = []

    for i in range(0, len(pmid_list), batch_size):
        batch = pmid_list[i:i+batch_size]
        pmid_str = ",".join(batch)

        url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={pmid_str}&rettype=xml"
        # A timeout makes a stalled connection raise instead of hanging forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        root = ET.fromstring(response.content)

        for article in root.findall(".//PubmedArticle"):
            pmid = article.findtext(".//PMID") or "Unknown"
            # Titles can contain inline markup (<i>, <sub>, ...); findtext()
            # would return only the text before the first child tag.
            title = _flat_text(article.find(".//ArticleTitle")) or "No title"
            journal = article.findtext(".//Journal/Title") or "Unknown journal"
            year = article.findtext(".//PubDate/Year") or article.findtext(".//PubDate/MedlineDate") or "N/A"

            # Get abstract (handle multiple AbstractText elements); inline
            # markup inside a section is flattened rather than truncating it.
            abstract_parts = article.findall(".//AbstractText")
            if abstract_parts:
                section_texts = (_flat_text(part) for part in abstract_parts)
                abstract = " ".join(text for text in section_texts if text)
            else:
                abstract = "No abstract available"

            # Get DOI. Only the record's own ArticleIdList is searched: a bare
            # ".//ArticleId" also matches the IDs of cited papers listed under
            # ReferenceList, which would attach the wrong DOI to a record that
            # has none of its own.
            doi = None
            for aid in article.findall("./PubmedData/ArticleIdList/ArticleId"):
                if aid.get("IdType") == "doi":
                    doi = (aid.text or "").strip() or None
                    break
            if doi is None:
                # Fall back to the DOI carried on the Article element itself.
                doi = (article.findtext(".//Article/ELocationID[@EIdType='doi']") or "").strip() or None

            results.append({
                "pmid": pmid,
                "title": title,
                "journal": journal,
                "year": year,
                "doi": doi,
                "abstract": abstract[:2000]  # truncate very long abstracts
            })

        print(f"   Fetched {min(i+batch_size, len(pmid_list))}/{len(pmid_list)}")
        time.sleep(0.34)  # NCBI rate limit: 3 requests/sec

    return results

articles = fetch_abstracts(pmids)
print(f"   Retrieved {len(articles)} articles\n")

# ----- STEP 4: Generate OUTPUT FILE 1 - PMID List -----
print("Step 4: Generating output files...")

# Only hyperlink identifiers that were actually found; the "Unknown"
# placeholder would otherwise be rendered as a dead link.
source_pmid_link = (
    f"[{paper_pmid}](https://pubmed.ncbi.nlm.nih.gov/{paper_pmid}/)"
    if paper_pmid != "Unknown" else "Unknown"
)
source_doi_link = (
    f"[{paper_doi}](https://doi.org/{paper_doi})"
    if paper_doi != "Unknown" else "Unknown"
)

pmid_md = f"""# Reference PMIDs

## Source Paper
- **Title:** {paper_title}
- **PMCID:** PMC{pmc_id_clean}
- **PMID:** {source_pmid_link}
- **DOI:** {source_doi_link}

## Reference PMIDs ({len(pmids)} total)

```
{','.join(pmids)}
```

## PMID List

| # | PMID | PubMed Link |
|---|------|-------------|
"""

# Build all table rows first and join once instead of growing the string
# with += on every iteration.
pmid_md += "".join(
    f"| {i} | {p} | [Link](https://pubmed.ncbi.nlm.nih.gov/{p}/) |\n"
    for i, p in enumerate(pmids, 1)
)

pmid_filename = f"pmids_PMC{pmc_id_clean}.md"
# Explicit UTF-8: the paper title may contain non-ASCII characters, and the
# platform default encoding is not guaranteed to be able to encode them.
with open(pmid_filename, "w", encoding="utf-8") as f:
    f.write(pmid_md)
print(f"   ✓ {pmid_filename}")

# ----- STEP 5: Generate OUTPUT FILE 2 - Abstracts for Claude -----
# Only hyperlink identifiers that were actually found; the "Unknown"
# placeholder would otherwise be rendered as a dead link.
source_pmid_link = (
    f"[{paper_pmid}](https://pubmed.ncbi.nlm.nih.gov/{paper_pmid}/)"
    if paper_pmid != "Unknown" else "Unknown"
)
source_doi_link = (
    f"[{paper_doi}](https://doi.org/{paper_doi})"
    if paper_doi != "Unknown" else "Unknown"
)

abstract_md = f"""# Reference Abstracts for Digest

## Source Paper
- **Title:** {paper_title}
- **PMCID:** PMC{pmc_id_clean}
- **PMID:** {source_pmid_link}
- **DOI:** {source_doi_link}

## Task for Claude
For each reference below, write ONE sentence summarizing the key finding.
Output as a markdown table: | # | PMID | Digest |

---

## References ({len(articles)} articles)

"""

# Collect one markdown section per article and join once at the end instead
# of growing the document with += on every iteration.
article_sections = []
for i, art in enumerate(articles, 1):
    doi_link = f"[{art['doi']}](https://doi.org/{art['doi']})" if art['doi'] else "No DOI"
    if art['pmid'] != "Unknown":
        pmid_link = f"[{art['pmid']}](https://pubmed.ncbi.nlm.nih.gov/{art['pmid']}/)"
    else:
        pmid_link = "Unknown"

    article_sections.append(f"""### [{i}] {art['title']}
- **PMID:** {pmid_link}
- **DOI:** {doi_link}
- **Journal:** {art['journal']} ({art['year']})

{art['abstract']}

---

""")

abstract_md += "".join(article_sections)

abstract_filename = f"abstracts_PMC{pmc_id_clean}.md"
# Explicit UTF-8: titles and abstracts routinely contain non-ASCII characters,
# and the platform default encoding is not guaranteed to be able to encode them.
with open(abstract_filename, "w", encoding="utf-8") as f:
    f.write(abstract_md)
print(f"   ✓ {abstract_filename}")

# ----- SUMMARY -----
# Assemble the report line by line, then print each entry in order.
banner = "=" * 60
approx_tokens = len(abstract_md) // 4  # rough estimate: about 4 chars per token
summary_lines = [
    "\n" + banner,
    "COMPLETE",
    banner,
    f"Source: {paper_title[:50]}...",
    f"References: {len(pmids)} PMIDs, {len(articles)} abstracts retrieved",
    "\nOutput files:",
    f"  1. {pmid_filename} ({len(pmid_md):,} chars)",
    f"  2. {abstract_filename} ({len(abstract_md):,} chars, ~{approx_tokens:,} tokens)",
    banner,
]
for summary_line in summary_lines:
    print(summary_line)

# Download both files (PMID list first, then abstracts)
for output_name in (pmid_filename, abstract_filename):
    files.download(output_name)