In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
import re

# ============================================================
# 1. PubMed Search (Abstracts + Metadata)
# ============================================================

PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

QUERY = "Alzheimer disease therapeutic targets"
MAX_RESULTS = 30

params = {
    "db": "pubmed",
    "term": QUERY,
    "retmax": MAX_RESULTS,
    "retmode": "json"
}

search_resp = requests.get(PUBMED_SEARCH_URL, params=params)
ids = search_resp.json()["esearchresult"]["idlist"]

articles = []

for pmid in tqdm(ids):
    fetch_params = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "xml"
    }

    xml = requests.get(PUBMED_FETCH_URL, params=fetch_params).text
    soup = BeautifulSoup(xml, "xml")

    abstract = soup.find("AbstractText")
    title = soup.find("ArticleTitle")
    pmcid_tag = soup.find("ArticleId", {"IdType": "pmc"})

    articles.append({
        "pmid": pmid,
        "pmcid": pmcid_tag.text if pmcid_tag else None,
        "title": title.text if title else "",
        "abstract": abstract.text if abstract else ""
    })

pubmed_df = pd.DataFrame(articles)

# ============================================================
# 2. Europe PMC Full-Text Retrieval (Introduction + Conclusion)
# ============================================================

def fetch_full_text(pmcid):
    if not pmcid:
        return None
    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
    r = requests.get(url)
    if r.status_code != 200:
        return None
    return r.text


def extract_sections(xml_text):
    if not xml_text:
        return {"introduction": "", "conclusion": ""}

    soup = BeautifulSoup(xml_text, "xml")
    intro, concl = "", ""

    for sec in soup.find_all("sec"):
        title = sec.title.text.lower() if sec.title else ""
        text = " ".join(p.text for p in sec.find_all("p"))

        if "introduction" in title:
            intro += " " + text

        if "conclusion" in title or "discussion" in title:
            concl += " " + text

    return {
        "introduction": intro.strip(),
        "conclusion": concl.strip()
    }


introductions = []
conclusions = []

for pmcid in tqdm(pubmed_df.pmcid):
    xml = fetch_full_text(pmcid)
    sections = extract_sections(xml)
    introductions.append(sections["introduction"])
    conclusions.append(sections["conclusion"])

pubmed_df["introduction"] = introductions
pubmed_df["conclusion"] = conclusions

# ============================================================
# 3. Text Cleaning
# ============================================================

def clean_text(text):
    text = re.sub(r"\s+", " ", text)
    return text.strip()

for col in ["abstract", "introduction", "conclusion"]:
    pubmed_df[col] = pubmed_df[col].astype(str).apply(clean_text)

# ============================================================
# 4. Combine Sections (Assignment Requirement)
# ============================================================

def combine_sections(row):
    parts = []
    if row.abstract:
        parts.append("ABSTRACT: " + row.abstract)
    if row.introduction:
        parts.append("INTRODUCTION: " + row.introduction)
    if row.conclusion:
        parts.append("CONCLUSION: " + row.conclusion)
    return "\n\n".join(parts)

pubmed_df["full_text"] = pubmed_df.apply(combine_sections, axis=1)

# ============================================================
# 5. Exploratory Data Analysis (Light)
# ============================================================

pubmed_df["text_length"] = pubmed_df.full_text.apply(len)
pubmed_df[["text_length"]].describe()

# ============================================================
# 6. Chunking for RAG (No LangChain)
# ============================================================

def split_text(text, chunk_size=500, overlap=100):
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start += chunk_size - overlap

    return chunks


rows = []

for _, row in pubmed_df.iterrows():
    chunks = split_text(row.full_text)
    for ch in chunks:
        rows.append({
            "text": ch,
            "pmid": row.pmid,
            "title": row.title
        })

chunks_df = pd.DataFrame(rows)

# ============================================================
# 7. Save Prepared Dataset
# ============================================================

chunks_df.to_csv("/content/drive/MyDrive/data/pubmed_chunks.csv", index=False)

print(f"Saved {len(chunks_df)} chunks for RAG modeling")



100%|██████████| 30/30 [00:07<00:00,  3.90it/s]
100%|██████████| 30/30 [00:17<00:00,  1.68it/s]


Saved 85 chunks for RAG modeling
