Reading the CSV to fetch


In [6]:
# ============================
#
# Reads Title/Link CSV -> mines accessions/keywords -> queries OSDR -> writes outputs
# ============================

# --- Imports
import os, re, time, json, csv, math, html
import requests
import pandas as pd
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from datetime import datetime
from typing import List, Dict, Any

# --- Config
INPUT_CSV  = "../../datasets/SB_publication_PMC.csv"   # expects columns: Title, Link
OUT_CSV    = "../../datasets/paper_to_osdr_candidates.csv"
OUT_JSONL  = "../../datasets/paper_to_osdr_candidates.jsonl"

# Respectful rate-limits (NCBI guidance ~3 req/sec; we stay slower)
SLEEP_EFETCH = 0.35
SLEEP_OSDR   = 0.20
SLEEP_HTML   = 0.20
MAX_ROWS     = None      # set e.g. 50 to test a subset; None = all

# --- Endpoints
EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
PMC_ARTICLE = "https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/"
PUBMED_ESUMMARY = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
OSDR_QUERY = "https://visualization.osdr.nasa.gov/biodata/api/v2/query/metadata/"

# --- Patterns
ACC_PATTERNS = [r"GLDS-\d+", r"OSD-\d+", r"\bOSD\s*\d+\b"]
MISSION_PATTERNS = [
    r"\bBion[-\s]?M\s*1\b", r"\bBion[-\s]?M\s*2\b",
    r"\bRodent\s*Research\b", r"\bRR[-\s]?\d+\b",
    r"\bISS\b", r"\bSpaceX[-\s]?\d+\b", r"\bShuttle\b",
]
ORGANISM_PATTERNS = [
    r"\bMus\s+musculus\b", r"\bmouse\b", r"\bmice\b",
    r"\bRattus\s+norvegicus\b", r"\brat(s)?\b",
    r"\bArabidopsis\s+thaliana\b", r"\bDrosophila\b",
    r"\bSaccharomyces\s+cerevisiae\b", r"\bZebrafish\b",
]
ASSAY_PATTERNS = [
    r"\bRNA-?seq\b", r"\bmicroarray\b", r"\bproteomics?\b",
    r"\bmetabolomics?\b", r"\btranscriptomics?\b",
    r"\bhistolog(y|ies)\b", r"\bmicroscop(y|ies)\b",
]

# --- Helpers
def pmcid_from_url(url: str) -> str:
    if not url:
        return ""
    m = re.search(r"(PMC\d+)", url)
    return m.group(1) if m else ""

def fetch_pmc_xml_text(pmcid: str) -> str:
    """Fetch PMC full text via efetch XML; return crude text or '' if fails."""
    if not pmcid: return ""
    try:
        pmc_num = pmcid.replace("PMC","")
        r = requests.get(EFETCH, params={"db":"pmc","id":pmc_num,"retmode":"xml"}, timeout=30)
        r.raise_for_status()
        # quick-and-dirty text extraction from XML
        txt = " ".join(re.findall(r">([^<>]+)<", r.text))
        time.sleep(SLEEP_EFETCH)
        return html.unescape(txt)
    except Exception:
        return ""

def fetch_pmc_html_text(pmcid: str) -> str:
    """Fallback: scrape visible text <p> from PMC HTML page."""
    if not pmcid: return ""
    try:
        url = PMC_ARTICLE.format(pmcid=pmcid)
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
        time.sleep(SLEEP_HTML)
        return "\n".join(paragraphs)
    except Exception:
        return ""

def combined_pmc_text(pmcid: str) -> str:
    txt = fetch_pmc_xml_text(pmcid)
    if len(txt) < 1000:  # try HTML if XML too short or missing
        html_txt = fetch_pmc_html_text(pmcid)
        if len(html_txt) > len(txt):
            txt = html_txt
    return txt

def regex_find_all(patterns: List[str], text: str) -> set:
    found = set()
    for pat in patterns:
        for m in re.findall(pat, text, flags=re.I):
            found.add(m.strip())
    return found

def osdr_validate_accession(acc: str) -> List[Dict[str,Any]]:
    """Query OSDR for a specific accession -> return rows if found."""
    try:
        params = {"id.accession": acc, "format":"json", "=study":""}
        r = requests.get(OSDR_QUERY, params=params, timeout=30)
        if r.status_code == 200:
            time.sleep(SLEEP_OSDR)
            return r.json().get("rows", [])
    except Exception:
        pass
    return []

def osdr_search(q: str) -> List[Dict[str,Any]]:
    """Loose keyword search in study metadata (mission/species/assay terms)."""
    try:
        params = {"=study":"", "q": q, "format":"json"}
        r = requests.get(OSDR_QUERY, params=params, timeout=30)
        if r.status_code == 200:
            time.sleep(SLEEP_OSDR)
            return r.json().get("rows", [])
    except Exception:
        pass
    return []

def normalize_field(hit: Dict[str,Any], *keys) -> str:
    for k in keys:
        v = hit.get(k)
        if v: return str(v)
    return ""

def rank_candidates(missions:set, orgs:set, assays:set, pub_year:int=None, topk:int=5) -> List[Dict[str,Any]]:
    seen = {}
    queries = list(missions) or list(orgs) or list(assays) or []
    for q in queries:
        for h in osdr_search(q)[:25]:
            acc = normalize_field(h, "id.accession", "accession")
            if not acc: continue
            m = normalize_field(h, "study.mission", "mission").lower()
            o = normalize_field(h, "organism", "study.organism").lower()
            a = normalize_field(h, "assay", "study.assay_type").lower()
            ptype = normalize_field(h, "projectType", "study.project_type").lower()

            mission_match = any(k.lower() in m for k in missions) if m else False
            org_match     = any(k.lower() in o for k in orgs)     if o else False
            assay_match   = any(k.lower() in a for k in assays)   if a else False
            type_match    = bool(ptype and ("space" in ptype or "flight" in ptype))

            # (Optional) try to derive a year if present anywhere obvious
            year = None
            for k in ("study.publicationYear","publicationYear","year"):
                if h.get(k):
                    try:
                        year = int(str(h[k])[:4])
                        break
                    except:
                        pass
            time_bonus = 1 if (pub_year and year and abs(pub_year - year) <= 3) else 0

            score = 3*mission_match + 2*org_match + 2*assay_match + 1*type_match + time_bonus

            cur = seen.get(acc, {"score":0, "h":h})
            if score > cur["score"]:
                seen[acc] = {"score":score, "h":h}

    ranked = sorted(seen.values(), key=lambda x:x["score"], reverse=True)[:topk]
    results = []
    for r in ranked:
        h = r["h"]
        results.append({
            "accession": normalize_field(h, "id.accession", "accession"),
            "score": r["score"],
            "organism": normalize_field(h, "organism", "study.organism"),
            "assay": normalize_field(h, "assay", "study.assay_type"),
            "project_type": normalize_field(h, "projectType", "study.project_type"),
            "mission": normalize_field(h, "study.mission", "mission"),
            "data_source": normalize_field(h, "dataSource", "study.data_source"),
        })
    return results

def pmid_from_pmc(pmcid: str) -> str:
    """Try to scrape PMID from the PMC article page."""
    if not pmcid: return ""
    try:
        html_page = requests.get(PMC_ARTICLE.format(pmcid=pmcid), timeout=30).text
        m = re.search(r"PMID:\s*<a[^>]*>(\d+)</a>", html_page)
        return m.group(1) if m else ""
    except Exception:
        return ""

def pubmed_meta(pmid: str) -> Dict[str,Any]:
    """Get compact PubMed metadata for nicer UI cards."""
    if not pmid: return {}
    try:
        r = requests.get(PUBMED_ESUMMARY, params={"db":"pubmed","id":pmid,"retmode":"json"}, timeout=30)
        r.raise_for_status()
        js = r.json()["result"].get(pmid, {})
        meta = {
            "pmid": pmid,
            "title": js.get("title",""),
            "journal": js.get("fulljournalname") or js.get("source",""),
            "year": (js.get("pubdate") or js.get("epubdate",""))[:4],
            "authors": [a.get("name") for a in js.get("authors",[]) if a.get("name")],
        }
        time.sleep(0.35)
        return meta
    except Exception:
        return {}

def find_terms(text: str, patterns: List[str]) -> set:
    s=set()
    for p in patterns:
        s |= set(re.findall(p, text, flags=re.I))
    # normalize to lowercase display-friendly
    return {t.strip() for t in s}

def guess_pub_year_from_text(text: str) -> int:
    """Very rough heuristic if you want a year for ranking; optional."""
    m = re.search(r"(20\d{2})", text)
    try:
        y = int(m.group(1))
        if 2000 <= y <= datetime.now().year:
            return y
    except Exception:
        pass
    return None

# --- Load input
df = pd.read_csv(INPUT_CSV)
if MAX_ROWS:
    df = df.head(MAX_ROWS)

rows_out = []

# --- Process each paper
for idx, rec in df.iterrows():
    title = str(rec.get("Title","")).strip()
    link  = str(rec.get("Link","")).strip()
    pmcid = pmcid_from_url(link)

    # 1) Pull text (efetch XML -> HTML fallback)
    text = combined_pmc_text(pmcid) if pmcid else ""
    # 2) Mine terms
    accessions = regex_find_all(ACC_PATTERNS, text)
    missions   = find_terms(text, MISSION_PATTERNS)
    orgs       = find_terms(text, ORGANISM_PATTERNS)
    assays     = find_terms(text, ASSAY_PATTERNS)

    # (Optional) try PubMed meta
    pmid = pmid_from_pmc(pmcid)
    meta = pubmed_meta(pmid) if pmid else {}

    # Try to infer publication year
    pub_year = None
    if meta.get("year"):
        try:
            pub_year = int(meta["year"])
        except:
            pub_year = None
    if not pub_year:
        pub_year = guess_pub_year_from_text(text)

    # 3) Validate any exact accessions with OSDR
    validated = []
    for acc in sorted(accessions):
        md_rows = osdr_validate_accession(acc)
        for h in md_rows:
            validated.append({
                "accession": normalize_field(h, "id.accession", "accession"),
                "organism": normalize_field(h, "organism", "study.organism"),
                "assay": normalize_field(h, "assay", "study.assay_type"),
                "project_type": normalize_field(h, "projectType", "study.project_type"),
                "mission": normalize_field(h, "study.mission", "mission"),
                "data_source": normalize_field(h, "dataSource", "study.data_source"),
            })

    # 4) If none found, rank fallback candidates via mission/org/assay
    candidates = []
    if not validated and (missions or orgs or assays):
        candidates = rank_candidates(missions, orgs, assays, pub_year=pub_year, topk=5)

    # 5) Pack result row
    row = {
        "title": title,
        "pmc_url": link,
        "pmcid": pmcid,
        "pmid": pmid,
        "pub_year": pub_year or "",
        "found_accessions_text": ";".join(sorted(accessions)) if accessions else "",
        "mission_terms_text": ";".join(sorted(missions)) if missions else "",
        "organism_terms_text": ";".join(sorted(orgs)) if orgs else "",
        "assay_terms_text": ";".join(sorted(assays)) if assays else "",
        "validated_datasets": json.dumps(validated, ensure_ascii=False),
        "fallback_osdr_candidates": json.dumps(candidates, ensure_ascii=False),
        "pubmed_meta": json.dumps(meta, ensure_ascii=False),
    }
    rows_out.append(row)

# --- Write outputs
df_out = pd.DataFrame(rows_out)
df_out.to_csv(OUT_CSV, index=False)
with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for r in rows_out:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

# --- Summary
total = len(df_out)
exact = (df_out["found_accessions_text"].astype(str) != "").sum()
validated_ct = df_out["validated_datasets"].apply(lambda s: 1 if s and s != "[]" else 0).sum()
fallback_ct  = df_out["fallback_osdr_candidates"].apply(lambda s: 1 if s and s != "[]" else 0).sum()

print(f"[DONE] wrote:\n - {OUT_CSV}\n - {OUT_JSONL}")
print(f"Rows processed: {total}")
print(f"Had explicit accession text: {exact}")
print(f"Validated via OSDR: {validated_ct}")
print(f"Fallback candidates proposed: {fallback_ct}")


[DONE] wrote:
 - ../../datasets/paper_to_osdr_candidates.csv
 - ../../datasets/paper_to_osdr_candidates.jsonl
Rows processed: 607
Had explicit accession text: 108
Validated via OSDR: 0
Fallback candidates proposed: 0


In [8]:
# ============================
# Build a clean references dataset from paper_to_osdr_candidates.csv
# ============================
import json, math, pandas as pd

IN = "../../datasets/paper_to_osdr_candidates.csv"
OUT_CSV = "../../datasets/osdr_references.csv"
OUT_JSONL = "../../datasets/osdr_references.jsonl"

def parse_json_list(x):
    if not isinstance(x, str) or not x.strip():
        return []
    try:
        data = json.loads(x)
        if isinstance(data, list):
            return data
        return []
    except Exception:
        return []

def split_accessions(acc_str):
    if not isinstance(acc_str, str) or not acc_str.strip():
        return []
    parts = [a.strip() for a in acc_str.split(";") if a.strip()]
    return parts

df = pd.read_csv(IN)

rows = []
for _, r in df.iterrows():
    title   = r.get("title","")
    pmcid   = r.get("pmcid","") or ""
    pmid    = str(r.get("pmid","") or "")
    pmc_url = r.get("pmc_url","") or ""
    year    = str(r.get("pub_year","") or "")

    # mined terms (for context)
    mission_terms = (r.get("mission_terms_text","") or "")
    organism_terms = (r.get("organism_terms_text","") or "")
    assay_terms = (r.get("assay_terms_text","") or "")

    # A) VALIDATED via OSDR REST (highest confidence)
    validated = parse_json_list(r.get("validated_datasets",""))
    for v in validated:
        acc   = v.get("accession") or ""
        if not acc:
            continue
        mission  = v.get("mission") or ""
        organism = v.get("organism") or ""
        assay    = v.get("assay") or ""
        ptype    = v.get("project_type") or ""
        source   = v.get("data_source") or ""
        # URLs
        osdr_view = f"https://visualization.osdr.nasa.gov/biodata/dataset/{acc}/"
        osdr_api  = f"https://visualization.osdr.nasa.gov/biodata/api/v2/dataset/{acc}/"
        pubmed_url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else ""

        rows.append({
            "paper_pmcid": pmcid,
            "paper_pmid": pmid,
            "paper_title": title,
            "paper_year": year,
            "paper_url_pmc": pmc_url,
            "paper_url_pubmed": pubmed_url,
            "dataset_accession": acc,
            "dataset_osdr_view": osdr_view,
            "dataset_osdr_api": osdr_api,
            "mission": mission,
            "organism": organism,
            "assay": assay,
            "project_type": ptype,
            "data_source": source,
            "evidence_type": "validated",
            "confidence": 1.00,
            "evidence_notes": "Linked via OSDR REST dataset endpoint",
            "mined_mission_terms": mission_terms,
            "mined_organism_terms": organism_terms,
            "mined_assay_terms": assay_terms,
        })

    # B) EXPLICIT accession(s) in text that we couldn't validate (still useful)
    explicit_unvalidated = split_accessions(r.get("found_accessions_text",""))
    for acc in explicit_unvalidated:
        # avoid duplicating previously added validated links
        if any((acc == x["dataset_accession"] and x["paper_pmcid"] == pmcid) for x in rows):
            continue

        osdr_view = f"https://visualization.osdr.nasa.gov/biodata/dataset/{acc}/"
        osdr_api  = f"https://visualization.osdr.nasa.gov/biodata/api/v2/dataset/{acc}/"
        pubmed_url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else ""

        rows.append({
            "paper_pmcid": pmcid,
            "paper_pmid": pmid,
            "paper_title": title,
            "paper_year": year,
            "paper_url_pmc": pmc_url,
            "paper_url_pubmed": pubmed_url,
            "dataset_accession": acc,
            "dataset_osdr_view": osdr_view,
            "dataset_osdr_api": osdr_api,
            "mission": "",
            "organism": "",
            "assay": "",
            "project_type": "",
            "data_source": "",
            "evidence_type": "explicit_unvalidated",
            "confidence": 0.80,  # found in text but not confirmed via OSDR REST
            "evidence_notes": "Accession string found in article text",
            "mined_mission_terms": mission_terms,
            "mined_organism_terms": organism_terms,
            "mined_assay_terms": assay_terms,
        })

    # C) INFERRED candidates from structured OSDR search (if present)
    inferred = parse_json_list(r.get("fallback_osdr_candidates",""))
    # Normalize score to 0..1 for display; max nominal score in earlier logic ~7-8
    max_score_seen = max((c.get("score",0) for c in inferred), default=0) or 1
    for c in inferred:
        acc = c.get("accession") or ""
        if not acc:
            continue
        # avoid duplicates with validated/explicit_unvalidated
        if any((acc == x["dataset_accession"] and x["paper_pmcid"] == pmcid) for x in rows):
            continue

        mission  = c.get("mission") or ""
        organism = c.get("organism") or ""
        assay    = c.get("assay") or ""
        ptype    = c.get("project_type") or ""
        source   = c.get("data_source") or ""
        score    = float(c.get("score",0))
        conf     = round(min(1.0, max(0.25, score / max(8.0, max_score_seen))), 2)

        osdr_view = f"https://visualization.osdr.nasa.gov/biodata/dataset/{acc}/"
        osdr_api  = f"https://visualization.osdr.nasa.gov/biodata/api/v2/dataset/{acc}/"
        pubmed_url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else ""

        rows.append({
            "paper_pmcid": pmcid,
            "paper_pmid": pmid,
            "paper_title": title,
            "paper_year": year,
            "paper_url_pmc": pmc_url,
            "paper_url_pubmed": pubmed_url,
            "dataset_accession": acc,
            "dataset_osdr_view": osdr_view,
            "dataset_osdr_api": osdr_api,
            "mission": mission,
            "organism": organism,
            "assay": assay,
            "project_type": ptype,
            "data_source": source,
            "evidence_type": "inferred",
            "confidence": conf,
            "evidence_notes": "Ranked via mission/organism/assay overlap",
            "mined_mission_terms": mission_terms,
            "mined_organism_terms": organism_terms,
            "mined_assay_terms": assay_terms,
        })

# Build DataFrame
ref_df = pd.DataFrame(rows)

# Helpful sort: validated first, then explicit_unvalidated, then inferred by confidence
etype_rank = {"validated": 0, "explicit_unvalidated": 1, "inferred": 2}
ref_df["evidence_rank"] = ref_df["evidence_type"].map(etype_rank).fillna(3).astype(int)
ref_df = ref_df.sort_values(["evidence_rank","confidence"], ascending=[True, False]).reset_index(drop=True)
ref_df = ref_df.drop(columns=["evidence_rank"])

# Write outputs
ref_df.to_csv(OUT_CSV, index=False)
with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for rec in ref_df.to_dict(orient="records"):
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print(f"[DONE] references built:\n - {OUT_CSV}\n - {OUT_JSONL}\nRows: {len(ref_df)}")

# Quick peek
ref_df.head()


[DONE] references built:
 - ../../datasets/osdr_references.csv
 - ../../datasets/osdr_references.jsonl
Rows: 417


Unnamed: 0,paper_pmcid,paper_pmid,paper_title,paper_year,paper_url_pmc,paper_url_pubmed,dataset_accession,dataset_osdr_view,dataset_osdr_api,mission,organism,assay,project_type,data_source,evidence_type,confidence,evidence_notes,mined_mission_terms,mined_organism_terms,mined_assay_terms
0,PMC6387434,,GeneLab database analyses suggest long-term im...,2003.0,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://pubmed.ncbi.nlm.nih.gov/nan/,GLDS-109,https://visualization.osdr.nasa.gov/biodata/da...,https://visualization.osdr.nasa.gov/biodata/ap...,,,,,,explicit_unvalidated,0.8,Accession string found in article text,ISS;RR13538;RR15062;RR2289;RR2482;RR2598;RR310...,;mice;mouse,Microarray;RNAseq;Transcriptomic;metabolomics;...
1,PMC6387434,,GeneLab database analyses suggest long-term im...,2003.0,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://pubmed.ncbi.nlm.nih.gov/nan/,GLDS-117,https://visualization.osdr.nasa.gov/biodata/da...,https://visualization.osdr.nasa.gov/biodata/ap...,,,,,,explicit_unvalidated,0.8,Accession string found in article text,ISS;RR13538;RR15062;RR2289;RR2482;RR2598;RR310...,;mice;mouse,Microarray;RNAseq;Transcriptomic;metabolomics;...
2,PMC6387434,,GeneLab database analyses suggest long-term im...,2003.0,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://pubmed.ncbi.nlm.nih.gov/nan/,GLDS-52,https://visualization.osdr.nasa.gov/biodata/da...,https://visualization.osdr.nasa.gov/biodata/ap...,,,,,,explicit_unvalidated,0.8,Accession string found in article text,ISS;RR13538;RR15062;RR2289;RR2482;RR2598;RR310...,;mice;mouse,Microarray;RNAseq;Transcriptomic;metabolomics;...
3,PMC7072278,,NASA GeneLab platform utilized for biological ...,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://pubmed.ncbi.nlm.nih.gov/nan/,GLDS-100,https://visualization.osdr.nasa.gov/biodata/da...,https://visualization.osdr.nasa.gov/biodata/ap...,,,,,,explicit_unvalidated,0.8,Accession string found in article text,Bion-M1;ISS;RR1;RR13538;RR13878;RR14222;RR1482...,;Mice;Mouse;mice;mouse;s,Microarray;Proteomic;Transcriptomic;metabolomi...
4,PMC7072278,,NASA GeneLab platform utilized for biological ...,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://pubmed.ncbi.nlm.nih.gov/nan/,GLDS-111,https://visualization.osdr.nasa.gov/biodata/da...,https://visualization.osdr.nasa.gov/biodata/ap...,,,,,,explicit_unvalidated,0.8,Accession string found in article text,Bion-M1;ISS;RR1;RR13538;RR13878;RR14222;RR1482...,;Mice;Mouse;mice;mouse;s,Microarray;Proteomic;Transcriptomic;metabolomi...
