In [6]:
# --- 08 preflight: run this first ---
from pathlib import Path
import os

def find_repo_root():
    """Locate the project root by walking up from the current working directory.

    Markers are tried in priority order; for each marker the nearest ancestor
    (the CWD itself included) that contains it wins:

    1. ``results/drug_repurposing/final_shortlist_ranked.csv`` (output of 07)
    2. ``.git``
    3. ``results``

    Returns:
        The matching ``Path``, or ``None`` when no ancestor carries any marker.
    """
    start = Path.cwd()
    lineage = (start, *start.parents)
    markers = (
        "results/drug_repurposing/final_shortlist_ranked.csv",
        ".git",
        "results",
    )
    # Exhaust the whole ancestor chain for one marker before falling back to
    # the next, so a distant strong marker outranks a nearby weak one.
    for marker in markers:
        match = next((d for d in lineage if (d / marker).exists()), None)
        if match is not None:
            return match
    return None

# Resolve the repo root once and make it the working directory so the
# relative "results/..." paths used by later cells resolve correctly.
repo = find_repo_root()
if repo is None:
    # Explicit raise instead of `assert`: assertions are stripped under
    # `python -O`, which would let os.chdir(None) fail with an unrelated error.
    raise FileNotFoundError("Could not find repo root with results/.")
os.chdir(repo)
print("Working dir:", Path.cwd())


Working dir: /home/glen/scleroderma-scvi


In [7]:
# --- Robust loading for report assembly (works even if no .h5ad is present) ---
from pathlib import Path
import os
import pandas as pd

try:
    import scanpy as sc  # only needed if we actually find an .h5ad
except Exception:
    sc = None

# 0) Locate the repo root: the nearest ancestor (starting at the CWD) that
#    contains results/ or data/processed/. When one is found it also becomes
#    the working directory; otherwise the CWD is used as-is.
_start = Path.cwd()
ROOT = next(
    (
        d
        for d in (_start, *_start.parents)
        if any((d / marker).exists() for marker in ("results", "data/processed"))
    ),
    None,
)
if ROOT is None:
    ROOT = _start
else:
    os.chdir(ROOT)
print("Repo root:", ROOT)

# Output locations used by the rest of the report; only OUT is created here.
OUT = ROOT / "results/drug_repurposing"
TABLES = ROOT / "results/tables"
OUT.mkdir(parents=True, exist_ok=True)

# 1) Look for an AnnData file without failing when none exists.
#    Patterns run from most to least specific; the first pattern that matches
#    anything supplies the candidates, newest modification time first.
def _mtime(path):
    """Return the file's modification time (sort key)."""
    return path.stat().st_mtime

proc = ROOT / "data/processed"
_h5ad_patterns = (
    "ssc_skin_scvi_annot_curated*.h5ad",
    "ssc_skin_scvi_annot*.h5ad",
    "ssc_skin_scvi*.h5ad",
    "*.h5ad",
)
if proc.exists():
    # Lazy generator: later patterns are not globbed once an earlier one hits.
    _ranked = (
        sorted(proc.glob(pat), key=_mtime, reverse=True) for pat in _h5ad_patterns
    )
    cands = next((found for found in _ranked if found), [])
else:
    cands = []

# NOTE(review): HAVE_ADATA() is not defined in this cell — presumably it comes
# from an earlier cell; confirm it exists on a fresh Restart & Run All.
if not HAVE_ADATA():
    # Bind `adata` on the skip path too, so the summary print further down
    # cannot raise NameError on a fresh kernel. Any value an earlier cell left
    # behind is kept; otherwise it becomes None.
    adata = globals().get("adata")
    print("[skip] adata not available; using CSV-only flow in 08.")
else:
    adata = None
    if cands and sc is not None:
        adata_path = cands[0]
        print("Using AnnData:", adata_path)
        adata = sc.read_h5ad(adata_path)
    elif cands:
        # A file exists but scanpy failed to import: say so instead of
        # claiming that no .h5ad was found.
        print("⚠️ Found", cands[0], "but scanpy is not importable; continuing without AnnData.")
    else:
        print("⚠️ No .h5ad found under:", proc if proc.exists() else "(no data/processed/)")
        if proc.exists():
            print("data/processed contains:", [p.name for p in proc.iterdir()])

# 2) Read the CSV artifacts that drive the report.
#    The ranked shortlist is mandatory; the other two tables are optional and
#    fall back to an empty DataFrame when their file is absent.
def _read_optional_csv(path):
    """Return the CSV at `path` as a DataFrame, or an empty DataFrame if missing."""
    if path.exists():
        return pd.read_csv(path)
    return pd.DataFrame()

final_csv = OUT / "final_shortlist_ranked.csv"
dose_csv = OUT / "dose_time_consistency_summary.csv"
top15_csv = OUT / "lincs_reversal_top15_by_cluster.csv"

if not final_csv.exists():
    raise FileNotFoundError(f"Missing {final_csv}. Run the earlier 07 cells to create it.")

final = pd.read_csv(final_csv)
dose = _read_optional_csv(dose_csv)
top15 = _read_optional_csv(top15_csv)

_load_summary = (
    f"Loaded tables — final:{final.shape}  dose:{dose.shape}  top15:{top15.shape}  "
    f"AnnData present: {adata is not None}"
)
print(_load_summary)


Repo root: /home/glen/scleroderma-scvi
[skip] adata not available; using CSV-only flow in 08.
Loaded tables — final:(15, 25)  dose:(15, 12)  top15:(300, 7)  AnnData present: False


In [8]:
# ---- 08: summarize + make a shareable bundle ----
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import shutil

# Bundle folder lives next to the ranked table (paths are relative to the CWD).
OUT = Path("results/drug_repurposing")
BUNDLE = OUT / "final_bundle"
BUNDLE.mkdir(parents=True, exist_ok=True)

# Final ranked shortlist
df = pd.read_csv(OUT / "final_shortlist_ranked.csv")

# Compact preview: keep only the wanted columns that are actually present,
# so a missing column does not break the display.
_preview_wanted = (
    "base_compound", "moa", "status", "priority_final",
    "total_score", "max_rev", "neglog10_best_p",
    "has_targets", "fibro_selectivity",
)
cols = [name for name in _preview_wanted if name in df.columns]
_preview = df[cols].head(15)
display(_preview)

# Horizontal bar chart of the 12 highest-priority candidates.
top12 = df.sort_values("priority_final", ascending=False).head(12)
_bars = top12.iloc[::-1]  # reversed so the best candidate is drawn at the top

fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(_bars["base_compound"], _bars["priority_final"])
ax.set_xlabel("Priority score")
ax.set_title("Top 12 candidates")
fig.tight_layout()

chart_path = BUNDLE / "top12_priority.png"
fig.savefig(chart_path, dpi=160)
plt.close(fig)  # saved to disk only; not rendered inline
print("[saved]", chart_path)

# One-pagers made earlier in 07
pages = sorted(BUNDLE.glob("onepager_*.png"))

# The archive below is built from BUNDLE only, so anything the landing page
# links to must live inside BUNDLE. Copy the ranked table in; a "../" link
# would be dead once the zip is unpacked elsewhere.
_ranked_src = OUT / "final_shortlist_ranked.csv"
shutil.copy2(_ranked_src, BUNDLE / _ranked_src.name)

# Only advertise downloads that really exist in the bundle.
_downloads = [
    name
    for name in ("final_shortlist_ranked.csv", "shortlist_shareable.csv")
    if (BUNDLE / name).exists()
]

# Minimal landing page
def rel(p):
    """Return `p` as a POSIX-style path relative to BUNDLE (for use in HTML links)."""
    return p.relative_to(BUNDLE).as_posix()

html = [
    "<html><head><meta charset='utf-8'><title>Drug repurposing shortlist</title></head><body>",
    f"<h1>Drug repurposing shortlist</h1><p>Generated {dt.datetime.now().isoformat(timespec='minutes')}</p>",
    "<h2>Downloads</h2><ul>",
]
html.extend(f"<li><a href='{name}'>{name}</a></li>" for name in _downloads)
html += [
    "</ul>",
    "<h2>Top 12 (priority)</h2>",
    f"<img src='{rel(chart_path)}' style='max-width:900px'>",
    "<h2>One-pagers</h2>",
]
for p in pages:
    html.append(f"<div style='margin:10px 0'><img src='{rel(p)}' style='max-width:900px'><br>{p.name}</div>")
html.append("</body></html>")

# The page declares charset utf-8, so write it as utf-8 explicitly rather
# than with the platform default encoding.
(BUNDLE / "index.html").write_text("\n".join(html), encoding="utf-8")
print("[saved]", BUNDLE / "index.html")

# Zip the bundle folder for sharing (archive is written next to the folder)
zip_path = shutil.make_archive(str(OUT / "final_bundle"), "zip", root_dir=BUNDLE)
print("[zipped]", zip_path)


Unnamed: 0,base_compound,moa,status,priority_final,total_score,max_rev,neglog10_best_p,has_targets
0,PD-0325901,MEK inhibitor,Clinical-stage,3.825211,53.157864,28.089862,28.089862,False
1,gefitinib,EGFR inhibitor,FDA-approved,2.532582,46.533121,23.742711,23.742711,False
2,WZ-3105,Unknown/other,Unknown,0.766407,18.100374,15.352157,15.352157,False
3,PD-184352,MEK inhibitor,Clinical-stage,0.553922,26.381086,22.294412,22.294412,False
4,BI-2536,PLK1 inhibitor,Clinical-stage,0.384485,25.227382,21.435435,21.435435,False
5,I-BET151,BET bromodomain inhibitor,Unknown,0.275805,21.086666,18.288711,18.288711,False
6,JNK-9L,JNK inhibitor,Unknown,0.097095,29.226276,15.105318,15.105318,False
7,I-BET,BET bromodomain inhibitor,Clinical-stage,-0.16189,22.338993,18.855543,18.855543,False
8,sirolimus,mTOR inhibitor,FDA-approved,-0.174531,26.108791,18.463558,18.463558,False
9,pelitinib,EGFR inhibitor,Unknown,-0.437989,20.454419,17.093816,17.093816,False


[saved] results/drug_repurposing/final_bundle/top12_priority.png
[saved] results/drug_repurposing/final_bundle/index.html
[zipped] /home/glen/scleroderma-scvi/results/drug_repurposing/final_bundle.zip


In [9]:
# --- Append/refresh "Provenance & reproducibility" in bundle + README ---

from pathlib import Path
import subprocess, platform, re, json, datetime as dt
import pandas as pd

# soft deps
try:
    import scanpy as sc
except Exception:
    sc = None
try:
    import scvi
except Exception:
    scvi = None
try:
    import gseapy as gp
except Exception:
    gp = None

# Locations touched by this cell (relative to the current working directory).
OUT = Path("results/drug_repurposing")
BUNDLE = OUT / "final_bundle"
INDEX = BUNDLE / "index.html"
README = OUT / "README_overview.md"
META = Path("results/metadata")

# Only the output and metadata folders are created; BUNDLE is left alone.
for _folder in (OUT, META):
    _folder.mkdir(parents=True, exist_ok=True)

# --- helpers ---
def _run(*args):
    try:
        return subprocess.run(args, capture_output=True, text=True, check=False).stdout.strip() or "n/a"
    except Exception:
        return "n/a"

def pick_h5ad(proc_dir="data/processed"):
    """Return the file name of the AnnData file to record in the provenance.

    Uses the same selection rule as the loader cell of this notebook: patterns
    are tried from most to least specific, and within the first pattern that
    matches, the most recently modified file wins. Taking the newest
    ``ssc_skin_scvi*.h5ad`` regardless of specificity could report a different
    file than the one the loader picks.

    Args:
        proc_dir: Directory to search; defaults to ``data/processed`` relative
            to the current working directory.

    Returns:
        The bare file name of the chosen file, or ``"n/a"`` when nothing matches.
    """
    patterns = (
        "ssc_skin_scvi_annot_curated*.h5ad",
        "ssc_skin_scvi_annot*.h5ad",
        "ssc_skin_scvi*.h5ad",
        "*.h5ad",
    )
    folder = Path(proc_dir)
    for pattern in patterns:
        newest = max(folder.glob(pattern), key=lambda p: p.stat().st_mtime, default=None)
        if newest is not None:
            return newest.name
    return "n/a"

# --- collect metadata ---
# Library names recorded verbatim under the "lincs_libraries" key.
libs = [
    "GO_Biological_Process_2023",
    "Reactome_2022",
    "KEGG_2021_Human",
    "LINCS_L1000_Chem_Pert_up",
    "LINCS_L1000_Chem_Pert_down",
]

de_csv = "results/tables/rank_genes_groups_leiden_wilcoxon.csv"

def _version_of(module):
    """Return module.__version__, or "n/a" when absent (e.g. module is None)."""
    return getattr(module, "__version__", "n/a")

# Key order is kept because it is also the order written to provenance.json.
meta = {
    "timestamp": format(dt.datetime.now(), "%Y-%m-%d %H:%M:%S"),
    "repo_tag": _run("git", "describe", "--tags", "--abbrev=0"),
    "git_commit": _run("git", "rev-parse", "--short", "HEAD"),
    "python": platform.python_version(),
    "scanpy": _version_of(sc),
    "scvi_tools": _version_of(scvi),
    "pandas": pd.__version__,
    "gseapy": _version_of(gp),
    "anndata_file": pick_h5ad(),
    "de_table": de_csv if Path(de_csv).exists() else "n/a",
    "lincs_libraries": libs,
}

# HTML fragment for the bundle landing page, filled in from `meta`.
# The PROVENANCE:BEGIN / PROVENANCE:END comment lines delimit the fragment;
# .strip() drops the leading and trailing newline of the triple-quoted literal
# so the string starts and ends exactly on those marker lines.
# NOTE(review): values from `meta` are interpolated without HTML escaping.
html_block = f"""
<!-- PROVENANCE:BEGIN -->
<section id="provenance" style="border:1px solid #ddd;padding:12px;border-radius:8px;margin-top:12px;background:#fafafa">
  <h2 style="margin-top:0">Provenance &amp; Reproducibility</h2>
  <ul style="margin:0 0 0 18px">
    <li><b>Git:</b> tag <code>{meta['repo_tag']}</code>, commit <code>{meta['git_commit']}</code></li>
    <li><b>AnnData used:</b> <code>{meta['anndata_file']}</code></li>
    <li><b>DE table:</b> <code>{meta['de_table']}</code></li>
    <li><b>LINCS libraries:</b> {", ".join(meta["lincs_libraries"])}</li>
    <li><b>Environment:</b> Python {meta['python']}; scanpy {meta['scanpy']}; scvi-tools {meta['scvi_tools']}; pandas {meta['pandas']}; gseapy {meta['gseapy']}</li>
    <li><b>Generated:</b> {meta['timestamp']}</li>
  </ul>
  <p style="margin:10px 0 0 0">
    Reproduce: run notebooks <code>06_results.ipynb</code> → <code>07_state_signatures_and_drugs.ipynb</code> →
    <code>08_validation_and_robustness.ipynb</code>.
  </p>
</section>
<!-- PROVENANCE:END -->
""".strip()

# Markdown counterpart of the provenance section, filled in from `meta`.
# It is wrapped in the same PROVENANCE:BEGIN / PROVENANCE:END HTML-comment
# lines; .strip() removes the literal's leading and trailing newline so the
# string starts and ends exactly on those marker lines.
md_block = f"""
<!-- PROVENANCE:BEGIN -->
## Provenance & reproducibility

- **Git:** tag `{meta['repo_tag']}`, commit `{meta['git_commit']}`
- **AnnData used:** `{meta['anndata_file']}`
- **DE table:** `{meta['de_table']}`
- **LINCS libraries:** {", ".join(meta["lincs_libraries"])}
- **Environment:** Python {meta['python']}; scanpy {meta['scanpy']}; scvi-tools {meta['scvi_tools']}; pandas {meta['pandas']}; gseapy {meta['gseapy']}
- **Generated:** {meta['timestamp']}

_Reproduce:_ run notebooks `06_results` → `07_state_signatures_and_drugs` → `08_validation_and_robustness`.
<!-- PROVENANCE:END -->
""".strip()

def upsert_between_markers(path: Path, begin="<!-- PROVENANCE:BEGIN -->", end="<!-- PROVENANCE:END -->", replacement=""):
    """Insert or refresh a marker-delimited section in a UTF-8 text file.

    Every ``begin ... end`` span in the file (shortest match, across lines) is
    replaced by ``replacement``. When no such span exists — including when the
    markers are missing or appear in the wrong order — ``replacement`` is
    appended after a blank line instead.

    Args:
        path: File to patch in place.
        begin: Literal text that opens the section.
        end: Literal text that closes the section.
        replacement: Full text to put in place of the section. It is expected
            to contain the markers itself so a later call can find it again.

    Returns:
        ``False`` if ``path`` does not exist (nothing is written), else ``True``.
    """
    if not path.exists():
        return False
    txt = path.read_text(encoding="utf-8")
    span = re.compile(re.escape(begin) + r".*?" + re.escape(end), flags=re.S)
    # A callable replacement is inserted verbatim. A plain string would be
    # treated as a template, so backslashes or \g<...> in `replacement` would
    # be interpreted or raise re.error.
    new, n_replaced = span.subn(lambda _match: replacement, txt)
    if n_replaced == 0:
        new = txt.rstrip() + "\n\n" + replacement + "\n"
    path.write_text(new, encoding="utf-8")
    return True

import shutil  # local import: this cell's own import lines do not bring it in

# patch HTML bundle index (if present)
if INDEX.exists():
    upsert_between_markers(INDEX, replacement=html_block)
    print("[patched]", INDEX)
    # An archive built before this patch still holds the old index.html.
    # Rebuild it so the shared zip carries the provenance section. Only an
    # existing archive is refreshed; no new artifact is created here.
    _archive_base = OUT / "final_bundle"
    if _archive_base.with_suffix(".zip").exists():
        _zip_path = shutil.make_archive(str(_archive_base), "zip", root_dir=BUNDLE)
        print("[zipped]", _zip_path)
else:
    print("[note] bundle index.html not found; skipping HTML append")

# patch README_overview.md (create if missing)
if README.exists():
    upsert_between_markers(README, replacement=md_block)
    print("[patched]", README)
else:
    README.write_text(md_block + "\n", encoding="utf-8")
    print("[created]", README)

# save machine-readable provenance (explicit encoding, like the other writes)
(META/"provenance.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
print("[saved]", META/"provenance.json")


  from .autonotebook import tqdm as notebook_tqdm


[patched] results/drug_repurposing/final_bundle/index.html
[patched] results/drug_repurposing/README_overview.md
[saved] results/metadata/provenance.json
