# AstroGraphAnomaly – Analysis to Images Report (Colab 2026)

This notebook recomputes the analyses for a run and regenerates the same family of figures as in your screenshots: graph views, RA/Dec maps, mean feature comparisons, top-anomaly bar charts, a colour-magnitude diagram (CMD), the region distribution, and community graphs.

The analysis cells run first; the image-generation cells follow.


In [None]:
# 1) Install dependencies
!pip -q install --upgrade pandas numpy matplotlib networkx scikit-learn umap-learn ipywidgets

import os, json, math, random
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import IsolationForest

from IPython.display import display, HTML
print("OK: deps installed/imported")


## 2) Choose your input

You can:
- Upload a run ZIP (extract to `/content/run_input`)
- Mount Google Drive and point to a `results/<run>` directory
- Use a local path, if you have already cloned the repo in Colab

Set `RUN_DIR` to the directory that contains `scored.csv`.


In [None]:
# 2) Input selection
#
# Resolves RUN_DIR, the directory that must contain `scored.csv`, from one of:
#   * an uploaded ZIP (UPLOAD_ZIP), extracted under /content/run_input
#   * a Google Drive folder (USE_DRIVE); this wins if both flags are set
#   * the relative fallback "results/run_csv"

UPLOAD_ZIP = False  # set True to upload a ZIP of a run directory
USE_DRIVE = False   # set True to mount Google Drive

RUN_DIR = None

if UPLOAD_ZIP:
    # Imported only when needed so the Drive / local-path routes do not depend
    # on the Colab upload helper.
    from google.colab import files
    import zipfile

    uploaded = files.upload()
    assert uploaded, "No ZIP file was uploaded."
    zip_name = next(iter(uploaded.keys()))
    out_dir = Path("/content/run_input")
    out_dir.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(zip_name, "r") as z:
        z.extractall(out_dir)

    # Pick the folder that actually holds scored.csv. Counting top-level
    # folders alone is fooled by archive side-cars such as `__MACOSX` and by
    # leftovers of a previous extraction. The shallowest match wins.
    scored_hits = sorted(
        (p for p in out_dir.rglob("scored.csv") if "__MACOSX" not in p.parts),
        key=lambda p: (len(p.parts), str(p)),
    )
    if scored_hits:
        RUN_DIR = str(scored_hits[0].parent)
    else:
        # No scored.csv anywhere: fall back to the single-folder heuristic and
        # let the later "scored.csv is required" check report the problem.
        candidates = [p for p in out_dir.iterdir() if p.is_dir()]
        RUN_DIR = str(candidates[0]) if len(candidates) == 1 else str(out_dir)

if USE_DRIVE:
    from google.colab import drive
    drive.mount("/content/drive")
    # Adjust this path to wherever the run directory lives on your Drive.
    RUN_DIR = "/content/drive/MyDrive/AstroGraphAnomaly/results/run_csv"

if RUN_DIR is None:
    RUN_DIR = "results/run_csv"  # fallback

RUN_DIR = str(Path(RUN_DIR).expanduser())
print("RUN_DIR =", RUN_DIR)
assert Path(RUN_DIR).exists(), f"RUN_DIR does not exist: {RUN_DIR}"


In [None]:
# 3) Load artefacts

run_path = Path(RUN_DIR)

# Logical artefact name -> location of the file inside the run directory.
paths = {
    key: run_path.joinpath(file_name)
    for key, file_name in (
        ("raw", "raw.csv"),
        ("scored", "scored.csv"),
        ("top", "top_anomalies.csv"),
        ("enriched", "scored_enriched.csv"),
        ("graph", "graph_full.graphml"),
    )
}

def read_csv_if_exists(p: Path):
    """Load a CSV file into a DataFrame, or return ``None`` when it is absent.

    The file is parsed in a single pass (``low_memory=False``) so that each
    column gets one consistently inferred dtype. Parse errors propagate to the
    caller: retrying the same file with a different ``low_memory`` setting
    cannot repair malformed input and would only read it twice.

    Parameters:
        p: Path of the CSV file.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` if ``p`` does not exist.
    """
    if not p.exists():
        return None
    return pd.read_csv(p, low_memory=False)

# Optional artefacts come back as None when their file is missing.
df_raw, df_scored, df_top, df_enriched = (
    read_csv_if_exists(paths[key]) for key in ("raw", "scored", "top", "enriched")
)

print("Found:")
for k, p in paths.items():
    status = "OK" if p.exists() else "missing"
    print(f"  {k:8s} ->", status, str(p))

# Everything downstream is derived from the scored table.
assert df_scored is not None, "scored.csv is required."

display(df_scored.head(3))


In [None]:
# 4) Build missing pieces (distance, anomaly_score, anomaly_label, top_anomalies)

def col(df: pd.DataFrame, name: str):
    """Resolve a canonical column name to the spelling present in ``df``.

    An exact match on ``name`` always wins. Otherwise the known alternative
    spellings for ``name`` are tried in their listed order and the first one
    found among the columns is returned.

    Parameters:
        df: Table whose columns are searched.
        name: Canonical column name (e.g. ``"ra"``, ``"anomaly_score"``).

    Returns:
        The matching column label, or ``None`` when neither ``name`` nor any of
        its alternatives is a column of ``df``.
    """
    known_spellings = {
        "source_id": ["source_id", "SOURCE_ID", "id"],
        "ra": ["ra", "RA"],
        "dec": ["dec", "DEC"],
        "anomaly_score": ["anomaly_score", "score", "anomalyScore"],
        "anomaly_label": ["anomaly_label", "label"],
        "bp_rp": ["bp_rp", "bp_minus_rp", "bp_rp_color"],
        "phot_g_mean_mag": ["phot_g_mean_mag", "g_mag"],
        "parallax": ["parallax"],
        "pmra": ["pmra"],
        "pmdec": ["pmdec"],
        "distance": ["distance", "dist_pc", "dist"],
        "community_id": ["community_id", "community", "louvain"],
    }
    available = df.columns
    if name in available:
        return name
    return next(
        (spelling for spelling in known_spellings.get(name, ()) if spelling in available),
        None,
    )

# Distance: derived from parallax only when the table has no distance column.
# The parallax is taken to be in mas (so 1000 / parallax is in parsecs);
# non-positive or non-finite parallaxes give NaN.
distance_c = col(df_scored, "distance")
parallax_c = col(df_scored, "parallax")
if parallax_c is not None and distance_c is None:
    px = pd.to_numeric(df_scored[parallax_c], errors="coerce")
    usable_px = np.isfinite(px) & (px > 0)
    dist = np.where(usable_px, 1000.0 / px, np.nan)
    df_scored["distance"] = dist
    distance_c = "distance"
    print("Added derived distance column from parallax.")

# Anomaly score: reuse the one in the table, else compute a baseline with an
# IsolationForest on the standardised features and rescale it to [0, 1]
# (higher = more anomalous, because decision_function is negated).
score_c = col(df_scored, "anomaly_score")
if score_c is None:
    print("No anomaly_score found. Computing baseline IsolationForest score.")
    feat_names = ["ra", "dec", "parallax", "pmra", "pmdec", "phot_g_mean_mag", "bp_rp", "distance"]
    feats = [c for c in (col(df_scored, n) for n in feat_names) if c is not None]
    # Unparseable / missing feature values are replaced by 0 before scaling.
    X = df_scored[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    Xs = StandardScaler().fit_transform(X)
    iso = IsolationForest(n_estimators=400, contamination=0.05, random_state=42, n_jobs=-1).fit(Xs)
    raw_score = -iso.decision_function(Xs)
    lowest, highest = np.nanmin(raw_score), np.nanmax(raw_score)
    # The small epsilon keeps the division defined when all scores are equal.
    score = (raw_score - lowest) / (highest - lowest + 1e-12)
    df_scored["anomaly_score"] = score
    score_c = "anomaly_score"

# anomaly_label
# Convention used by the rest of the notebook: -1 = anomalous, 1 = normal.
label_c = col(df_scored, "anomaly_label")
if label_c is None:
    # No label column: flag rows whose score reaches the 95th percentile.
    s = pd.to_numeric(df_scored[score_c], errors="coerce")
    thr = np.nanquantile(s.values, 0.95)
    df_scored["anomaly_label"] = np.where(s.values >= thr, -1, 1).astype(int)
    label_c = "anomaly_label"
else:
    lab = pd.to_numeric(df_scored[label_c], errors="coerce")
    present = set(pd.unique(lab.dropna()))
    # Only a column that actually contains a 0 is unambiguously in the 0/1
    # convention (0 = normal, 1 = anomalous). A column holding nothing but 1s
    # is left alone: it is what a -1/1 column without anomalies looks like, and
    # also what this cell itself produces from an all-0 column, so remapping it
    # would mark every row anomalous (and make re-running the cell flip labels).
    if 0 in present and present.issubset({0, 1}):
        # Rows whose label could not be parsed are treated as normal; without
        # the fillna the integer cast fails on NaN.
        df_scored[label_c] = lab.map({0: 1, 1: -1}).fillna(1).astype(int)

# Top anomalies: when the run did not ship top_anomalies.csv, take the 30
# highest-scoring rows of the scored table instead.
if df_top is None:
    print("top_anomalies.csv missing. Building top 30 from scored.csv")
    sid_c, ra_c, dec_c = (col(df_scored, key) for key in ("source_id", "ra", "dec"))
    tmp = df_scored.copy()
    tmp["_score"] = pd.to_numeric(tmp[score_c], errors="coerce")
    tmp = tmp.sort_values("_score", ascending=False).head(30)

    # Without a source_id column the row index stands in as identifier.
    if sid_c:
        top_ids = tmp[sid_c].astype(str).values
    else:
        top_ids = tmp.index.astype(str).values
    # Missing coordinate columns become NaN for every row.
    top_ra = pd.to_numeric(tmp[ra_c], errors="coerce").values if ra_c else np.nan
    top_dec = pd.to_numeric(tmp[dec_c], errors="coerce").values if dec_c else np.nan

    df_top = pd.DataFrame({
        "source_id": top_ids,
        "ra": top_ra,
        "dec": top_dec,
        "anomaly_score": tmp["_score"].values,
    })

# Summary of which physical columns were resolved.
print("Using columns:")
for shown_name, resolved in (
    ("source_id", col(df_scored, "source_id")),
    ("ra", col(df_scored, "ra")),
    ("dec", col(df_scored, "dec")),
    ("score", score_c),
    ("label", label_c),
):
    print(f" {shown_name}:", resolved)

display(df_top.head(5))


In [None]:
# 5) Load graph or rebuild kNN graph (deterministic)

def load_graph_graphml(p: Path):
    """Read a GraphML file, returning ``None`` when it is missing or unreadable.

    ``None`` is the signal the calling cell uses to rebuild a kNN graph from the
    feature table, so an unreadable file degrades to a rebuild and the reason is
    printed. No retry with ``node_type=str`` is attempted: that is already the
    default of ``read_graphml``, so it would repeat the identical call.

    Parameters:
        p: Path of the GraphML file.

    Returns:
        The loaded graph, or ``None`` if ``p`` does not exist or cannot be read.
    """
    if not p.exists():
        return None
    try:
        return nx.read_graphml(p)
    except Exception as exc:
        print(f"Could not read {p} ({type(exc).__name__}: {exc}); treating the graph as missing.")
        return None

G = load_graph_graphml(paths["graph"])

# Node identifiers are the source ids as strings (row index when there is no
# source_id column).
sid_c = col(df_scored, "source_id")
node_ids = df_scored[sid_c].astype(str).values if sid_c else df_scored.index.astype(str).values

if G is None:
    print("graph_full.graphml missing. Rebuilding kNN graph from features.")
    feat_names = ["ra", "dec", "parallax", "pmra", "pmdec", "phot_g_mean_mag", "bp_rp", "distance"]
    feats = [col(df_scored, n) for n in feat_names]
    feats = [c for c in feats if c is not None]
    X = df_scored[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    Xs = StandardScaler().fit_transform(X)
    # A point has at most n-1 other points to link to; asking NearestNeighbors
    # for more neighbours than there are samples raises.
    k = min(8, max(len(node_ids) - 1, 0))
    nn = NearestNeighbors(n_neighbors=k + 1, metric="euclidean")
    nn.fit(Xs)
    _, idxs = nn.kneighbors(Xs)

    G = nx.Graph()
    G.add_nodes_from(str(nid) for nid in node_ids)
    for i, nid in enumerate(node_ids):
        # Drop the query point by index rather than by position: with duplicated
        # feature rows it is not guaranteed to come back first, and slicing
        # [1:] would then add a self-loop and lose a real neighbour.
        neighbours = [j for j in idxs[i] if j != i][:k]
        for j in neighbours:
            G.add_edge(str(nid), str(node_ids[j]))

print("Graph:", G.number_of_nodes(), "nodes,", G.number_of_edges(), "edges")

# Node id -> anomaly score, keeping only finite scores.
score_series = pd.to_numeric(df_scored[score_c], errors="coerce").values
score_map = {str(n): float(s) for n, s in zip(node_ids, score_series) if np.isfinite(s)}


In [None]:
# 6) Communities (use enriched if present, else compute)

community_map = None

# Preferred source: community ids already stored in scored_enriched.csv.
if df_enriched is not None:
    sid_e = col(df_enriched, "source_id")
    cid_e = col(df_enriched, "community_id")
    if sid_e and cid_e:
        sids = df_enriched[sid_e].astype(str).values
        cids = pd.to_numeric(df_enriched[cid_e], errors="coerce").values
        m = np.isfinite(cids)
        if m.any():
            # Rows without a usable community id are left out of the map.
            community_map = dict(zip(map(str, sids[m]), map(int, cids[m])))
            print("Using community_id from scored_enriched.csv")

# Otherwise detect communities on the graph: Louvain first, greedy modularity
# if that fails for any reason, and a single community 0 as the last resort.
if community_map is None:
    print("Computing communities from graph.")
    comms = None
    detectors = (
        lambda: nx.algorithms.community.louvain_communities(G, seed=42),
        lambda: list(nx.algorithms.community.greedy_modularity_communities(G)),
    )
    for detect in detectors:
        try:
            comms = detect()
        except Exception:
            comms = None
            continue
        break
    if comms is None:
        community_map = dict.fromkeys((str(n) for n in G.nodes()), 0)
    else:
        community_map = {
            str(member): int(index)
            for index, group in enumerate(comms)
            for member in group
        }

print("Communities:", len(set(community_map.values())))


## 7) Generate the figures

This section recomputes and exports the plots. Output folder:

`<RUN_DIR>/image_report/`

It also builds a simple `index.html` gallery for convenience.


In [None]:
# 7) Plot suite

# All figures and the HTML gallery are written below the run directory.
out_dir = run_path.joinpath("image_report")
out_dir.mkdir(exist_ok=True, parents=True)

# Filenames of the figures saved so far, in creation order (filled by save_fig).
generated = list()

def save_fig(filename: str, title: str = ""):
    """Write the current matplotlib figure into ``out_dir`` and close it.

    Parameters:
        filename: Name of the image file to create inside ``out_dir``.
        title: Optional title set on the current axes before saving.

    Returns:
        Path of the written file. ``filename`` is also appended to the
        module-level ``generated`` list.
    """
    fig = plt.gcf()
    if title:
        plt.title(title)
    fig.tight_layout()
    target = out_dir / filename
    fig.savefig(target, dpi=220, bbox_inches="tight")
    plt.close(fig)
    generated.append(filename)
    return target

def plot_graph_nodes_by_score():
    """Draw the whole graph with nodes coloured by anomaly score.

    Uses a seeded spring layout. At most 6000 edges are drawn; beyond that a
    seeded shuffle picks which ones. Nodes without a score in ``score_map`` get
    the median of the known scores (0.0 when none is known). Saved as
    ``01_graph_nodes_by_score.png``.
    """
    layout = nx.spring_layout(G, seed=42, iterations=70)
    fig, ax = plt.subplots(figsize=(10, 10))

    drawn_edges = list(G.edges())
    if len(drawn_edges) > 6000:
        random.Random(42).shuffle(drawn_edges)
        drawn_edges = drawn_edges[:6000]
    for src, dst in drawn_edges:
        (x1, y1), (x2, y2) = layout[src], layout[dst]
        ax.plot([x1, x2], [y1, y2], linewidth=0.2, alpha=0.15)

    members = list(G.nodes())
    xs = [layout[n][0] for n in members]
    ys = [layout[n][1] for n in members]
    cs = np.array([score_map.get(str(n), np.nan) for n in members], dtype=float)
    unscored = np.isnan(cs)
    fill_value = 0.0 if unscored.all() else np.nanmedian(cs)
    cs = np.where(unscored, fill_value, cs)

    points = ax.scatter(xs, ys, c=cs, s=12, cmap="viridis", alpha=0.95)
    ax.axis("off")
    fig.colorbar(points, ax=ax, label="Anomaly Score")
    save_fig("01_graph_nodes_by_score.png", "Visualization of the Generated Graph (Nodes colored by Score)")

def plot_ra_dec_score():
    """Scatter the scored rows in RA/Dec, coloured by anomaly score.

    Does nothing when RA or Dec cannot be resolved in ``df_scored`` or when no
    row has finite RA, Dec and score. Saved as ``02_ra_dec_score.png``.
    """
    ra_c, dec_c = col(df_scored, "ra"), col(df_scored, "dec")
    if not (ra_c and dec_c):
        return
    ra, dec, score = (
        pd.to_numeric(df_scored[column], errors="coerce")
        for column in (ra_c, dec_c, score_c)
    )
    m = np.isfinite(ra.values) & np.isfinite(dec.values) & np.isfinite(score.values)
    if not m.any():
        return
    fig, ax = plt.subplots(figsize=(12, 7))
    points = ax.scatter(ra[m], dec[m], c=score[m], s=120, cmap="viridis", alpha=0.75)
    ax.set_xlabel("Right Ascension (RA)")
    ax.set_ylabel("Declination (Dec)")
    ax.grid(True, linestyle="--", alpha=0.4)
    fig.colorbar(points, ax=ax, label="Anomaly Score")
    save_fig("02_ra_dec_score.png", "Spatial Distribution of Nodes (RA vs Dec) colored by Score")

def plot_mean_features():
    """Bar chart of per-feature means for anomalous vs. normal rows.

    Rows labelled ``-1`` form the anomalous group; every other row (including
    rows whose label is not numeric) forms the normal group. Only the features
    that can be resolved in ``df_scored`` are shown; with none, nothing is
    drawn. Saved as ``03_mean_features_anom_vs_normal.png``.
    """
    feat_names = ["phot_g_mean_mag", "bp_rp", "parallax", "pmra", "pmdec"]
    cols = [c for c in (col(df_scored, n) for n in feat_names) if c is not None]
    if not cols:
        return

    values = df_scored[cols].apply(pd.to_numeric, errors="coerce")
    is_anomalous = pd.to_numeric(df_scored[label_c], errors="coerce") == -1
    anom = values[is_anomalous].mean(numeric_only=True)
    norm = values[~is_anomalous].mean(numeric_only=True)

    x = np.arange(len(cols))
    w = 0.38
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(x - w/2, anom.values, w, label="Anomalous")
    ax.bar(x + w/2, norm.values, w, label="Normal")
    ax.set_xticks(x)
    ax.set_xticklabels(cols)
    ax.set_ylabel("Mean Value")
    ax.legend()
    ax.grid(True, axis="y", linestyle="--", alpha=0.35)
    save_fig("03_mean_features_anom_vs_normal.png", "Comparison of Mean Feature Values: Anomalous vs. Normal")

def plot_top_bar():
    """Bar chart of the anomaly score of the (at most 30) top-scoring sources.

    Does nothing when ``df_top`` lacks a resolvable source id or score column.
    Saved as ``04_top_anomalies_bar.png``.
    """
    sid, sc = col(df_top, "source_id"), col(df_top, "anomaly_score")
    if not (sid and sc):
        return
    ranked = (
        df_top
        .assign(
            source_id=df_top[sid].astype(str),
            anomaly_score=pd.to_numeric(df_top[sc], errors="coerce"),
        )
        .sort_values("anomaly_score", ascending=False)
        .head(30)
    )
    fig, ax = plt.subplots(figsize=(18, 6))
    ax.bar(ranked["source_id"].values, ranked["anomaly_score"].values)
    # Source ids are long; turn the tick labels so they do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=90)
    ax.set_xlabel("Source ID (Anomalous Candidates)")
    ax.set_ylabel("Anomaly Score")
    ax.grid(True, axis="y", linestyle="--", alpha=0.35)
    save_fig("04_top_anomalies_bar.png", "Anomaly Score per Anomalous Source ID")

def plot_knn_subgraph():
    """Draw the first 30 sources of ``df_top`` together with their graph neighbours.

    Node colour is the score from ``df_top`` when available, else the score from
    ``score_map``, else the median of the scores present in the subgraph (0.0
    when there is none). Does nothing when ``df_top`` has no resolvable source
    id column or none of its sources is a node of ``G``. Saved as
    ``05_graph_anomalies_knn.png``.
    """
    sid = col(df_top, "source_id")
    sc = col(df_top, "anomaly_score")
    if not sid:
        return
    top_nodes = [str(x) for x in df_top[sid].astype(str).values[:30]]
    keep = set()
    for n in top_nodes:
        if n in G:
            keep.add(n)
            keep.update(G.neighbors(n))
    if not keep:
        return

    # Build the induced subgraph in G's own node/edge order. `keep` is a set of
    # strings whose iteration order changes between Python processes (string
    # hashing is randomised), and both the seeded layout and the per-edge colour
    # cycling depend on that order. Going through G keeps the figure
    # reproducible from one run to the next.
    ordered = [n for n in G.nodes() if n in keep]
    sg = G.__class__()
    sg.add_nodes_from(ordered)
    sg.add_edges_from(
        (u, v, dict(d)) for u, v, d in G.edges(ordered, data=True) if v in keep
    )

    pos = nx.spring_layout(sg, seed=42, iterations=80)
    plt.figure(figsize=(9, 9))
    for u, v in sg.edges():
        x1, y1 = pos[u]; x2, y2 = pos[v]
        plt.plot([x1, x2], [y1, y2], linewidth=0.8, alpha=0.6)

    # Scores listed in df_top take precedence over the ones from the scored table.
    score_local = {}
    if sc:
        sids = df_top[sid].astype(str).values
        scs = pd.to_numeric(df_top[sc], errors="coerce").values
        for s, v in zip(sids, scs):
            if np.isfinite(v):
                score_local[str(s)] = float(v)

    xs, ys, cs = [], [], []
    for n in sg.nodes():
        x, y = pos[n]; xs.append(x); ys.append(y)
        cs.append(score_local.get(str(n), score_map.get(str(n), np.nan)))
    cs = np.array(cs, dtype=float)
    med = np.nanmedian(cs) if not np.isnan(cs).all() else 0.0
    cs = np.where(np.isnan(cs), med, cs)
    scp = plt.scatter(xs, ys, c=cs, s=90, cmap="viridis")
    plt.axis("off")
    plt.colorbar(scp, label="Anomaly Score")
    save_fig("05_graph_anomalies_knn.png", "Graphe Anomalies (k-NN)")

def plot_top_position():
    """Scatter the top anomalies in RA/Dec, coloured by anomaly score.

    Does nothing when ``df_top`` lacks a resolvable RA, Dec or score column, or
    when no row has all three values finite. Saved as
    ``06_top_anomalies_position_score.png``.
    """
    ra, dec, sc = (col(df_top, key) for key in ("ra", "dec", "anomaly_score"))
    if not (ra and dec and sc):
        return
    r, d, s = (pd.to_numeric(df_top[column], errors="coerce") for column in (ra, dec, sc))
    m = np.isfinite(r.values) & np.isfinite(d.values) & np.isfinite(s.values)
    if not m.any():
        return
    fig, ax = plt.subplots(figsize=(8, 6))
    points = ax.scatter(r[m], d[m], c=s[m], s=60, cmap="viridis")
    ax.set_xlabel("RA (deg)")
    ax.set_ylabel("Dec (deg)")
    fig.colorbar(points, ax=ax, label="anomaly_score")
    save_fig("06_top_anomalies_position_score.png", "Top anomalies: position et score")

def plot_cmd():
    """Colour-magnitude diagram: BP-RP colour against G magnitude, y axis inverted.

    Does nothing when either column cannot be resolved in ``df_scored`` or when
    no row has both values finite. Saved as ``07_cmd_bp_rp_vs_g.png``.
    """
    bp, g = col(df_scored, "bp_rp"), col(df_scored, "phot_g_mean_mag")
    if not (bp and g):
        return
    bpv, gv = (pd.to_numeric(df_scored[column], errors="coerce") for column in (bp, g))
    m = np.isfinite(bpv.values) & np.isfinite(gv.values)
    if not m.any():
        return
    fig, ax = plt.subplots(figsize=(10, 7))
    ax.scatter(bpv[m], gv[m], s=6, alpha=0.6)
    ax.invert_yaxis()
    ax.set_xlabel("BP - RP Color [mag]")
    ax.set_ylabel("G-band Magnitude [mag]")
    ax.grid(True, linestyle="--", alpha=0.35)
    save_fig("07_cmd_bp_rp_vs_g.png", "Diagramme Couleur-Magnitude Gaia (BP-RP vs G)")

def plot_region_distribution():
    """Scatter the rows of the raw table in RA/Dec.

    Does nothing when ``raw.csv`` was not loaded, when RA or Dec cannot be
    resolved in it, or when no row has both values finite. Saved as
    ``08_region_distribution.png``.
    """
    if df_raw is None:
        return
    ra, dec = col(df_raw, "ra"), col(df_raw, "dec")
    if not (ra and dec):
        return
    r, d = (pd.to_numeric(df_raw[column], errors="coerce") for column in (ra, dec))
    m = np.isfinite(r.values) & np.isfinite(d.values)
    if not m.any():
        return
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(r[m], d[m], s=6, alpha=0.7)
    ax.set_xlabel("Right Ascension (RA) [deg]")
    ax.set_ylabel("Declination (Dec) [deg]")
    ax.grid(True, linestyle="--", alpha=0.35)
    save_fig("08_region_distribution.png", "Distribution des étoiles Gaia dans la région sélectionnée")

def plot_graph_by_community():
    """Draw the whole graph with nodes coloured by community id.

    Uses a seeded spring layout. At most 6000 edges are drawn; beyond that a
    seeded shuffle picks which ones. Nodes missing from ``community_map`` are
    given the id -1. Saved as ``09_graph_anomalies_by_community.png``.
    """
    layout = nx.spring_layout(G, seed=42, iterations=70)
    fig, ax = plt.subplots(figsize=(11, 11))

    drawn_edges = list(G.edges())
    if len(drawn_edges) > 6000:
        random.Random(42).shuffle(drawn_edges)
        drawn_edges = drawn_edges[:6000]
    for src, dst in drawn_edges:
        (x1, y1), (x2, y2) = layout[src], layout[dst]
        ax.plot([x1, x2], [y1, y2], linewidth=0.2, alpha=0.15)

    members = list(G.nodes())
    xs = [layout[n][0] for n in members]
    ys = [layout[n][1] for n in members]
    cs = np.array([float(community_map.get(str(n), -1)) for n in members])

    points = ax.scatter(xs, ys, c=cs, s=60, cmap="tab20", alpha=0.95)
    ax.axis("off")
    fig.colorbar(points, ax=ax, label="Community ID")
    save_fig("09_graph_anomalies_by_community.png", "Graphe Anomalies par Communauté (k-NN)")

def plot_score_hist():
    """Histogram (50 bins) of the numeric anomaly scores.

    Does nothing when no score can be parsed as a number. Saved as
    ``10_score_hist.png``.
    """
    scores = pd.to_numeric(df_scored[score_c], errors="coerce").dropna()
    if scores.empty:
        return
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(scores.values, bins=50)
    ax.set_xlabel("Anomaly Score")
    ax.set_ylabel("Count")
    ax.grid(True, linestyle="--", alpha=0.35)
    save_fig("10_score_hist.png", "Distribution des scores d'anomalie")

# Run plots
plot_graph_nodes_by_score()
plot_ra_dec_score()
plot_mean_features()
plot_top_bar()
plot_knn_subgraph()
plot_top_position()
plot_cmd()
plot_region_distribution()
plot_graph_by_community()
plot_score_hist()

print("Generated:", len(generated))
for n in generated:
    print(" -", n)

# Build simple HTML index
import base64
# Imported under an alias because the name `html` is used for the page text below.
from html import escape as _escape_html


def _report_page(src_for):
    """Return the gallery page; ``src_for(filename)`` gives each image's ``src``."""
    cards = "".join(
        f'<div class="card"><img src="{_escape_html(src_for(n))}"/>'
        f'<div class="cap">{_escape_html(n)}</div></div>'
        for n in generated
    )
    # The run path is arbitrary text chosen by the user, so it is escaped before
    # being placed in the markup.
    return f'''<!doctype html>
<html>
<head>
<meta charset="utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<title>AstroGraphAnomaly Image Report</title>
<style>
body{{font-family:Arial,sans-serif;margin:24px}}
.grid{{display:grid;grid-template-columns:repeat(auto-fit,minmax(280px,1fr));gap:14px}}
.card{{border:1px solid #ddd;border-radius:10px;padding:10px}}
.card img{{width:100%;height:auto;border-radius:8px}}
.cap{{font-size:12px;color:#333;margin-top:6px;word-break:break-all}}
</style>
</head>
<body>
<h1>AstroGraphAnomaly Image Report</h1>
<p>Run dir: <code>{_escape_html(str(run_path))}</code></p>
<div class="grid">{cards}</div>
</body>
</html>'''


def _as_data_uri(name):
    """Return the saved PNG ``name`` from ``out_dir`` as a base64 ``data:`` URI."""
    payload = base64.b64encode((out_dir / name).read_bytes()).decode("ascii")
    return f"data:image/png;base64,{payload}"


# index.html sits next to the images, so bare filenames are the right src there.
html = _report_page(lambda name: name)
(out_dir / "index.html").write_text(html, encoding="utf-8")

# The notebook output is not rendered from inside out_dir, so the same bare
# filenames would not resolve there; the inline preview embeds the images.
display(HTML(_report_page(_as_data_uri)))


In [None]:
# 8) Export as ZIP (images + index.html)

import shutil
from google.colab import files

zip_path = Path("/content") / "astrograph_image_report.zip"
# Start from a clean slate so a stale archive is never downloaded.
if zip_path.exists():
    zip_path.unlink()
# make_archive appends ".zip" itself, so it is given the path without suffix.
archive_base = zip_path.with_suffix("")
shutil.make_archive(str(archive_base), "zip", out_dir)
print("Wrote:", zip_path)

files.download(str(zip_path))
