# AstroGraphAnomaly - Test maximal des capacites (Colab 2026)

Ce notebook sert a stresser au maximum la pile: chargement des artefacts, cross-match SIMBAD, visualisations Plotly interactives, embeddings UMAP et t-SNE, export PNG HD via kaleido, et (optionnel) un smoke-test torch-geometric si un graphe est disponible.

Hypotheses:
- Tu as deja un run AstroGraphAnomaly qui a produit raw.csv, scored.csv, top_anomalies.csv, et idealement graph_full.graphml.
- Sinon, tu peux uploader ces fichiers ou pointer un dossier Drive.


## Installation

In [None]:
# 1) Dependency installation (Colab)
# Note: torch-geometric can be fragile depending on the CUDA build; this cell
# installs the regular dependencies with notebook shell commands.
!pip -q install --upgrade pip

# Core data + visualization stack
!pip -q install -U pandas numpy matplotlib plotly kaleido astropy astroquery ipywidgets scikit-learn umap-learn networkx

# Torch (keeps the standard pip build)
!pip -q install -U torch

# Tentative torch-geometric (best effort)
import os, sys, re, subprocess

def _run(cmd: str) -> int:
    print("+", cmd)
    return subprocess.call(cmd, shell=True)

# Best-effort torch-geometric install: any failure is reported, never raised,
# so the rest of the notebook can still run without PyG.
try:
    import torch

    # Keep only the "X.Y.Z" prefix (drops local suffixes such as "+cu121").
    version_match = re.match(r"^(\d+\.\d+\.\d+)", torch.__version__)
    if version_match is None:
        raise RuntimeError(f"version torch non reconnue: {torch.__version__!r}")
    torch_ver = version_match.group(1)

    # torch.version.cuda is falsy on CPU-only builds.
    cuda_ver = torch.version.cuda
    cuda_tag = ("cu" + cuda_ver.replace(".", "")) if cuda_ver else "cpu"

    wheel_url = f"https://data.pyg.org/whl/torch-{torch_ver}+{cuda_tag}.html"

    # Run pip through the current interpreter so the packages land in the
    # environment this kernel actually imports from.
    pip_cmd = f'"{sys.executable}" -m pip'

    # Compiled PyG extension wheels.
    # NOTE(review): treated as optional here (torch-geometric is still attempted
    # if they fail) — confirm against the PyG installation notes.
    ext_status = _run(
        f"{pip_cmd} -q install pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv -f {wheel_url}"
    )
    if ext_status != 0:
        print("extensions pyg optionnelles non installees (code", ext_status, "): on continue")

    # The exit status must be checked: _run() does not raise on pip failure.
    pyg_status = _run(f"{pip_cmd} -q install torch-geometric")
    if pyg_status != 0:
        raise RuntimeError(f"pip install torch-geometric a echoue (code {pyg_status})")
    print("torch-geometric: OK")
except Exception as e:
    print("torch-geometric: non installe (best effort). Erreur:", repr(e))


## Imports

In [None]:
# 2) Imports essentiels
import os
from zipfile import ZipFile

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go

from IPython.display import display

from astropy.coordinates import SkyCoord
import astropy.units as u
from astroquery.simbad import Simbad

from sklearn.manifold import TSNE
import umap

import networkx as nx

# Optional: torch-geometric. HAS_PYG records whether torch and the PyG pieces
# used later in the notebook could be imported.
HAS_PYG = False
try:
    import torch
    from torch_geometric.data import Data
    from torch_geometric.nn import GATConv
except Exception:
    # Any import-time failure simply disables the optional PyG section.
    pass
else:
    HAS_PYG = True

print("HAS_PYG =", HAS_PYG)


## Preparation

In [None]:
# 3) Output directories (created if missing, left untouched otherwise)
for out_dir in ("outputs", "screenshots", "screenshots_interactive"):
    os.makedirs(out_dir, exist_ok=True)


## Chargement des donnees

Tu as trois options:
1) Uploader raw.csv, scored.csv, top_anomalies.csv (et eventuellement graph_full.graphml) dans l'espace Colab.
2) Pointer un dossier Drive.
3) Si rien n'est dispo, le notebook tourne en mode "df_top sample" (limite) pour tester SIMBAD + 2-3 figures.


In [None]:
# Default locations: files sitting in the current working directory.
# These defaults are assigned BEFORE the optional blocks below so that
# uncommenting "Option 2" really overrides them (previously the defaults were
# assigned afterwards and silently clobbered the Drive paths).
raw_path = "raw.csv"
scored_path = "scored.csv"
top_path = "top_anomalies.csv"
graph_path = "graph_full.graphml"

# Option 1: manual upload (uncomment)
# from google.colab import files
# uploaded = files.upload()

# Option 2: from Drive (uncomment and adapt)
# from google.colab import drive
# drive.mount("/content/drive")
# base_dir = "/content/drive/MyDrive/ton_dossier"
# raw_path = os.path.join(base_dir, "raw.csv")
# scored_path = os.path.join(base_dir, "scored.csv")
# top_path = os.path.join(base_dir, "top_anomalies.csv")
# graph_path = os.path.join(base_dir, "graph_full.graphml")

# Everything starts as None; later cells test for None to skip their section.
df_raw = None
df_scored = None
df_top = None
graph = None

# Top anomalies: the only table with a built-in fallback.
if os.path.exists(top_path):
    df_top = pd.read_csv(top_path)
    print("OK top_anomalies.csv charge:", df_top.shape)
else:
    # Quick demo mode: two hard-coded example rows (to be replaced by real data).
    top_data = [
        {"source_id": "10000000922", "ra": 266.4289451722255, "dec": -29.09800019698821, "anomaly_score": 0.7067},
        {"source_id": "10000000570", "ra": 266.115474548117,   "dec": -29.02645189019288, "anomaly_score": 0.6745},
    ]
    df_top = pd.DataFrame(top_data)
    print("ATTENTION: top_anomalies.csv absent, utilisation df_top sample")

# Scored table (optional).
if os.path.exists(scored_path):
    df_scored = pd.read_csv(scored_path)
    print("OK scored.csv charge:", df_scored.shape)
else:
    print("scored.csv absent")

# Raw table (optional).
if os.path.exists(raw_path):
    df_raw = pd.read_csv(raw_path)
    print("OK raw.csv charge:", df_raw.shape)
else:
    print("raw.csv absent")

# Graph (optional): an unreadable file is reported and treated as absent.
if os.path.exists(graph_path):
    try:
        graph = nx.read_graphml(graph_path)
        print("OK graph_full.graphml charge:", graph.number_of_nodes(), "nodes,", graph.number_of_edges(), "edges")
    except Exception as e:
        print("graph_full.graphml present mais lecture impossible:", repr(e))
        graph = None
else:
    print("graph_full.graphml absent")

display(df_top.head(10))


## Cross-match SIMBAD sur top anomalies

In [None]:
# Extra SIMBAD fields requested on top of the default ones.
Simbad.add_votable_fields("otype", "main_id", "sp_type", "flux(B)", "flux(V)")


def _simbad_column(table, *names):
    """Return the first column of `table` named like one of `names` (case-insensitive), else None."""
    by_lower = {colname.lower(): colname for colname in table.colnames}
    for name in names:
        found = by_lower.get(name.lower())
        if found is not None:
            return table[found]
    return None


def _simbad_skycoord(table):
    """Build a SkyCoord from the RA/Dec columns of a SIMBAD result table.

    Raises KeyError when no RA/Dec column can be found.
    """
    ra_col = _simbad_column(table, "ra")
    dec_col = _simbad_column(table, "dec")
    if ra_col is None or dec_col is None:
        raise KeyError(f"colonnes RA/DEC introuvables dans la reponse SIMBAD: {table.colnames}")
    # Text-typed RA is parsed as sexagesimal hours (what this cell always assumed).
    # NOTE(review): numeric RA is taken to be decimal degrees, as recent
    # astroquery releases are documented to return — confirm for the installed version.
    ra_unit = u.hourangle if ra_col.dtype.kind in "OSU" else u.deg
    return SkyCoord(ra_col, dec_col, unit=(ra_unit, u.deg))


def _simbad_cell(column, idx, default):
    """Return `column[idx]` as a string, or `default` when the column is absent."""
    return default if column is None else str(column[idx])


print("Cross-match SIMBAD (rayon 10 arcsec)...")
matches = []

# Make sure ra/dec are available before querying.
if "ra" not in df_top.columns or "dec" not in df_top.columns:
    raise ValueError("df_top doit contenir les colonnes ra et dec (en degres).")

# Read ids and scores column-wise: DataFrame.iterrows() coerces each row to a
# single dtype, which turns integer source_ids into floats ("123.0") and loses
# precision on long ids.
n_rows = len(df_top)
if "source_id" in df_top.columns:
    source_ids = df_top["source_id"].astype(str).tolist()
else:
    source_ids = [""] * n_rows
if "anomaly_score" in df_top.columns:
    scores = df_top["anomaly_score"].astype(float).tolist()
else:
    scores = [float("nan")] * n_rows

for source_id, score, ra_deg, dec_deg in zip(source_ids, scores, df_top["ra"], df_top["dec"]):
    record = {"source_id": source_id, "score": score}
    coord = SkyCoord(ra=float(ra_deg) * u.deg, dec=float(dec_deg) * u.deg)
    try:
        res = Simbad.query_region(coord, radius=10 * u.arcsec)
        if res is not None and len(res) > 0:
            # Keep the SIMBAD object closest to the queried position.
            dists = coord.separation(_simbad_skycoord(res)).arcsec
            idx = int(np.argmin(dists))
            record.update(
                {
                    "simbad_id": _simbad_cell(_simbad_column(res, "main_id"), idx, ""),
                    # The column name depends on the astroquery version; the
                    # previous hard-coded "OTYPE(V)" key did not match the
                    # requested "otype" field.
                    "otype": _simbad_cell(_simbad_column(res, "otype", "OTYPE(V)", "OTYPE_V"), idx, "-"),
                    "sp_type": _simbad_cell(_simbad_column(res, "sp_type"), idx, "-"),
                    "dist_arcsec": float(np.round(dists[idx], 2)),
                    "nb_matches": int(len(res)),
                }
            )
        else:
            record.update(
                {
                    "simbad_id": "",
                    "otype": "No match",
                    "sp_type": "",
                    "dist_arcsec": np.nan,
                    "nb_matches": 0,
                }
            )
    except Exception as e:
        # Best effort: a failed query is recorded in an "error" column instead
        # of aborting the whole cross-match.
        record = {"source_id": source_id, "score": score, "error": repr(e)}
    matches.append(record)

df_matches = pd.DataFrame(matches)
df_matches = df_matches.sort_values("score", ascending=False, na_position="last")
display(df_matches)

df_matches.to_csv("outputs/simbad_crossmatch.csv", index=False)
print("Ecrit: outputs/simbad_crossmatch.csv")


## Visualisations statiques et interactives (Plotly)

In [None]:
def save_fig(fig, name: str, png_dir="screenshots", html_dir="outputs", scale=3):
    """Export `fig` as `<html_dir>/<name>.html` and `<png_dir>/<name>.png`.

    Both target directories are created when missing. Each export is best
    effort: a failure is printed and does not stop the other export.

    Returns:
        (png_path, html_path), returned whether or not the files were written.
    """
    for target_dir in (png_dir, html_dir):
        os.makedirs(target_dir, exist_ok=True)

    html_path = os.path.join(html_dir, f"{name}.html")
    png_path = os.path.join(png_dir, f"{name}.png")

    exporters = (
        # Interactive HTML
        ("write_html fail", lambda: fig.write_html(html_path, include_plotlyjs="cdn")),
        # Static PNG, upscaled by `scale`
        ("write_image fail", lambda: fig.write_image(png_path, scale=scale)),
    )
    for failure_label, export in exporters:
        try:
            export()
        except Exception as exc:
            print(failure_label, name, ":", repr(exc))

    return png_path, html_path


# 1) Histogram of anomaly scores (needs scored.csv with an anomaly_score column)
if df_scored is None or "anomaly_score" not in df_scored.columns:
    print("Pas de scored.csv ou colonne anomaly_score absente: skip histo.")
else:
    fig_histo = px.histogram(df_scored, x="anomaly_score", nbins=50, title="Distribution des scores d'anomalie")
    fig_histo.update_layout(bargap=0.1)
    save_fig(fig_histo, "histo_scores")
    fig_histo.show()

# 2) Bar chart of the top anomalies, highest score first
if df_top is None or "anomaly_score" not in df_top.columns:
    print("df_top/anomaly_score absent: skip top bar.")
else:
    df_top_sorted = df_top.sort_values("anomaly_score", ascending=False).copy()
    # Without a source_id column, label the bars by rank position.
    if "source_id" not in df_top_sorted.columns:
        df_top_sorted["source_id"] = list(map(str, range(len(df_top_sorted))))
    # Every column except the score itself goes into the hover box.
    hover_cols = [col for col in df_top_sorted.columns if col != "anomaly_score"]
    fig_top = px.bar(
        df_top_sorted,
        x="source_id",
        y="anomaly_score",
        title="Top anomalies, score par source_id",
        hover_data=hover_cols,
    )
    fig_top.update_layout(xaxis_tickangle=-45)
    save_fig(fig_top, "top_anomalies_bar")
    fig_top.show()

# 3) Interactive "Hidden Constellations" scatter in the RA/Dec plane
if df_top is None or not {"ra", "dec"}.issubset(df_top.columns):
    print("df_top doit contenir ra/dec: skip hidden constellations.")
else:
    # The score drives both marker colour and size when it is available.
    score_col = "anomaly_score" if "anomaly_score" in df_top.columns else None
    fig_const = px.scatter(
        df_top,
        x="ra",
        y="dec",
        color=score_col,
        size=score_col,
        hover_data=list(df_top.columns),
        title="Hidden Constellations (interactive)",
    )
    fig_const.update_traces(marker=dict(line=dict(width=1)))
    save_fig(fig_const, "hidden_constellations")
    fig_const.show()

# 4) UMAP and t-SNE embeddings (only when df_scored is available)
if df_scored is None:
    print("Pas de scored.csv: skip embeddings.")
else:
    # Simple feature selection: keep whichever candidate columns exist.
    candidate_cols = ["ra", "dec", "parallax", "pmra", "pmdec", "phot_g_mean_mag", "distance"]
    cols = [c for c in candidate_cols if c in df_scored.columns]
    if len(cols) >= 2 and "anomaly_score" in df_scored.columns:
        # Missing feature values are replaced by 0.0 before embedding.
        X = df_scored[cols].fillna(0.0).to_numpy(dtype=float)
        y = df_scored["anomaly_score"].to_numpy(dtype=float)

        # UMAP
        reducer = umap.UMAP(n_components=2, random_state=42)
        emb_umap = reducer.fit_transform(X)
        df_umap = pd.DataFrame(emb_umap, columns=["dim1", "dim2"])
        df_umap["score"] = y
        fig_umap = px.scatter(
            df_umap,
            x="dim1",
            y="dim2",
            color="score",
            title="Cosmic cloud embedding (UMAP)",
        )
        save_fig(fig_umap, "umap_cosmic_cloud")
        fig_umap.show()

        # t-SNE (can be slow for large N): only the first 2000 rows are used.
        n_tsne = min(len(X), 2000)
        if n_tsne < 3:
            print(f"t-SNE: trop peu d'echantillons (n={n_tsne}): skip.")
        else:
            X_small = X[:n_tsne]
            y_small = y[:n_tsne]
            # scikit-learn requires perplexity < n_samples; the default (30)
            # raises ValueError on small tables, so cap it.
            perplexity = min(30.0, float(n_tsne - 1))
            tsne = TSNE(
                n_components=2,
                random_state=42,
                init="pca",
                learning_rate="auto",
                perplexity=perplexity,
            )
            emb_tsne = tsne.fit_transform(X_small)
            df_tsne = pd.DataFrame(emb_tsne, columns=["dim1", "dim2"])
            df_tsne["score"] = y_small
            fig_tsne = px.scatter(
                df_tsne,
                x="dim1",
                y="dim2",
                color="score",
                title=f"Embedding t-SNE (n={n_tsne})",
            )
            save_fig(fig_tsne, "tsne_embedding")
            fig_tsne.show()
    else:
        print("Colonnes features insuffisantes ou anomaly_score absent: skip embeddings.")


## Graphe: degree distribution + preview interactif (si graph_full.graphml present)

In [None]:
if graph is not None:
    # Histogram of node degrees.
    degrees = [deg for _node, deg in graph.degree()]
    df_deg = pd.DataFrame({"degree": degrees})
    fig_deg = px.histogram(df_deg, x="degree", nbins=60, title="Degree distribution (graph_full)")
    save_fig(fig_deg, "graph_degree_distribution")
    fig_deg.show()

    # Degree vs score scatter, only when scored rows can be matched to graph nodes.
    if df_scored is not None and {"source_id", "anomaly_score"}.issubset(df_scored.columns):
        # Node ids and source_ids are both compared as strings.
        node_set = {str(node) for node in graph.nodes()}
        tmp = df_scored.copy()
        tmp["node_key"] = tmp["source_id"].astype(str)
        tmp = tmp[tmp["node_key"].isin(node_set)]
        if len(tmp) == 0:
            print("Aucun alignement node<->source_id trouve: skip degree_vs_score.")
        else:
            deg_map = {str(node): int(deg) for node, deg in graph.degree()}
            tmp["degree"] = tmp["node_key"].map(deg_map).fillna(0).astype(int)
            fig_ds = px.scatter(
                tmp,
                x="degree",
                y="anomaly_score",
                title="Degree vs anomaly_score",
                hover_data=["source_id"],
            )
            save_fig(fig_ds, "degree_vs_score")
            fig_ds.show()
else:
    print("Pas de graphe charge: skip graph plots.")


## Optionnel: smoke-test torch-geometric (si installe et si graph + df_scored dispo)

In [None]:
if not HAS_PYG:
    print("torch-geometric non disponible: skip.")
elif graph is None or df_scored is None:
    print("graph ou df_scored manquant: skip.")
else:
    # Build a minimal PyG Data object: edge_index + node features.
    # Node order is the iteration order of graph.nodes(), ids compared as strings.
    nodes = [str(n) for n in graph.nodes()]
    node_index = {n: i for i, n in enumerate(nodes)}

    # Each edge is stored in both directions.
    # The loop variables are deliberately NOT named `u`/`v`: at notebook top
    # level `u` would rebind the global `astropy.units` alias and break any
    # later re-run of the SIMBAD cell.
    edges = []
    for src, dst in graph.edges():
        src_idx = node_index.get(str(src))
        dst_idx = node_index.get(str(dst))
        if src_idx is not None and dst_idx is not None:
            edges.append((src_idx, dst_idx))
            edges.append((dst_idx, src_idx))
    if len(edges) == 0:
        print("Aucune edge exploitable: skip.")
    else:
        edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

        # Features: simple column selection from df_scored, aligned on source_id.
        candidate_cols = ["ra", "dec", "parallax", "pmra", "pmdec", "phot_g_mean_mag", "distance"]
        cols = [c for c in candidate_cols if c in df_scored.columns]
        if "source_id" not in df_scored.columns or len(cols) == 0:
            print("Pas de source_id ou features: skip.")
        else:
            # Build the id -> feature-row map column-wise. DataFrame.iterrows()
            # coerces every row to one dtype, so an integer source_id next to
            # float features became "123.0" and never matched a node id.
            feature_ids = df_scored["source_id"].astype(str)
            feature_rows = df_scored[cols].fillna(0.0).to_numpy(dtype=np.float32)
            feat_map = dict(zip(feature_ids, feature_rows))

            # Nodes without a matching row keep an all-zero feature vector.
            X = np.zeros((len(nodes), len(cols)), dtype=np.float32)
            missing = 0
            for node_id, row_idx in node_index.items():
                node_features = feat_map.get(node_id)
                if node_features is None:
                    missing += 1
                else:
                    X[row_idx] = node_features
            print("Features shape:", X.shape, "missing:", missing)

            data = Data(x=torch.tensor(X), edge_index=edge_index)

            # Single GAT forward pass (not a full training run).
            conv = GATConv(in_channels=data.num_features, out_channels=32, heads=2, concat=True)
            with torch.no_grad():
                out = conv(data.x, data.edge_index)
            print("GAT output:", out.shape)

            # 2-D UMAP of the GAT output, limited to the first 3000 nodes.
            emb = out.cpu().numpy()
            n_plot = min(len(emb), 3000)
            reducer = umap.UMAP(n_components=2, random_state=42)
            emb2 = reducer.fit_transform(emb[:n_plot])
            df_gat = pd.DataFrame(emb2, columns=["dim1", "dim2"])
            fig_gat = px.scatter(df_gat, x="dim1", y="dim2", title=f"GAT embedding UMAP (n={n_plot})")
            save_fig(fig_gat, "gat_umap_embedding")
            fig_gat.show()


## Export final

In [None]:
print("Export ZIP de tous les outputs...")
zip_name = "outputs_all.zip"
# Start from a clean archive.
if os.path.exists(zip_name):
    os.remove(zip_name)

import zipfile

export_dirs = ["outputs", "screenshots", "screenshots_interactive"]
with zipfile.ZipFile(zip_name, "w", compression=zipfile.ZIP_DEFLATED) as archive:
    # Folders that do not exist are skipped.
    for folder in filter(os.path.isdir, export_dirs):
        for root, _dirs, filenames in os.walk(folder):
            for filename in filenames:
                member_path = os.path.join(root, filename)
                # Members keep their relative path inside the archive.
                archive.write(member_path, arcname=member_path)

print("OK:", zip_name)

# Colab download (uncomment)
# from google.colab import files
# files.download(zip_name)
