In [0]:
%python
%pip install sgp4

In [0]:
# cluster_cdm_by_orbit.py
# pip install pandas numpy matplotlib seaborn scikit-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from math import pi
from pyspark.sql import functions as F


# --------------------------
# Config
# --------------------------
# INPUT_FILE = "/mnt/data/CDM_data_profile.xlsx"
# SHEET_NAME = "Sheet1"
# Number of clusters requested from KMeans; also the number of colours in the scatter palette.
N_CLUSTERS = 6
# Passed to KMeans as random_state to make the clustering reproducible.
RANDOM_SEED = 42
# Width of each altitude band fed to pd.cut for the heatmap
# (same unit as the altitude column; presumably km -- confirm).
ALT_BIN_SIZE = 50  # for optional altitude binning visualization



# --------------------------
# Helpers to detect columns
# --------------------------
def pick(col_names, choices):
    """Return the column from *col_names* that best matches *choices*.

    Candidates are tried in the order given. For each candidate an exact
    match wins; failing that, the first column that equals it ignoring
    case is returned. ``None`` means no candidate matched any column.
    """
    for wanted in choices:
        if wanted in col_names:
            return wanted
        # No exact hit: fall back to a case-insensitive scan for this candidate
        # before moving on to the next, so earlier candidates keep priority.
        loose_match = next(
            (name for name in col_names if name.lower() == wanted.lower()),
            None,
        )
        if loose_match is not None:
            return loose_match
    return None

# --------------------------
# Load data (CDM joined with TLE rows)
# --------------------------
# Read directly from Delta
# Pull the joined CDM/TLE table out of Delta, then materialise it as pandas
# because the sklearn/seaborn steps further down work on pandas objects.
df_spark = spark.read.table("workspace.default.cdms_data_profile_v1")
df = df_spark.toPandas()
cols = list(df.columns)

# Accepted spellings for every logical column, in priority order.
column_candidates = {
    "norad": ["NORAD_CAT_ID_p", "NORAD_CAT_ID", "NORAD", "NORAD_CAT_ID_A", "NORAD_CAT_ID_B", "OBJECT_ID"],
    "alt": ["ALTITUDE_KM", "altitude_km", "altitude", "ALTITUDE", "alt_km", "altitude_km"],
    "inc": ["INCLINATION", "inclination", "inclination_deg", "inclination_p", "i"],
    "mm": ["MEAN_MOTION", "mean_motion", "MEAN_MOTION_p", "mean_motion_p"],
    "sat1": ["SAT_1_ID", "SAT1", "SAT_1", "OBJECT1_ID", "OBJECT_A", "NORAD_A", "NORAD_CAT_ID_A"],
    "sat2": ["SAT_2_ID", "SAT2", "SAT_2", "OBJECT2_ID", "OBJECT_B", "NORAD_B", "NORAD_CAT_ID_B"],
    "ecc": ["ECCENTRICITY", "eccentricity", "eccentricity_p", "e"],
    "tle1": ["TLE_LINE1", "line1", "TLE1", "line_1"],
    "tle2": ["TLE_LINE2", "line2", "TLE2", "line_2"],
}
detected = {role: pick(cols, names) for role, names in column_candidates.items()}

# Each col_* is the actual column name found in df, or None when absent.
col_norad = detected["norad"]
col_alt = detected["alt"]
col_inc = detected["inc"]
col_mm = detected["mm"]
col_sat1 = detected["sat1"]
col_sat2 = detected["sat2"]
col_ecc = detected["ecc"]
col_tle1 = detected["tle1"]
col_tle2 = detected["tle2"]

# Report what was resolved so a wrong guess is visible in the notebook output.
print("Detected columns:")
for label, found in (
    ("  NORAD:", col_norad),
    ("  ALT:", col_alt),
    ("  INC:", col_inc),
    ("  MEAN_MOTION:", col_mm),
    ("  SAT1:", col_sat1),
    ("  SAT2:", col_sat2),
    ("  ECC:", col_ecc),
):
    print(label, found)
print("  TLE1/TLE2:", col_tle1, col_tle2)

# --------------------------
# Build a per-object table (one row per NORAD)
# --------------------------
# Some rows in df represent conjunction events between two NORADs.
# We collect unique NORAD entries and keep the first available orbital parameters.
# We'll search both SAT1 and SAT2 columns and also a single NORAD column if present.

# gather candidate NORAD ids
# Decide which column(s) carry object ids: the SAT1/SAT2 pair when both were
# detected, otherwise the single NORAD column, otherwise a literal OBJECT_ID.
if col_sat1 and col_sat2:
    id_source_columns = [col_sat1, col_sat2]
elif col_norad:
    id_source_columns = [col_norad]
elif "OBJECT_ID" in cols:
    id_source_columns = ["OBJECT_ID"]
else:
    id_source_columns = []

# Unique ids as strings, sorted; entries whose text is "nan" are discarded.
norad_ids = sorted({
    ident
    for column in id_source_columns
    for ident in df[column].dropna().astype(str)
    if str(ident).strip() != "nan"
})
print(f"Found {len(norad_ids)} unique NORAD ids in CDM/TLE file sample.")

# For each NORAD, try to get altitude and inclination from any row where it appears
def _as_float(value):
    """Return *value* as a float, or NaN when it cannot be read as a number."""
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return np.nan


def _altitude_from_mean_motion(rev_per_day):
    """Estimate altitude in km from mean motion in rev/day (Kepler's third law).

    Returns NaN when the mean motion is missing, non-finite or not positive,
    since no physical orbit corresponds to such a value.
    """
    if not np.isfinite(rev_per_day) or rev_per_day <= 0:
        return np.nan
    try:
        n_rad_s = rev_per_day * 2 * pi / 86400.0  # rev/day -> rad/s
        # a = (mu / n^2)^(1/3), with Earth's mu in km^3/s^2.
        semi_major_axis_km = (398600.4418 / (n_rad_s ** 2)) ** (1.0 / 3.0)
    except ArithmeticError:
        # Extreme magnitudes can still overflow or underflow to a zero divisor.
        return np.nan
    return semi_major_axis_km - 6378.137  # subtract Earth's equatorial radius (km)


# Same id-column choice as used to collect norad_ids: the SAT1/SAT2 pair,
# else the single NORAD column, else a literal OBJECT_ID column.
if col_sat1 and col_sat2:
    object_id_columns = [col_sat1, col_sat2]
elif col_norad:
    object_id_columns = [col_norad]
elif "OBJECT_ID" in df.columns:
    object_id_columns = ["OBJECT_ID"]
else:
    object_id_columns = []

# Index row positions by stringified id in one pass, instead of re-casting and
# re-scanning the id columns for every NORAD id.
row_positions_by_id = {}
for id_column in object_id_columns:
    for position, ident in enumerate(df[id_column].astype(str)):
        row_positions_by_id.setdefault(ident, set()).add(position)

# NOTE(review): the orbital columns of a row are attributed to whichever object
# id matched, on either side of the conjunction -- confirm those columns really
# describe both objects and not only the primary one.
objects = []
for norad in norad_ids:
    # Rows mentioning this object, kept in original table order.
    rows = df.iloc[sorted(row_positions_by_id.get(str(norad), ()))]

    alt = inc = ecc = mm = np.nan
    tle1 = tle2 = None
    for _, r in rows.iterrows():
        # Each field takes the first usable value; an unparseable value leaves
        # the field at NaN so that a later row can still fill it.
        if pd.isna(alt) and col_alt and pd.notna(r.get(col_alt)):
            alt = _as_float(r[col_alt])
        if pd.isna(inc) and col_inc and pd.notna(r.get(col_inc)):
            inc = _as_float(r[col_inc])
        if pd.isna(ecc) and col_ecc and pd.notna(r.get(col_ecc)):
            ecc = _as_float(r[col_ecc])
        if pd.isna(mm) and col_mm and pd.notna(r.get(col_mm)):
            mm = _as_float(r[col_mm])
        if (not tle1) and col_tle1 and pd.notna(r.get(col_tle1)):
            tle1 = r[col_tle1]
        if (not tle2) and col_tle2 and pd.notna(r.get(col_tle2)):
            tle2 = r[col_tle2]
        # Altitude and inclination are the clustering features; stop once both are known.
        if not (np.isnan(alt) or np.isnan(inc)):
            break

    # No altitude column value: derive one from mean motion when that is usable.
    if np.isnan(alt):
        alt = _altitude_from_mean_motion(mm)

    objects.append({
        "NORAD": str(norad),
        "altitude_km": alt,
        "inclination_deg": inc,
        "eccentricity": ecc,
        "mean_motion": mm,
        "TLE_LINE1": tle1,
        "TLE_LINE2": tle2,
    })

# Explicit columns keep the frame well-formed (and dropna below valid) even
# when no NORAD ids were found.
objects_df = pd.DataFrame(
    objects,
    columns=[
        "NORAD",
        "altitude_km",
        "inclination_deg",
        "eccentricity",
        "mean_motion",
        "TLE_LINE1",
        "TLE_LINE2",
    ],
)

# drop rows missing altitude or inclination
objects_df_clean = objects_df.dropna(subset=["altitude_km", "inclination_deg"]).reset_index(drop=True)
print("Objects with both altitude & inclination:", len(objects_df_clean))

# --------------------------
# Compute CDM frequency per NORAD (how many CDM events mention this object)
# --------------------------
# If SAT1/SAT2 columns available, count occurrences
if col_sat1 and col_sat2:
    # An object may sit on either side of a conjunction, so add both tallies;
    # fill_value=0 covers ids that only ever appear on one side.
    primary_mentions = df[col_sat1].astype(str).value_counts()
    secondary_mentions = df[col_sat2].astype(str).value_counts()
    counts = primary_mentions.add(secondary_mentions, fill_value=0)
elif col_norad:
    counts = df[col_norad].astype(str).value_counts()
else:
    # No id column to count from: every object gets zero events.
    counts = pd.Series(0, index=objects_df_clean["NORAD"].astype(str))


def _mentions_of(norad_id):
    """Return how many CDM rows mention *norad_id*, or 0 if it never appears."""
    return int(counts.get(str(norad_id), 0))


# Attach the per-object event count to the cleaned object table.
objects_df_clean["cdm_event_count"] = objects_df_clean["NORAD"].map(_mentions_of)

# --------------------------
# Feature matrix and clustering
# --------------------------
X = objects_df_clean[["altitude_km", "inclination_deg"]].to_numpy()
if len(X) == 0:
    # Fail with an actionable message instead of an opaque error from numpy/sklearn.
    raise ValueError(
        "No objects with both altitude and inclination are available; nothing to cluster."
    )

# KMeans cannot fit more clusters than there are samples, so cap the request.
n_clusters = min(N_CLUSTERS, len(X))
if n_clusters < N_CLUSTERS:
    print(f"Only {len(X)} objects available; using {n_clusters} clusters instead of {N_CLUSTERS}.")

# Standardise both features so altitude (large numeric range) does not
# dominate inclination in the Euclidean distance KMeans uses.
X_mean = X.mean(axis=0)
X_std = X.std(axis=0) + 1e-9  # epsilon avoids division by zero for a constant feature
Xz = (X - X_mean) / X_std

kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_SEED)
labels = kmeans.fit_predict(Xz)
objects_df_clean["cluster"] = labels
centers = (kmeans.cluster_centers_ * X_std) + X_mean  # back to original scale

# Per-cluster summary, busiest cluster (most CDM events) first.
cluster_stats = objects_df_clean.groupby("cluster").agg(
    n_objects=("NORAD", "count"),
    total_cdm_events=("cdm_event_count", "sum"),
    mean_alt=("altitude_km", "mean"),
    mean_inc=("inclination_deg", "mean"),
).reset_index().sort_values("total_cdm_events", ascending=False)

print("\nCluster summary (sorted by total CDM events):")
print(cluster_stats)

# --------------------------
# Plots
# --------------------------
sns.set_style("whitegrid")

# Scatter of altitude against inclination: one colour per cluster, marker
# area growing with the number of CDM events that mention the object.
scatter_fig, scatter_ax = plt.subplots(figsize=(12,7))
palette = sns.color_palette("tab10", N_CLUSTERS)
for cluster_id, colour in enumerate(palette):
    members = objects_df_clean[objects_df_clean["cluster"] == cluster_id]
    scatter_ax.scatter(
        members["altitude_km"],
        members["inclination_deg"],
        s=20 + (members["cdm_event_count"] * 2),  # size scaled by events
        c=np.array([colour]),
        label=f"cluster {cluster_id} (n={len(members)})",
        alpha=0.8,
        edgecolors="k",
    )

# Overlay each cluster centre (already converted back to original units).
for centre_no, (centre_alt, centre_inc) in enumerate(centers[:, :2]):
    scatter_ax.scatter(centre_alt, centre_inc, marker="X", s=150, c="black")
    scatter_ax.text(
        centre_alt,
        centre_inc,
        f"C{centre_no}",
        fontsize=9,
        fontweight="bold",
        ha="center",
        va="center",
        color="white",
    )

scatter_ax.set_xlabel("Altitude (km)")
scatter_ax.set_ylabel("Inclination (deg)")
scatter_ax.set_title("Clusters by Altitude & Inclination (size ~ CDM event count)")
# Legend goes outside the axes so it does not cover data points.
scatter_ax.legend(bbox_to_anchor=(1.05,1), loc="upper left")
scatter_fig.tight_layout()
scatter_fig.savefig("clusters_alt_inc_scatter.png")
plt.show()

# 2) Bar: total CDM events per cluster
# Bar chart of the summed CDM event count for each cluster.
bar_fig, bar_ax = plt.subplots(figsize=(8,5))
sns.barplot(data=cluster_stats, x="cluster", y="total_cdm_events", palette="tab10", ax=bar_ax)
bar_ax.set_xlabel("Cluster")
bar_ax.set_ylabel("Total CDM events (cluster)")
bar_ax.set_title("Total CDM events per cluster")
bar_fig.tight_layout()
bar_fig.savefig("cluster_total_cdm_events.png")
plt.show()

# 3) Table: top NORADs by CDM count within top clusters
# cluster_stats is ordered by total CDM events (descending), so its first
# three rows identify the busiest clusters.
top_clusters = cluster_stats["cluster"].iloc[:3].tolist()
in_top_cluster = objects_df_clean["cluster"].isin(top_clusters)
ranked_objects = objects_df_clean[in_top_cluster].sort_values("cdm_event_count", ascending=False)
# Keep the 50 most frequently mentioned objects from those clusters.
top_list = ranked_objects.head(50)
top_list.to_csv("top_objects_by_cluster_and_cdm.csv", index=False)
print("\nSaved top objects CSV: top_objects_by_cluster_and_cdm.csv")

# 4) Heatmap: cluster vs altitude band counts (optional)
# Bucket objects into fixed-width altitude bands from 0 up to just past the
# highest altitude present. pd.cut intervals are right-closed, so any altitude
# <= 0 gets no band (NaN).
altitude_edges = np.arange(0, np.max(objects_df_clean["altitude_km"]) + ALT_BIN_SIZE, ALT_BIN_SIZE)
objects_df_clean["alt_band"] = pd.cut(objects_df_clean["altitude_km"], bins=altitude_edges)

# Count objects per (altitude band, cluster). The index is categorical, so
# `observed` is passed explicitly: relying on its default is deprecated in
# pandas and would make the table depend on the installed pandas version.
heat = objects_df_clean.pivot_table(
    index="alt_band",
    columns="cluster",
    values="NORAD",
    aggfunc="count",
    fill_value=0,
    observed=False,
)
plt.figure(figsize=(10,6))
sns.heatmap(heat, annot=True, fmt="d", cmap="Blues")
plt.title("Object counts per altitude band vs cluster")
plt.tight_layout()
plt.savefig("heat_altband_cluster.png")
plt.show()

# --------------------------
# Save outputs
# --------------------------
# Persist the per-object table and the per-cluster summary as CSV, without the index.
for frame, destination in (
    (objects_df_clean, "objects_clustered_with_cdm_counts.csv"),
    (cluster_stats, "cluster_stats_summary.csv"),
):
    frame.to_csv(destination, index=False)
print("\nSaved outputs: objects_clustered_with_cdm_counts.csv, cluster_stats_summary.csv")
