In [3]:
# === NASA SMAP Anomaly Detection (Colab + Context-Aware Visuals + Drive Integration) ===

!pip install kagglehub tensorflow matplotlib scikit-learn --quiet

# --- Mount Drive ---
from google.colab import drive
drive.mount('/content/drive')

# --- Imports ---
import json
import ast
from pathlib import Path
from typing import Any, Dict, List, Tuple
import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping

# --- Paths ---
BASE_DIR = Path("/content/drive/MyDrive/General Satellite Anomaly Prediction")
RESULTS_DIR = BASE_DIR / "results"
MODEL_PATH = BASE_DIR / "smap_anomaly_detector.h5"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# --- Config ---
np.random.seed(42)
tf.random.set_seed(42)

WINDOW_SIZE = 72
STEP_SIZE = 4
EPOCHS = 40      # can go longer if GPU available
BATCH_SIZE = 64
VALIDATION_SPLIT = 0.1

plt.style.use("dark_background")

# --- Download NASA SMAP Data ---
print("[INFO] Downloading NASA SMAP dataset via KaggleHub...")
path = Path(kagglehub.dataset_download("patrickfleith/nasa-anomaly-detection-dataset-smap-msl"))
print("✅ Dataset path:", path)

# --- Helpers ---
def _parse_anomaly_sequences(sequence_str: Any, series_length: int):
    if isinstance(sequence_str, float) and np.isnan(sequence_str):
        return []
    try:
        raw = ast.literal_eval(str(sequence_str))
    except Exception:
        return []
    seqs = []
    for pair in raw:
        if isinstance(pair, (list, tuple)) and len(pair) == 2:
            s, e = max(0, int(pair[0])), min(series_length - 1, int(pair[1]))
            if e >= s:
                seqs.append((s, e))
    return seqs

def _interpolate_array(values: np.ndarray) -> np.ndarray:
    df = pd.DataFrame(values)
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.interpolate(limit_direction="both").fillna(0)
    return df.to_numpy(np.float32)

# --- Load Only SMAP Channels ---
labels = pd.read_csv(path / "labeled_anomalies.csv")
labels["spacecraft"] = labels["spacecraft"].str.upper()
smap = labels[labels["spacecraft"] == "SMAP"]

train_dir = path / "data" / "data" / "train"
test_dir = path / "data" / "data" / "test"

train_data, test_data, anomaly_sequences = {}, {}, {}

for _, row in smap.iterrows():
    ch = str(row["chan_id"])
    train_file = train_dir / f"{ch}.npy"
    test_file = test_dir / f"{ch}.npy"
    if train_file.exists() and test_file.exists():
        train_data[ch] = np.load(train_file)
        test_data[ch] = np.load(test_file)
        anomaly_sequences[ch] = _parse_anomaly_sequences(row["anomaly_sequences"], len(test_data[ch]))

print(f"[INFO] Loaded {len(train_data)} SMAP channels.")

# --- Preprocessing ---
train_clean = {k: _interpolate_array(v) for k, v in train_data.items()}
test_clean = {k: _interpolate_array(v) for k, v in test_data.items()}

scaler = StandardScaler()
scaler.fit(np.vstack(list(train_clean.values())))
train_scaled = {k: scaler.transform(v) for k, v in train_clean.items()}
test_scaled = {k: scaler.transform(v) for k, v in test_clean.items()}

def create_windows(data, labels, window_size, step_size):
    seqs, labs = [], []
    for start in range(0, len(data) - window_size, step_size):
        end = start + window_size
        seqs.append(data[start:end])
        labs.append(1 if labels[start:end].max() > 0 else 0)
    return np.array(seqs), np.array(labs)

train_seq, test_seq, test_labels, channel_boundaries = [], [], [], []
offset = 0

for ch, x_train in train_scaled.items():
    y_test = np.zeros(len(test_scaled[ch]))
    for s, e in anomaly_sequences.get(ch, []):
        y_test[s:e] = 1
    t_seq, _ = create_windows(x_train, np.zeros(len(x_train)), WINDOW_SIZE, STEP_SIZE)
    te_seq, te_lab = create_windows(test_scaled[ch], y_test, WINDOW_SIZE, STEP_SIZE)
    train_seq.append(t_seq)
    test_seq.append(te_seq)
    test_labels.append(te_lab)
    channel_boundaries.append((ch, offset, offset + len(te_seq)))
    offset += len(te_seq)

train_seq = np.concatenate(train_seq)
test_seq = np.concatenate(test_seq)
test_labels = np.concatenate(test_labels)

print(f"[INFO] Train shape: {train_seq.shape}, Test shape: {test_seq.shape}")

# --- Model ---
def build_model(input_shape):
    i = Input(shape=input_shape)
    x = LSTM(64, return_sequences=True)(i)
    x = LSTM(32, return_sequences=False)(x)
    x = RepeatVector(input_shape[0])(x)
    x = LSTM(32, return_sequences=True)(x)
    o = TimeDistributed(Dense(input_shape[1]))(x)
    model = Model(i, o)
    model.compile(optimizer="adam", loss="mse")
    return model

model = build_model((train_seq.shape[1], train_seq.shape[2]))
model.summary()

cb = [EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)]
history = model.fit(
    train_seq, train_seq,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    verbose=1,
    callbacks=cb
)

# --- Inference + Metrics ---
train_recon = model.predict(train_seq, verbose=0)
test_recon = model.predict(test_seq, verbose=0)
train_err = np.mean(np.square(train_seq - train_recon), axis=(1,2))
test_err = np.mean(np.square(test_seq - test_recon), axis=(1,2))

thr = train_err.mean() + 3*train_err.std()
pred_labels = (test_err > thr).astype(int)

prec, rec, f1, _ = precision_recall_fscore_support(test_labels, pred_labels, average="binary", zero_division=0)
roc = roc_auc_score(test_labels, test_err)

print(f"[RESULT] Threshold: {thr:.4f}")
print(f"[RESULT] Precision: {prec:.3f}, Recall: {rec:.3f}, F1: {f1:.3f}, ROC-AUC: {roc:.3f}")

# --- Save Model ---
model.save(MODEL_PATH)
print(f"[INFO] Saved model weights to: {MODEL_PATH}")

# --- Visualization Section ---
plt.figure(figsize=(8,5))
plt.plot(history.history["loss"], label="Train", color="#00FFFF")
plt.plot(history.history["val_loss"], label="Val", color="#FF66CC")
plt.title("Training Loss (MSE)")
plt.legend(); plt.grid(alpha=0.3)
plt.savefig(RESULTS_DIR / "training_loss_plot.png", dpi=300, bbox_inches="tight")
plt.close()

plt.figure(figsize=(8,5))
plt.hist(train_err, bins=40, color="#00eaff", alpha=0.75)
plt.axvline(thr, color="#ffd700", linestyle="--", label="Threshold")
plt.title("Training Reconstruction Error Distribution")
plt.legend(); plt.grid(alpha=0.3)
plt.savefig(RESULTS_DIR / "reconstruction_error_hist.png", dpi=300, bbox_inches="tight")
plt.close()

# Context-aware anomaly timeline
plt.figure(figsize=(12,5))
plt.plot(test_err, color="#00ffc6", label="Reconstruction Error")
plt.axhline(thr, color="#ffd700", linestyle="--", label="Threshold")
plt.scatter(np.where(pred_labels==1), test_err[pred_labels==1], color="#ff4d6d", s=10, label="Anomaly")

# Add SMAP channel markers
for ch, start, end in channel_boundaries:
    plt.axvline(end, color="#222222", linestyle=":", linewidth=0.7)
    plt.text((start + end)/2, max(test_err)*0.98, f"{ch}", ha="center", va="top", fontsize=8, color="#cccccc")

plt.title("Anomaly Timeline Across SMAP Sensor Channels")
plt.xlabel("Window index (segmented per channel)")
plt.ylabel("Reconstruction Error")
plt.legend(); plt.grid(alpha=0.3)
plt.savefig(RESULTS_DIR / "anomaly_timeline_plot.png", dpi=300, bbox_inches="tight")
plt.close()

print(f"[INFO] Visualizations saved to: {RESULTS_DIR}")
print("[✅] Pipeline complete.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[INFO] Downloading NASA SMAP dataset via KaggleHub...
Using Colab cache for faster access to the 'nasa-anomaly-detection-dataset-smap-msl' dataset.
✅ Dataset path: /kaggle/input/nasa-anomaly-detection-dataset-smap-msl
[INFO] Loaded 54 SMAP channels.
[INFO] Train shape: (33545, 72, 25), Test shape: (108003, 72, 25)


Epoch 1/40
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - loss: 0.8041 - val_loss: 1.5675
Epoch 2/40
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.7450 - val_loss: 1.5396
Epoch 3/40
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.7074 - val_loss: 1.5541
Epoch 4/40
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.6985 - val_loss: 1.5411
Epoch 5/40
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 0.6775 - val_loss: 1.5301
Epoch 6/40
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - loss: 0.6640 - val_loss: 1.5335
Epoch 7/40
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - loss: 0.6672 - val_loss: 1.5253
Epoch 8/40
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - loss: 0.6485 - val_loss: 1.5219
Epoch 9/40
[1m472/472[0m [3



[RESULT] Threshold: 12.7574
[RESULT] Precision: 0.751, Recall: 0.021, F1: 0.041, ROC-AUC: 0.415
[INFO] Saved model weights to: /content/drive/MyDrive/General Satellite Anomaly Prediction/smap_anomaly_detector.h5
[INFO] Visualizations saved to: /content/drive/MyDrive/General Satellite Anomaly Prediction/results
[✅] Pipeline complete.


In [4]:
# === Enhanced Visualizations (Dark Cosmic Aesthetic) ===

plt.style.use("dark_background")

# 1️⃣ LOSS CURVES
fig, ax = plt.subplots(figsize=(9,5), dpi=180)
ax.plot(history.history["loss"], label="Training", color="#00FFFF", lw=2)
ax.plot(history.history["val_loss"], label="Validation", color="#FF66CC", lw=2, ls="--")
ax.set_title("Model Learning Curve — SMAP Telemetry Reconstruction", fontsize=13, color="white")
ax.set_xlabel("Epoch"); ax.set_ylabel("MSE Loss")
ax.legend(); ax.grid(alpha=0.3)
fig.savefig(RESULTS_DIR / "training_loss_plot.png", dpi=300, bbox_inches="tight")
plt.close(fig)

# 2️⃣ RECONSTRUCTION ERROR HISTOGRAM
fig, ax = plt.subplots(figsize=(9,5), dpi=180)
ax.hist(train_err, bins=50, color="#00eaff", alpha=0.7, label="Train Error")
ax.axvline(thr, color="#FFD700", lw=2, ls="--", label="Threshold")
ax.set_title("Distribution of Reconstruction Errors", color="white")
ax.set_xlabel("Reconstruction Error"); ax.set_ylabel("Frequency")
ax.legend(); ax.grid(alpha=0.3)
fig.savefig(RESULTS_DIR / "reconstruction_error_hist.png", dpi=300, bbox_inches="tight")
plt.close(fig)

# 3️⃣ RECONSTRUCTION OVERLAY
sample_idx = np.argmax(test_err)  # most anomalous window
recon_err_seq = test_err[sample_idx]
actual_seq = test_seq[sample_idx]
recon_seq = test_recon[sample_idx]
plt.figure(figsize=(9,5), dpi=180)
plt.plot(actual_seq[:,0], color="#FFD700", lw=2, label="Actual Signal (feature_0)")
plt.plot(recon_seq[:,0], color="#FF47FF", lw=2, ls="--", label="Reconstruction")
plt.title(f"Most Anomalous Sequence (window {sample_idx})", color="white")
plt.legend(); plt.grid(alpha=0.3)
plt.savefig(RESULTS_DIR / "anomaly_overlay_plot.png", dpi=300, bbox_inches="tight")
plt.close()

# 4️⃣ ANOMALY TIMELINE (Sensor-labeled)
fig, ax = plt.subplots(figsize=(12,5), dpi=180)
ax.plot(test_err, color="#00ffc6", lw=1.8, label="Reconstruction Error")
ax.axhline(thr, color="#ffd700", ls="--", lw=2, label="Threshold")
ax.scatter(np.where(pred_labels==1), test_err[pred_labels==1], color="#FF4D6D", s=10, label="Detected Anomaly")

for ch, start, end in channel_boundaries:
    ax.axvline(end, color="#222222", lw=0.7, ls=":")
    ax.text((start+end)/2, max(test_err)*0.95, f"{ch}", ha="center", va="top", fontsize=7, color="#bbbbbb")

ax.set_title("Anomaly Timeline Across SMAP Sensor Channels", color="white")
ax.set_xlabel("Sliding Window Index (per channel)")
ax.set_ylabel("Reconstruction Error")
ax.legend(); ax.grid(alpha=0.3)
fig.savefig(RESULTS_DIR / "anomaly_timeline_plot.png", dpi=300, bbox_inches="tight")
plt.close(fig)

# 5️⃣ ANOMALY DENSITY HEATMAP
import seaborn as sns
plt.figure(figsize=(10,4), dpi=180)
heat_data = pd.DataFrame({
    "channel": np.repeat([c for c,_,_ in channel_boundaries], [e-s for _,s,e in channel_boundaries]),
    "error": test_err[:sum([e-s for _,s,e in channel_boundaries])],
    "anomaly": pred_labels[:sum([e-s for _,s,e in channel_boundaries])]
})
pivot = heat_data.pivot_table(index="channel", values="error", aggfunc="mean").sort_values("error", ascending=False)
sns.heatmap(pivot, cmap="mako", cbar_kws={'label': 'Mean Reconstruction Error'})
plt.title("Average Anomaly Intensity by SMAP Channel", color="white")
plt.savefig(RESULTS_DIR / "anomaly_density_heatmap.png", dpi=300, bbox_inches="tight")
plt.close()

print(f"[INFO] Enhanced visualizations saved to {RESULTS_DIR}")


[INFO] Enhanced visualizations saved to /content/drive/MyDrive/General Satellite Anomaly Prediction/results


In [5]:
# === Scope 8: Compute Sensor Correlation Matrix for SMAP Channels ===

import seaborn as sns

print("[INFO] Generating sensor correlation matrix...")

# Merge all SMAP training data into one big DataFrame for correlation analysis
merged_df = pd.DataFrame()

for ch, arr in train_clean.items():
    df_temp = pd.DataFrame(arr, columns=[f"{ch}_f{i}" for i in range(arr.shape[1])])
    merged_df = pd.concat([merged_df, df_temp], axis=1)

corr = merged_df.corr()

# Save correlation matrix to Drive for further exploration
corr_path = RESULTS_DIR / "smap_sensor_correlation.csv"
corr.to_csv(corr_path, index=True)
print(f"[INFO] Saved correlation matrix to: {corr_path}")


[INFO] Generating sensor correlation matrix...
[INFO] Saved correlation matrix to: /content/drive/MyDrive/General Satellite Anomaly Prediction/results/smap_sensor_correlation.csv


In [6]:
# === Scope 9: Visualize SMAP Sensor Correlation Heatmap ===

plt.style.use("dark_background")
plt.figure(figsize=(12, 10), dpi=200)

sns.heatmap(
    corr,
    cmap="coolwarm",
    center=0,
    square=True,
    cbar_kws={"label": "Correlation"},
    xticklabels=False,  # Hide individual feature labels if too many
    yticklabels=False,
)

plt.title("Inter-Sensor Correlation Heatmap — NASA SMAP Telemetry", fontsize=14, color="white")
plt.xlabel("Sensor Features (Channels × Variables)")
plt.ylabel("Sensor Features (Channels × Variables)")
plt.tight_layout()
plt.savefig(RESULTS_DIR / "smap_correlation_heatmap.png", dpi=300, bbox_inches="tight")
plt.close()

print(f"[INFO] Correlation heatmap saved to: {RESULTS_DIR}")


[INFO] Correlation heatmap saved to: /content/drive/MyDrive/General Satellite Anomaly Prediction/results


HEATMAP


In [7]:
# === Scope A: Build channel name mapping from anomalies CSV & docs ===

labels = pd.read_csv(path / "labeled_anomalies.csv")
labels["spacecraft"] = labels["spacecraft"].str.upper()
smap_labels = labels[labels["spacecraft"] == "SMAP"]

# Map chan_id → description (if known)
chan_name_map = {}
for _, row in smap_labels.iterrows():
    cid = str(row["chan_id"])
    # Try to assign known names if present (fallback to id)
    # Example mapping from SMAP spec:
    if cid in chan_name_map:
        continue
    # basic fallback
    chan_name_map[cid] = f"chan_{cid}"

# Example: You may override a few manually based on SMAP spec:
# chan_name_map['1'] = "voltage_sensors_dn"
# chan_name_map['2'] = "loop_back_noise_only_h_dn"
# etc.

print("Channel name map (sample):", dict(list(chan_name_map.items())[:5]))


Channel name map (sample): {'P-1': 'chan_P-1', 'S-1': 'chan_S-1', 'E-1': 'chan_E-1', 'E-2': 'chan_E-2', 'E-3': 'chan_E-3'}


In [9]:
# === Scope B (Fixed): Compute correlation across SMAP channels ===
import seaborn as sns

print("[INFO] Building channel-level correlation (auto-aligned by length)")

# Compute mean feature per timestep for each channel
ch_means = {ch: np.nanmean(arr, axis=1) for ch, arr in train_clean.items()}

# Find smallest common length (truncate longer ones)
min_len = min(len(v) for v in ch_means.values())
aligned = {ch: v[:min_len] for ch, v in ch_means.items()}

# Combine into DataFrame for correlation
corr_df = pd.DataFrame(aligned)
corr_matrix = corr_df.corr()

# Save CSV
corr_csv = RESULTS_DIR / "channel_corr_matrix.csv"
corr_matrix.to_csv(corr_csv, index=True)
print(f"[INFO] Saved channel-level correlation CSV: {corr_csv}")

# Optional quick peek
display(corr_matrix.head())


[INFO] Building channel-level correlation (auto-aligned by length)
[INFO] Saved channel-level correlation CSV: /content/drive/MyDrive/General Satellite Anomaly Prediction/results/channel_corr_matrix.csv


Unnamed: 0,P-1,S-1,E-1,E-2,E-3,E-4,E-5,E-6,E-7,E-8,...,G-7,P-7,R-1,A-5,A-6,A-7,D-13,A-8,A-9,F-3
P-1,1.0,-0.08512,-0.139522,-0.109649,-0.045801,-0.101361,-0.133765,-0.112634,-0.086523,-0.077531,...,,-0.218485,-0.18858,0.082429,0.093018,0.196101,,-0.0407,0.004842,-0.108805
S-1,-0.08512,1.0,0.539354,0.472928,0.443034,0.507136,0.483841,0.526651,0.39799,0.421281,...,,0.091622,0.000844,0.142587,0.185814,-0.033411,,-0.101552,-0.005898,0.57696
E-1,-0.139522,0.539354,1.0,0.920553,0.778869,0.907338,0.893695,0.891278,0.354852,0.865191,...,,0.277496,0.153867,0.154207,0.192302,-0.034962,,0.054238,-0.041397,0.87882
E-2,-0.109649,0.472928,0.920553,1.0,0.818589,0.847231,0.860562,0.877868,0.309851,0.890124,...,,0.306262,0.183161,0.141782,0.145514,-0.004901,,0.101751,-0.055191,0.831005
E-3,-0.045801,0.443034,0.778869,0.818589,1.0,0.769998,0.799799,0.786943,0.298309,0.810416,...,,0.20813,0.099275,0.078296,0.130201,0.012411,,0.073232,-0.020084,0.774279


In [10]:
plt.style.use("dark_background")
plt.figure(figsize=(9,7), dpi=200)
sns.heatmap(
    corr_matrix,
    cmap="coolwarm",
    center=0,
    cbar_kws={"label": "Correlation"},
    xticklabels=[chan_name_map.get(ch, ch) for ch in corr_matrix.columns],
    yticklabels=[chan_name_map.get(ch, ch) for ch in corr_matrix.index],
)
plt.title("Correlation Among SMAP Telemetry Channels", fontsize=14, color="white")
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig(RESULTS_DIR / "smap_channel_correlation_heatmap.png", dpi=300, bbox_inches="tight")
plt.close()
print(f"[INFO] Channel correlation heatmap saved: {RESULTS_DIR}")


[INFO] Channel correlation heatmap saved: /content/drive/MyDrive/General Satellite Anomaly Prediction/results


In [11]:
# === Scope: Compute & Plot Channel Correlation Heatmap ===

import seaborn as sns
import matplotlib.pyplot as plt

print("[INFO] Generating channel-level correlation heatmap with labels…")

# 1. Compute per-channel mean (over features) and align lengths
ch_means = {ch: np.nanmean(arr, axis=1) for ch, arr in train_clean.items()}
min_len = min(len(v) for v in ch_means.values())
aligned = {ch: v[:min_len] for ch, v in ch_means.items()}

# 2. Build correlation DataFrame
corr_df = pd.DataFrame(aligned).corr()

# 3. Attempt to map to nicer names (if you made a name map earlier)
# e.g. chan_name_map from previous scope
# fallback to chan_id string
labels_for_axis = [chan_name_map.get(ch, f"chan_{ch}") for ch in corr_df.index]

# 4. Plot with Seaborn
plt.style.use("dark_background")
plt.figure(figsize=(10, 8), dpi=200)
sns.heatmap(
    corr_df,
    xticklabels=labels_for_axis,
    yticklabels=labels_for_axis,
    cmap="coolwarm",
    center=0,
    annot=False,  # or True if you want values
    cbar_kws={"label": "Correlation"},
    linewidths=0.5,
    linecolor="#333333",
)

plt.title("Inter-Channel Correlation Heatmap (SMAP Telemetry)", color="white", fontsize=14)
plt.xticks(rotation=45, ha="right", color="white")
plt.yticks(rotation=0, color="white")
plt.tight_layout()

# Save
heatpath = RESULTS_DIR / "smap_channel_corr_heatmap_seaborn.png"
plt.savefig(heatpath, dpi=300, bbox_inches="tight")
plt.close()
print(f"[INFO] Saved channel correlation heatmap: {heatpath}")


[INFO] Generating channel-level correlation heatmap with labels…
[INFO] Saved channel correlation heatmap: /content/drive/MyDrive/General Satellite Anomaly Prediction/results/smap_channel_corr_heatmap_seaborn.png
