In [None]:
# Team Planet Hunter AI model

In [None]:
# ================================
# Exoplanet Detection Pipeline (Google Colab Ready)
# ================================

# Install pinned packages for stability
!pip install --quiet xgboost==2.0.3 shap joblib optuna scikit-learn==1.5.2 tensorflow matplotlib seaborn

# -------------------------
# Imports & Setup
# -------------------------
import os, shutil, time, warnings
warnings.filterwarnings("ignore")
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# ML
import xgboost as xgb
from xgboost.callback import EarlyStopping
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import (accuracy_score, f1_score, classification_report, confusion_matrix,
                             roc_auc_score, roc_curve, precision_recall_curve, average_precision_score)
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.base import clone

# Deep learning
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# Explainability + tuning
import shap
import optuna

# Colab files
from google.colab import files

# -------------------------
# Config
# -------------------------
# One seed shared by NumPy, TensorFlow and every estimator/split below.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Every figure and serialized model is written under this folder.
SAVE_DIR = Path("results_colab")
SAVE_DIR.mkdir(exist_ok=True)

# NOTE(review): USE_OPTUNA and N_TRIALS are not referenced in the visible
# part of the script; presumably consumed by a tuning step elsewhere.
USE_OPTUNA = False  # set True to enable hyper-parameter tuning (slower)
N_TRIALS = 12       # number of Optuna trials when tuning is enabled
FIG_DPI = 200       # resolution used for on-screen and saved figures

# Plot styling: the style sheet goes first so the palette and the rcParams
# overrides below are applied on top of it.
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("deep")
plt.rcParams["figure.dpi"] = FIG_DPI
plt.rcParams["savefig.dpi"] = FIG_DPI
plt.rcParams["font.size"] = 10

def savefig(fig, name):
    """Write ``fig`` as ``<name>.png`` inside ``SAVE_DIR`` and print the path.

    Args:
        fig: Matplotlib figure to save.
        name: File name without extension.
    """
    target = SAVE_DIR / "{}.png".format(name)
    # Tight bounding box trims surrounding whitespace in the saved image.
    fig.savefig(target, dpi=FIG_DPI, bbox_inches="tight")
    print("Saved:", target)

# -------------------------
# 1. Upload dataset
# -------------------------
# Prompt for the CSV through the Colab upload widget. The returned mapping is
# keyed by file name; the first uploaded file is used as the dataset.
print("📂 Upload your exoplanet_xgb_ready.csv file now")
uploaded = files.upload()
if not uploaded:
    # The widget was dismissed without choosing a file.
    raise RuntimeError("No file was uploaded; re-run this cell and select the dataset CSV.")
dataset_name = next(iter(uploaded))

data = pd.read_csv(dataset_name)
if "target" not in data.columns:
    raise KeyError(
        f"'{dataset_name}' has no 'target' column; found columns: {list(data.columns)}"
    )
# Rows without a label cannot be used for supervised training.
data = data.dropna(subset=["target"]).reset_index(drop=True)

# Every column other than the label is treated as a feature.
X = data.drop("target", axis=1)
y = data["target"]
feature_names = X.columns.tolist()
print("Dataset:", data.shape, "| Classes:", sorted(y.unique()))

# -------------------------
# 2. Split & preprocess
# -------------------------
# Hold out 30 % of the rows, then halve that hold-out into validation and
# test sets; both splits are stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=SEED,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=SEED,
)

# The scaler is fitted on the training split only and then applied to all
# three splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s, X_val_s, X_test_s = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

print("Train/Val/Test:", len(X_train), len(X_val), len(X_test))

# -------------------------
# 3. Exploratory Plots
# -------------------------
# Class balance over the whole dataset.
# The title is plain text: the original literal was a mis-decoded emoji, and
# Matplotlib's default font has no emoji glyphs to render it with anyway.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y, ax=ax)
ax.set_title("Class Distribution of Exoplanets")
ax.set_xlabel("Target class (0=Non-planet, 1=Confirmed Exoplanet)")
ax.set_ylabel("Count")
savefig(fig, "eda_class_counts"); plt.close(fig)

# PCA projection of the standardized training features, coloured by label.
pca = PCA(n_components=2, random_state=SEED)
X_pca2 = pca.fit_transform(X_train_s)
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(x=X_pca2[:, 0], y=X_pca2[:, 1], hue=y_train, s=15, alpha=0.7,
                palette="coolwarm", ax=ax)
ax.set_title("PCA Projection (2D)")
# NOTE(review): the physical interpretations in the axis labels are not
# derived from the PCA loadings anywhere in this script — confirm or drop.
ax.set_xlabel("Principal Component 1 (Orbital Signature)")
ax.set_ylabel("Principal Component 2 (Stellar Flux)")
savefig(fig, "eda_pca2"); plt.close(fig)

# Pairwise feature correlations over the full (unsplit) feature table.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(X.corr(), cmap="mako", center=0, cbar_kws={"label": "Correlation"}, ax=ax)
ax.set_title("Correlation Matrix of Astronomical Features")
savefig(fig, "eda_corr"); plt.close(fig)

# -------------------------
# 4. Baseline Models
# -------------------------
# Random forest baseline, trained on the standardized features.
rf = RandomForestClassifier(
    n_estimators=300,
    n_jobs=-1,
    random_state=SEED,
)
rf.fit(X_train_s, y_train)

# XGBoost baseline, trained on the raw (unscaled) features. The validation
# split drives early stopping: training halts after 30 rounds without
# improvement and the best iteration is kept.
xgb_baseline = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=len(np.unique(y)),
    eval_metric="mlogloss",
    n_estimators=400,
    learning_rate=0.1,
    max_depth=6,
    random_state=SEED,
    use_label_encoder=False,
)
xgb_baseline.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[EarlyStopping(rounds=30, save_best=True)],
    verbose=False,
)

# -------------------------
# 5. Evaluation helper
# -------------------------
def eval_and_save(name, model, X_e, y_true, scaled=False):
    """Score ``model`` on an evaluation set and save its confusion matrix.

    Prints accuracy, weighted F1 and the classification report, then writes
    ``<name>_confusion.png`` via ``savefig``.

    Args:
        name: Label used in the printed summary, plot title and file name.
        model: Fitted classifier exposing ``predict``.
        X_e: Evaluation features.
        y_true: True labels for ``X_e``.
        scaled: When True, ``X_e`` is passed through the global ``scaler``
            before prediction (i.e. the model expects standardized input).

    Returns:
        Tuple ``(preds, acc, f1)``.
    """
    X_in = scaler.transform(X_e) if scaled else X_e
    preds = model.predict(X_in)
    acc = accuracy_score(y_true, preds)
    f1 = f1_score(y_true, preds, average="weighted")
    print(f"\n{name}: Acc={acc:.4f} F1={f1:.4f}")
    print(classification_report(y_true, preds))

    # Pin the matrix to the full class list used for the tick labels;
    # otherwise a class missing from y_true/preds would shrink the matrix
    # and misalign it with the ticks.
    class_labels = sorted(y.unique())
    cm = confusion_matrix(y_true, preds, labels=class_labels)

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="rocket_r",
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    # confusion_matrix puts true classes on rows and predictions on columns.
    ax.set_xlabel("Predicted class")
    ax.set_ylabel("True class")
    ax.set_title(f"{name} Confusion Matrix")
    savefig(fig, f"{name}_confusion"); plt.close(fig)
    return preds, acc, f1

# scaled=True makes the helper standardize the raw test features first (the
# forest was fitted on scaled data); XGBoost was fitted on raw features.
rf_preds, rf_acc, rf_f1 = eval_and_save(
    "RandomForest", rf, X_test, y_test, scaled=True
)
xgb_preds, xgb_acc, xgb_f1 = eval_and_save(
    "XGBoost_baseline", xgb_baseline, X_test, y_test, scaled=False
)

# -------------------------
# 6. ROC & PR Curves
# -------------------------
def plot_roc_pr(model, X, y_true, name, scaled=False):
    """Save one-vs-rest ROC and precision-recall curves for every class.

    Writes ``<name>_roc.png`` and ``<name>_pr.png`` via ``savefig``.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Evaluation features.
        y_true: True labels for ``X``.
        name: Label used in plot titles and file names.
        scaled: When True, ``X`` is passed through the global ``scaler``
            before calling the model.
    """
    X_in = scaler.transform(X) if scaled else X
    y_proba = model.predict_proba(X_in)
    # Probability columns follow the model's own class order, which is not
    # necessarily 0..n-1; fall back to column indices if it is not exposed.
    class_labels = getattr(model, "classes_", np.arange(y_proba.shape[1]))
    y_arr = np.asarray(y_true)

    # ROC: each class against the rest.
    fig, ax = plt.subplots(figsize=(6, 5))
    for col, label in enumerate(class_labels):
        fpr, tpr, _ = roc_curve(y_arr == label, y_proba[:, col])
        ax.plot(fpr, tpr, label=f"Class {label}")
    ax.plot([0, 1], [0, 1], "--", c="gray")  # chance-level diagonal
    ax.set_title(f"{name} ROC Curve")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.legend()
    savefig(fig, f"{name}_roc"); plt.close(fig)

    # Precision-recall: each class against the rest.
    fig, ax = plt.subplots(figsize=(6, 5))
    for col, label in enumerate(class_labels):
        prec, rec, _ = precision_recall_curve(y_arr == label, y_proba[:, col])
        ax.plot(rec, prec, label=f"Class {label}")
    ax.set_title(f"{name} Precision-Recall Curve")
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.legend()
    savefig(fig, f"{name}_pr"); plt.close(fig)

# The forest needs the test features standardized (scaled=True); XGBoost
# consumes them as-is.
plot_roc_pr(
    rf, X_test, y_test, "RandomForest", scaled=True
)
plot_roc_pr(
    xgb_baseline, X_test, y_test, "XGBoost", scaled=False
)

# -------------------------
# 7. Learning Curves (fixed for XGB)
# -------------------------
def plot_learning_curve_custom(model, X, y, name, scaled=False):
    """Plot a 3-fold cross-validated accuracy learning curve.

    An unfitted clone of ``model`` is evaluated at five training-set sizes
    from 10 % to 100 %; the figure is saved as ``<name>_learning_curve.png``.

    Args:
        model: scikit-learn compatible estimator (e.g. RandomForest).
        X: Feature matrix.
        y: Labels for ``X``.
        name: Label used in the plot title and file name.
        scaled: When True, ``X`` is passed through the global ``scaler``
            before cross-validation.
    """
    from sklearn.model_selection import learning_curve

    features = X if not scaled else scaler.transform(X)
    sizes, train_acc, cv_acc = learning_curve(
        clone(model),
        features,
        y,
        train_sizes=np.linspace(0.1, 1.0, 5),
        cv=3,
        scoring="accuracy",
        n_jobs=-1,
    )

    fig, ax = plt.subplots(figsize=(7, 5))
    # Scores come back per fold; average across folds for each size.
    for fold_scores, curve_label in ((train_acc, "Train"), (cv_acc, "CV")):
        ax.plot(sizes, fold_scores.mean(axis=1), "o-", label=curve_label)
    ax.set_title(f"{name} Learning Curve")
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Accuracy")
    ax.legend()
    savefig(fig, f"{name}_learning_curve")
    plt.close(fig)

def plot_xgb_learning_curve(model, X, y, name, X_eval=None, y_eval=None):
    """Plot train/validation accuracy of XGBoost at growing training sizes.

    A fresh classifier with ``model``'s parameters is fitted on six subsets
    (10 % to 100 % of ``X``) and scored on each subset and on the evaluation
    set. The figure is saved as ``<name>_learning_curve.png``.

    Args:
        model: XGBoost classifier whose ``get_params()`` configure each refit.
        X: Training features.
        y: Training labels.
        name: Label used in the plot title and file name.
        X_eval: Evaluation features; defaults to the global ``X_val``.
        y_eval: Evaluation labels; defaults to the global ``y_val``.
    """
    if X_eval is None or y_eval is None:
        X_eval, y_eval = X_val, y_val

    n_total = len(X)
    sizes, train_scores, val_scores = [], [], []
    for frac in np.linspace(0.1, 1.0, 6):
        n = int(n_total * frac)
        if n < 1:
            continue  # dataset too small for this fraction
        if n >= n_total:
            X_sub, y_sub = X, y
        else:
            # Stratified subsample instead of a prefix slice, so small subsets
            # keep the class mix; a subset missing a class can make XGBoost
            # reject the labels.
            try:
                X_sub, _, y_sub, _ = train_test_split(
                    X, y, train_size=n, stratify=y, random_state=SEED
                )
            except ValueError as err:
                # Raised when n is too small to hold every class.
                print(f"{name}: skipping subset of {n} rows ({err})")
                continue
        m = xgb.XGBClassifier(**model.get_params())
        m.fit(X_sub, y_sub, eval_set=[(X_eval, y_eval)], verbose=False)
        sizes.append(len(X_sub))
        train_scores.append(accuracy_score(y_sub, m.predict(X_sub)))
        val_scores.append(accuracy_score(y_eval, m.predict(X_eval)))

    fig, ax = plt.subplots(figsize=(7, 5))
    # x-axis shows the row counts actually used for each fit.
    ax.plot(sizes, train_scores, "o-", label="Train")
    ax.plot(sizes, val_scores, "o-", label="Validation")
    ax.set_title(f"{name} Learning Curve (Custom)")
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Accuracy")
    ax.legend()
    savefig(fig, f"{name}_learning_curve"); plt.close(fig)

# Produce both learning curves. The forest receives the already-standardized
# training matrix, hence scaled=False (no second pass through the scaler).
plot_learning_curve_custom(
    rf, X_train_s, y_train, "RandomForest", scaled=False
)
plot_xgb_learning_curve(
    xgb_baseline, X_train, y_train, "XGBoost"
)


# -------------------------
# 8. Feature Importance (Permutation + XGB Gain)
# -------------------------
# Permutation importance of the random forest on the standardized test
# split, averaged over 20 shuffles per feature.
perm = permutation_importance(
    rf,
    X_test_s,
    y_test,
    n_repeats=20,
    n_jobs=-1,
    random_state=SEED,
)
# Indices of the 15 largest mean importances, in ascending order so the
# biggest bar ends up at the top of the horizontal chart.
sorted_idx = np.argsort(perm.importances_mean)[-15:]
fig, ax = plt.subplots(figsize=(7, 5))
ax.barh(np.array(feature_names)[sorted_idx], perm.importances_mean[sorted_idx])
ax.set_title("RandomForest Feature Importance (Permutation)")
savefig(fig, "rf_feature_importance")
plt.close(fig)

# Top 15 features by the XGBoost model's built-in importance scores.
# NOTE(review): the "Gain" in the title assumes the classifier's default
# importance type — confirm.
xgb_imp = (
    pd.DataFrame({"feature": feature_names, "importance": xgb_baseline.feature_importances_})
    .sort_values("importance", ascending=False)
    .head(15)
)
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(data=xgb_imp, x="importance", y="feature", palette="viridis", ax=ax)
ax.set_title("XGBoost Feature Importance (Gain)")
savefig(fig, "xgb_feature_importance")
plt.close(fig)

# -------------------------
# 9. SHAP Analysis
# -------------------------
SHAP_SAMPLE = 500  # upper bound on the number of test rows explained

# Explain a random subset of the test split, using up to 2000 random
# training rows as the explainer's background data.
X_shap = X_test.sample(n=min(SHAP_SAMPLE, len(X_test)), random_state=SEED)
explainer = shap.Explainer(
    xgb_baseline,
    X_train.sample(n=min(2000, len(X_train)), random_state=SEED),
)
shap_vals = explainer(X_shap)

# summary_plot draws on the current pyplot figure; show=False keeps it open
# so it can be resized and written to disk.
shap.summary_plot(shap_vals, X_shap, plot_type="bar", show=False)
plt.gcf().set_size_inches(7, 5)
plt.tight_layout()
plt.savefig(SAVE_DIR / "shap_summary.png", dpi=FIG_DPI, bbox_inches="tight")
plt.close()

# -------------------------
# 10. Save Models
# -------------------------
# Persist both fitted baselines next to the figures.
joblib.dump(rf, SAVE_DIR / "rf.joblib")
joblib.dump(xgb_baseline, SAVE_DIR / "xgb.joblib")
print("Models saved to:", SAVE_DIR)

# Bundle results: zip the contents of SAVE_DIR into "<SAVE_DIR>.zip" beside
# it. The archive name is derived from SAVE_DIR, and the path returned by
# make_archive is what gets downloaded, so the two cannot drift apart.
archive_path = shutil.make_archive(str(SAVE_DIR), "zip", root_dir=SAVE_DIR)
files.download(archive_path)
print("✅ All done — artifacts downloaded.")
