In [13]:
# Necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score, 
    roc_curve, 
    roc_auc_score
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import permutation_importance

In [14]:
# Load the prepared machine-learning table (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="inputs-for-ml/final_ml_data.csv")

In [15]:
# Predictor columns fed to every classifier, and the label column they predict
feature_cols = [
    'elevation',
    'slope',
    'north_gps',
    'east_gps',
    'vertical_gps',
    'coherence',
    'los_insar',
    'bias',
]
X = df.loc[:, feature_cols]
y = df.loc[:, 'needs_calibration']

In [16]:
# Train/test split first (70/30, stratified on the target) so that the
# held-out rows cannot influence any preprocessing statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)  # 42 from hhgg

# Normalize features: the per-feature min/max are learned from the training
# rows only and then applied unchanged to the test rows. Fitting the scaler on
# the full dataset would leak test-set information into training.
# Test values may therefore fall slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-set statistics, kept so that
# any cell still reading `X_scaled` continues to work.
X_scaled = scaler.transform(X)

In [17]:
# Fit a single model and score it on the held-out split
def train_and_evaluate(model):
    """Fit ``model`` on the training split and summarise its test-set performance.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints the class name, confusion matrix, classification report and accuracy.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; ``predict_proba`` or
        ``decision_function`` is used for the ROC statistics when available.

    Returns
    -------
    dict with keys ``'Model'`` (class name), ``'Accuracy'``, ``'ROC AUC'``
    (``np.nan`` when the model exposes no scores), ``'Confusion Matrix'`` and
    ``'ROC Curve'`` (a ``(fpr, tpr, auc)`` tuple, or ``None`` without scores).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    model_name = type(model).__name__
    conf_mat = confusion_matrix(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)

    print(f"\n{model_name}")
    print(conf_mat)
    print(classification_report(y_test, predictions))
    print("Accuracy:", accuracy)

    # Continuous scores for the ROC analysis: probabilities are preferred,
    # otherwise decision-function values rescaled to [0, 1].
    if hasattr(model, "predict_proba"):
        # Column 1 is presumably the positive class of a binary target -- TODO confirm
        scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        raw_scores = model.decision_function(X_test)
        scores = MinMaxScaler().fit_transform(raw_scores.reshape(-1, 1)).ravel()
    else:
        scores = None

    if scores is None:
        auc_score = np.nan
        roc_points = None
    else:
        fpr, tpr, _ = roc_curve(y_test, scores)
        auc_score = roc_auc_score(y_test, scores)
        roc_points = (fpr, tpr, auc_score)

    return {
        'Model': model_name,
        'Accuracy': accuracy,
        'ROC AUC': auc_score,
        'Confusion Matrix': conf_mat,
        'ROC Curve': roc_points
    }


In [None]:
# List of models to compare.
# Every estimator that accepts `random_state` is seeded with 42 (the same seed
# the split and the tuned grid use) so that a Restart & Run All reproduces the
# baseline ranking. GaussianNB takes no seed.
models = [
    SVC(probability=True, random_state=42),
    MLPClassifier(max_iter=1000, random_state=42),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    GaussianNB(),
    GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0), random_state=42)
]

# Evaluate all models and collect results (one summary dict per model)
results = [train_and_evaluate(model) for model in models]

In [19]:
# Split each result dict into three views: scalar metrics for the ranking
# table, confusion matrices keyed by model name, and ROC curves keyed by
# model name (only for models that produced one).
performance_records = []
confusion_matrices = {}
roc_curves = {}

metric_keys = ('Model', 'Accuracy', 'ROC AUC')
for res in results:
    model_name = res['Model']
    performance_records.append({key: res[key] for key in metric_keys})
    confusion_matrices[model_name] = res['Confusion Matrix']
    curve = res['ROC Curve']
    if curve:
        roc_curves[model_name] = curve

# Ranking table: best accuracy first, ROC AUC breaks ties
performance_df = pd.DataFrame(performance_records).sort_values(
    by=['Accuracy', 'ROC AUC'], ascending=False
)

In [None]:
# Plot ROC curves for every model that produced scores.
# The AUC stored alongside each curve is shown in the legend so the figure
# can be read on its own, without cross-referencing the performance table.
fig, ax = plt.subplots(figsize=(8, 6))
for name, (fpr, tpr, auc_score) in roc_curves.items():
    ax.plot(fpr, tpr, label=f"{name} (AUC = {auc_score:.3f})")
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# One annotated heatmap per model, each drawn on its own figure
class_labels = [0, 1]
for name, cm in confusion_matrices.items():
    fig, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(
        cm,
        ax=ax,
        annot=True,
        fmt='d',
        cmap='Blues',
        cbar=False,
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    ax.set_title(f'Confusion Matrix - {name}', fontsize=14, fontweight='bold')
    ax.set(xlabel='Predicted Label', ylabel='True Label')
    fig.tight_layout()
    plt.show()

In [None]:
# Show the ranking table under its heading (plain-text output)
print("\nModel Performance Ranking:", performance_df, sep="\n")

In [23]:
# Extended Hyperparameter Tuning for Classification Models

# Maps a display name to its search configuration:
#   model  -- the unfitted estimator (seeded with 42 where it accepts a seed)
#   params -- the grid of hyperparameter values to search
#   tune   -- optional; False means "fit as-is, no grid search"
param_grids_classification = {
    "Support Vector Classifier": dict(
        model=SVC(probability=True, random_state=42),
        params=dict(
            C=[0.1, 1, 10],
            kernel=["rbf", "linear"],
        ),
    ),
    "Neural Network Classifier": dict(
        model=MLPClassifier(max_iter=1000, random_state=42),
        params=dict(
            hidden_layer_sizes=[(50,), (100,), (100, 50)],
            alpha=[0.0001, 0.001],
            learning_rate_init=[0.001, 0.01],
        ),
    ),
    "Decision Tree Classifier": dict(
        model=DecisionTreeClassifier(random_state=42),
        params=dict(
            max_depth=[None, 10, 20],
            min_samples_split=[2, 5, 10],
        ),
    ),
    "Random Forest Classifier": dict(
        model=RandomForestClassifier(random_state=42),
        params=dict(
            n_estimators=[50, 100, 200],
            max_depth=[None, 10, 20],
            min_samples_split=[2, 5, 10],
        ),
    ),
    "Gradient Boosting Classifier": dict(
        model=GradientBoostingClassifier(random_state=42),
        params=dict(
            n_estimators=[50, 100, 200],
            learning_rate=[0.01, 0.1],
            max_depth=[3, 5, 7],
        ),
    ),
    # No grid for Naive Bayes: it is fitted directly with default settings
    "Naive Bayes Classifier": dict(
        model=GaussianNB(),
        tune=False,
    ),
}

In [None]:

# Best estimator and per-feature importance vector for each model, keyed by display name
best_classifiers = {}
class_feature_importance = {}

# Fit every configured classifier, grid-searching those that ask for it
for name, settings in param_grids_classification.items():
    model = settings["model"]
    if not settings.get("tune", True):
        # Tuning explicitly disabled: fit the estimator exactly as configured
        model.fit(X_train, y_train)
        best_model = model
    else:
        # 5-fold cross-validated grid search, selecting on accuracy, all cores
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=settings["params"],
            cv=5,
            scoring="accuracy",
            n_jobs=-1,
        )
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

    best_classifiers[name] = best_model

    # Feature importances: use the model's own attribute when it has one,
    # otherwise fall back to permutation importance on the test split.
    if hasattr(best_model, "feature_importances_"):
        importance = best_model.feature_importances_
    elif hasattr(best_model, "coef_"):
        importance = np.abs(best_model.coef_).flatten()
    else:
        result = permutation_importance(
            best_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
        )
        importance = result.importances_mean
    class_feature_importance[name] = importance

In [None]:

# Rows are features, columns are models; negative importances are floored at zero
feature_importance_df_class = pd.DataFrame(
    class_feature_importance, index=feature_cols
).clip(lower=0)

# Rescale each model's column by its own total so columns are comparable
column_totals = feature_importance_df_class.sum(axis=0)
feature_importance_df_class = feature_importance_df_class / column_totals

# Grouped horizontal bars: one group per feature, one bar per model
fig, ax = plt.subplots(figsize=(12, 8))
feature_importance_df_class.plot.barh(ax=ax, colormap="tab10")
ax.invert_yaxis()  # first feature at the top
ax.set_title("Feature Importance Comparison Across Classification Models", fontsize=14, weight="bold")
ax.set_xlabel("Normalized Importance Score")
ax.grid(True)
fig.tight_layout()
plt.show()

# Same numbers as text, rounded to three decimals
print("\nNormalized Feature Importance (Classification Models):")
print(feature_importance_df_class.round(3))

In [None]:
# Evaluate all tuned classifiers on the held-out split
performance_list_class = []

for name, model in best_classifiers.items():
    y_pred = model.predict(X_test)

    # Continuous scores for ROC AUC: probabilities when available, otherwise
    # decision-function values rescaled to [0, 1].
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        decision_scores = model.decision_function(X_test)
        y_prob = MinMaxScaler().fit_transform(decision_scores.reshape(-1, 1)).ravel()
    else:
        y_prob = None  # model exposes no scores at all

    acc = accuracy_score(y_test, y_pred)
    # Report NaN (as train_and_evaluate does) when no scores exist. Scoring an
    # all-zero vector would print a misleading AUC of 0.5 instead.
    roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else np.nan

    performance_list_class.append({"Model": name, "Accuracy": acc, "ROC AUC": roc_auc})

# Create performance DataFrame: upper-case headers, three-decimal metrics,
# ranked by ROC AUC (NaN rows sort last), with a 1-based "Rank" index.
performance_df_class = (
    pd.DataFrame(performance_list_class)
    .rename(columns=lambda x: x.strip().upper())
    .round({"ACCURACY": 3, "ROC AUC": 3})
    .sort_values(by="ROC AUC", ascending=False)
    .reset_index(drop=True)
)
performance_df_class.index += 1
performance_df_class.index.name = "Rank"

# Print classification performance
print("\nTuned Classification Model Performance:")
print(performance_df_class)