In [1]:
# Necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score, 
    roc_curve, 
    roc_auc_score
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [2]:
# Load the prepared ML table; the path is resolved relative to the working directory.
DATA_PATH = "inputs-for-ml/final_ml_data.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Predictor columns handed to every classifier, followed by the label column.
# NOTE(review): if `needs_calibration` is derived from `bias`, keeping `bias`
# as a feature would leak the label into the inputs — confirm before trusting scores.
feature_cols = [
    'elevation', 'slope',
    'north_gps', 'east_gps', 'vertical_gps',
    'coherence', 'los_insar',
    'bias',
]
X = df[feature_cols]
y = df['needs_calibration']

In [4]:
# Train/test split (70/30, stratified on the label, seed 42).
# The split is done on the RAW features, before any scaling, so that no
# statistic of the held-out rows can reach the scaler (avoids data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Normalize features: learn per-column min/max from the training rows only,
# then apply that same transform to the held-out rows. Test values can land
# slightly outside [0, 1] when they exceed the training range.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any cell that still reads `X_scaled` keeps working: the full
# feature table under the training-fitted scaling.
X_scaled = scaler.transform(X)

In [5]:
# Function to train and evaluate models
def train_and_evaluate(model):
    """Fit ``model`` on the training split and score it on the test split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` variables. Prints the model's class name, confusion matrix,
    classification report and accuracy, then computes ROC data when the
    model exposes a continuous score.

    Parameters
    ----------
    model : estimator
        Classifier providing ``fit`` and ``predict``. ``predict_proba`` is
        preferred for the ROC score; ``decision_function`` is the fallback.

    Returns
    -------
    dict
        ``'Model'`` (class name), ``'Accuracy'``, ``'ROC AUC'`` (``np.nan``
        when the model offers no continuous score), ``'Confusion Matrix'``
        and ``'ROC Curve'`` (a ``(fpr, tpr, auc)`` tuple, or ``None``).
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    name = model.__class__.__name__
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    print(f"\n{name}")
    print(cm)
    print(classification_report(y_test, y_pred))
    print("Accuracy:", acc)

    # Pick a continuous score for ranking-based metrics.
    if hasattr(model, "predict_proba"):
        # Column 1 is the second class's probability (assumes a binary target).
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        # Raw decision values are passed through unchanged: ROC/AUC depend only
        # on the ordering of the scores, so rescaling them to [0, 1] is unnecessary.
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    if y_score is None:
        roc_data, auc_score = None, np.nan
    else:
        fpr, tpr, _ = roc_curve(y_test, y_score)
        auc_score = roc_auc_score(y_test, y_score)
        roc_data = (fpr, tpr, auc_score)

    return {
        'Model': name,
        'Accuracy': acc,
        'ROC AUC': auc_score,
        'Confusion Matrix': cm,
        'ROC Curve': roc_data
    }


In [None]:
# List of models to compare.
# Every estimator that accepts `random_state` is seeded so that a
# Restart & Run All reproduces the same metrics; GaussianNB takes no seed.
SEED = 42  # same value the train/test split uses
models = [
    SVC(probability=True, random_state=SEED),
    MLPClassifier(max_iter=1000, random_state=SEED),
    DecisionTreeClassifier(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    GradientBoostingClassifier(random_state=SEED),
    GaussianNB(),
    GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0), random_state=SEED)
]

# Evaluate all models and collect results (one result dict per model, in list order)
results = [train_and_evaluate(model) for model in models]

In [7]:
# Split each result dict into a scalar-metrics row and the inputs for the plots
performance_records = [
    {key: res[key] for key in ('Model', 'Accuracy', 'ROC AUC')}
    for res in results
]
confusion_matrices = {res['Model']: res['Confusion Matrix'] for res in results}
# Entries whose 'ROC Curve' is None (no continuous score) are left out
roc_curves = {
    res['Model']: res['ROC Curve']
    for res in results
    if res['ROC Curve']
}

# Rank the models: highest accuracy first, ROC AUC as the tie-breaker
performance_df = (
    pd.DataFrame(performance_records)
    .sort_values(by=['Accuracy', 'ROC AUC'], ascending=False)
)

In [None]:
# Plot ROC curves for every model that produced a continuous score
fig, ax = plt.subplots(figsize=(8, 6))
for name, (fpr, tpr, auc_score) in roc_curves.items():
    # Put the AUC in the legend so the curves can be compared at a glance
    ax.plot(fpr, tpr, label=f"{name} (AUC = {auc_score:.3f})")
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Draw one annotated heatmap per model's confusion matrix
for model_name, matrix in confusion_matrices.items():
    fig, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(
        matrix,
        annot=True, fmt='d', cmap='Blues', cbar=False,
        xticklabels=[0, 1], yticklabels=[0, 1],  # tick labels are hard-coded for two classes
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix - {model_name}', fontsize=14, fontweight='bold')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    fig.tight_layout()
    plt.show()

In [None]:
# Print the ranked metrics table under its heading (heading and table on separate lines)
print("\nModel Performance Ranking:", performance_df, sep="\n")