In [1]:
#Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Load the modelling table from CSV; the location is named so it is easy to spot and change.
DATA_PATH = "inputs-for-ml/final_ml_data.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Predictor columns fed to every model, and the column the models learn to predict.
feature_columns = [
    "slope",
    "elevation",
    "north_gps",
    "east_gps",
    "vertical_gps",
    "coherence",
    "los_insar",
]
target_column = "bias"

X = df[feature_columns]
y = df[target_column]

In [5]:
# Split into training and testing sets BEFORE scaling. Fitting the scaler on
# the full dataset would let the held-out rows influence the scaling
# statistics (data leakage) and make the test metrics optimistic.
# test_size / random_state are unchanged, so the row partition is the same
# as when the pre-scaled matrix was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize features: fit on the training rows only, then apply the same
# transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole feature matrix under the train-fitted scaling; kept so any later cell
# that refers to X_scaled still works.
X_scaled = scaler.transform(X)

In [6]:
# Candidate regressors, keyed by the display name used in plot titles and the
# results table. Insertion order is the order in which they are trained.
# Estimators that accept a seed are pinned to random_state=42.
models = {}
models["Linear Regression"] = LinearRegression()
models["Support Vector Regression"] = SVR()
models["Neural Network Regression"] = MLPRegressor(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
)
models["Decision Tree Regression"] = DecisionTreeRegressor(random_state=42)
models["Random Forest Regression"] = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
models["Gradient Boosting Regression"] = GradientBoostingRegressor(
    n_estimators=100,
    random_state=42,
)

In [None]:
# Collect one record per model: {"Model", "RMSE", "R2"}, all measured on the
# held-out test split.
performance_list = []

# Train each model, score it on the test split, and draw a two-panel
# diagnostic figure (predicted-vs-actual scatter, residual Q-Q plot).
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    performance_list.append({"Model": name, "RMSE": rmse, "R2": r2})

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: predicted vs actual. The red dashed line is the
    # least-squares fit drawn by seaborn's regplot, not the 1:1 identity line.
    sns.regplot(
        x=y_test, y=y_pred,
        scatter_kws={"s": 20},
        line_kws={"color": "red", "linestyle": "--"},
        ax=axes[0]
    )
    axes[0].set_xlabel("Actual Bias")
    axes[0].set_ylabel("Predicted Bias")
    axes[0].set_title(f"{name} - Predicted vs Actual")
    axes[0].grid(True)
    axes[0].set_aspect('equal', adjustable='box')

    # Metrics box in the top-left corner (axes-fraction coordinates).
    textstr = f"RMSE: {rmse:.3f}\nR²: {r2:.3f}"
    axes[0].text(
        0.05, 0.95, textstr,
        transform=axes[0].transAxes,
        fontsize=10,
        verticalalignment='top',
        bbox=dict(boxstyle="round", facecolor="white", alpha=0.7)
    )

    # Right panel: normal Q-Q plot of the residuals (actual - predicted).
    residuals = y_test - y_pred
    stats.probplot(residuals, dist="norm", plot=axes[1])
    # probplot draws the sample points first and the fitted line second.
    axes[1].get_lines()[1].set_color('red')
    # probplot puts theoretical normal quantiles on x and the sorted sample
    # (here: residuals) on y, so the axes are labelled accordingly.
    axes[1].set_xlabel("Theoretical Quantiles (Normal)")
    axes[1].set_ylabel("Ordered Residuals")
    axes[1].set_title(f"{name} - Q-Q Plot of Residuals")
    axes[1].grid(True)

    plt.suptitle(f"{name} - Performance Analysis", fontsize=14, weight='bold')
    # Leave the top 5% of the figure free for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.95], w_pad=4)
    plt.show()
    # Release the figure so one-per-model figures do not pile up in memory.
    plt.close(fig)

In [None]:
# Build the results table in one pass: upper-case the column names, round the
# metrics to three decimals, and order the models from best to worst R2.
performance_df = (
    pd.DataFrame(performance_list)
    .rename(columns=lambda col: col.strip().upper())
    .round({"RMSE": 3, "R2": 3})
    .sort_values(by="R2", ascending=False)
    .reset_index(drop=True)
)

# Turn the 0-based positional index into a 1-based rank.
performance_df.index = performance_df.index + 1
performance_df.index.name = "Rank"

# Show the ranked table
print(performance_df)