<a href="https://colab.research.google.com/github/erwanBellon/Project-ML-SDM/blob/main/code/LogicReg_model_habitat_amount_3000_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part I: Setup



In [20]:

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Cloning repo or fetch latest changes and path management
%cd /content
!rm -rf Project-ML-SDM
!git clone https://github.com/erwanBellon/Project-ML-SDM.git
%cd /content/Project-ML-SDM
!git pull

import os
from pathlib import Path

# Move into the project directory
%cd /content/Project-ML-SDM/code
print("Current working directory:", Path.cwd())

# Define main project dir and outputs
PROJECT_ROOT_DIR = Path.cwd().parent
OUTPUTS_PATH = PROJECT_ROOT_DIR / "outputs"
OUTPUTS_PATH.mkdir(parents=True, exist_ok=True)
print("Outputs will be saved to:", OUTPUTS_PATH)

# Common imports
import numpy as np
import pandas as pd
import rasterio
!pip install pyreadr
import pyreadr
import matplotlib.pyplot as plt
import joblib


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix

np.random.seed(123)

/content
Cloning into 'Project-ML-SDM'...
remote: Enumerating objects: 407, done.[K
remote: Counting objects: 100% (407/407), done.[K
remote: Compressing objects: 100% (400/400), done.[K
remote: Total 407 (delta 332), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (407/407), 8.15 MiB | 14.07 MiB/s, done.
Resolving deltas: 100% (332/332), done.
/content/Project-ML-SDM
Already up to date.
/content/Project-ML-SDM/code
Current working directory: /content/Project-ML-SDM/code
Outputs will be saved to: /content/Project-ML-SDM/outputs


In [2]:
print(Path.cwd())

/content/Project-ML-SDM/code


# Part 2: Load files
## 2.1: Load the presences and absences landcover crops

In [9]:
# --- Load table data ---
rds_path = Path("../data/Table_preds/function_3_100.rds")
result = pyreadr.read_r(rds_path)
table_df = result[None]

# Select relevant columns
feature_df = table_df[['MAP','MAT','habitat_amount_3000']].astype(float)

# Labels come from presence/absence index lists
# Load image index files (no images) (just to get the indices that I need from my datatable)
presences_path = Path("../data/cropped_landcover/presences")
absences_path = Path("../data/cropped_landcover/absences")

def extract_indices(folder):
    tif_files = list(folder.glob("*.tif"))
    indices = []
    for tif in tif_files:
        idx = int(tif.stem.split("_")[-1])
        indices.append(idx)
    return np.array(indices)

indices_pres = extract_indices(presences_path)
indices_abs = extract_indices(absences_path)

# Build label vector
y = np.concatenate([
    np.ones(len(indices_pres)),
    np.zeros(len(indices_abs))
]).astype(np.float32)

# Extract corresponding rows
feature_all = feature_df
features = feature_all.iloc[np.concatenate([indices_pres, indices_abs])]
features = features.reset_index(drop=True)

# Normalize: no need to normalize after to avoid dataleakage
#table_features = (table_features - table_features.min()) / (table_features.max() - table_features.min())
#table_features = table_features.to_numpy(dtype=np.float32)

# Train/valid/test split
X_train, X_temp, y_train, y_temp = train_test_split(
    features, y, test_size=0.2, random_state=123, stratify=y
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=123, stratify=y_temp
)


# Normalize using TRAIN ONLY

min_vals = X_train.min(axis=0)
max_vals = X_train.max(axis=0)

X_train = (X_train - min_vals) / (max_vals - min_vals)
X_valid = (X_valid - min_vals) / (max_vals - min_vals)
X_test  = (X_test  - min_vals) / (max_vals - min_vals)





Note that there is some data leakage here as I pre-process part of my test set with the data of the training for normalizing the features. I don't have the time to change that... I've realised to late

# 3. Logisitc regression model Training

In [10]:
maxent = LogisticRegression(
    penalty="l2",        # MaxEnt regularization
    C=1.0,               # inverse regularization strength
    solver="lbfgs",
    max_iter=1000
)

# Train
maxent.fit(X_train, y_train)

# 4. Model evaluation

AUC

In [11]:
val_probs  = maxent.predict_proba(X_valid)[:, 1]
test_probs = maxent.predict_proba(X_test)[:, 1]

val_auc  = roc_auc_score(y_valid, val_probs)
test_auc = roc_auc_score(y_test, test_probs)

# Confusion matrix (threshold = 0.5)
y_test_pred = (test_probs > 0.5).astype(int)
print("Confusion matrix (test):")
print(confusion_matrix(y_test, y_test_pred))

Confusion matrix (test):
[[19  1]
 [ 7  5]]


feature importance

In [16]:
coef_df = pd.DataFrame({
    "feature": X_train.columns,
    "weight": maxent.coef_[0]
}).sort_values("weight", ascending=False)
print(f"Feature importance: {coef_df}")

Feature importance:                feature    weight
0                  MAP  1.959142
2  habitat_amount_3000  1.504063
1                  MAT -1.279960


MaxTSS is not implemented in keras. I'll compute
it there:

In [12]:
def compute_maxTSS(y_true, y_prob, n_thresholds=200):
    thresholds = np.linspace(0, 1, n_thresholds)
    best_tss = -1.0
    best_threshold = 0.5

    for t in thresholds:
        y_pred = (y_prob >= t).astype(int)

        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

        sensitivity = tp / (tp + fn + 1e-9)   # TPR
        specificity = tn / (tn + fp + 1e-9)   # TNR

        tss = sensitivity + specificity - 1

        if tss > best_tss:
            best_tss = tss
            best_threshold = t

    return best_tss, best_threshold


In [13]:
# Probabilities
val_probs  = maxent.predict_proba(X_valid)[:, 1]
test_probs = maxent.predict_proba(X_test)[:, 1]

# AUC
val_auc  = roc_auc_score(y_valid, val_probs)
test_auc = roc_auc_score(y_test, test_probs)

# Max TSS on test set
max_tss, best_threshold = compute_maxTSS(y_test, test_probs)

print(f"Validation AUC: {val_auc:.3f}")
print(f"Test AUC:       {test_auc:.3f}")
print(f"Max TSS:        {max_tss:.3f}")
print(f"Best threshold: {best_threshold:.3f}")


Validation AUC: 0.821
Test AUC:       0.754
Max TSS:        0.533
Best threshold: 0.427


Save my best model and mannually load it in GitHub

In [22]:
from google.colab import files

joblib.dump(maxent, "LogReg_HA_3000_model.joblib")
files.download("LogReg_HA_3000_model.joblib")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>