<a href="https://colab.research.google.com/github/erwanBellon/Project-ML-SDM/blob/main/code/maxent_model_habitat_amount_3000_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part I: Setup



In [None]:

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Cloning repo or fetch latest changes and path management
!git clone https://github.com/erwanBellon/Project-ML-SDM.git
%cd /content/Project-ML-SDM
!git pull

import os
from pathlib import Path

# Move into the project directory
%cd /content/Project-ML-SDM/code
print("Current working directory:", Path.cwd())

# Define main project dir and outputs
PROJECT_ROOT_DIR = Path.cwd().parent       # -> /content/2025_ML_EES/project
OUTPUTS_PATH = PROJECT_ROOT_DIR / "outputs"
OUTPUTS_PATH.mkdir(parents=True, exist_ok=True)
print("Outputs will be saved to:", OUTPUTS_PATH)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model, Input
assert tf.__version__ >= "2.0"

if not tf.config.list_physical_devices('GPU'):
    print("No GPU detected. CNNs can be slow without GPU.")

# Common imports
import pandas as pd
import numpy as np
!pip install rasterio
import rasterio
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
!pip install -U keras-tuner
import keras_tuner as kt

# To make notebook reproducible
np.random.seed(42)
tf.random.set_seed(42)

# For plots
import matplotlib.pyplot as plt
%matplotlib inline

# Load Tensorboard
%load_ext tensorboard

Cloning into 'Project-ML-SDM'...
remote: Enumerating objects: 362, done.[K
remote: Counting objects: 100% (362/362), done.[K
remote: Compressing objects: 100% (356/356), done.[K
remote: Total 362 (delta 310), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (362/362), 8.13 MiB | 15.98 MiB/s, done.
Resolving deltas: 100% (310/310), done.
/content/Project-ML-SDM
Already up to date.
/content/Project-ML-SDM/code
Current working directory: /content/Project-ML-SDM/code
Outputs will be saved to: /content/Project-ML-SDM/outputs


In [None]:
# Sanity check: print the working directory; the data paths used later are relative to it.
print(Path.cwd())

/content/Project-ML-SDM/code


# Part 2: Load files
## 2.1: Load the presences and absences landcover crops

In [None]:
# --- Load table data ---
rds_path = Path("../data/Table_preds/function_3_100.rds")
!pip install pyreadr
import pyreadr
result = pyreadr.read_r(rds_path)
table_df = result[None]

# Select relevant columns
feature_df = table_df[['MAP','MAT','habitat_amount_3000']].astype(float)

# Labels come from presence/absence index lists
# Load image index files (no images) (just to get the indices that I need from my datatable)
presences_path = Path("../data/cropped_landcover/presences")
absences_path = Path("../data/cropped_landcover/absences")

def extract_indices(folder):
    """Collect the integer indices encoded in the ``*.tif`` file names of ``folder``.

    Each file stem is split on "_" and its last piece is parsed as an int
    (for example ``crop_12.tif`` gives 12).

    Args:
        folder: ``pathlib.Path`` of the directory to scan (non-recursive).

    Returns:
        1-D integer ``np.ndarray`` of indices in ascending order. The array is
        integer-typed even when the folder holds no ``.tif`` file.

    Raises:
        ValueError: if the last "_"-separated piece of a stem is not an integer.
    """
    indices = [int(tif.stem.split("_")[-1]) for tif in folder.glob("*.tif")]
    # glob() yields files in a filesystem-dependent order; sorting keeps the row
    # order (and therefore any seeded split built on it) identical between runs.
    # The explicit dtype keeps an empty result usable as positional indices.
    return np.sort(np.array(indices, dtype=np.int64))

indices_pres = extract_indices(presences_path)
indices_abs = extract_indices(absences_path)

# Build label vector: 1 for every presence index, 0 for every absence index
y = np.concatenate([
    np.ones(len(indices_pres)),
    np.zeros(len(indices_abs))
]).astype(np.float32)

# Extract the matching feature rows, presences first, so the rows line up with y.
# The cast keeps the positions integer-typed even if one of the folders was empty.
# NOTE(review): iloc is positional and 0-based. If the file-name indices were
# written as 1-based row numbers (the table comes from an R .rds file), every
# row would be shifted by one — confirm against the code that produced the crops.
row_positions = np.concatenate([indices_pres, indices_abs]).astype(np.int64)
feature_all = feature_df
features = feature_all.iloc[row_positions]
features = features.reset_index(drop=True)

# Train/valid/test split (80/10/10), stratified on the label
X_train, X_temp, y_train, y_temp = train_test_split(
    features, y, test_size=0.2, random_state=42, stratify=y
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Min-max normalisation: the statistics come from the TRAIN split only and are
# then applied unchanged to the validation and test splits.
min_vals = X_train.min(axis=0)
max_vals = X_train.max(axis=0)
# A feature that is constant in the training split has max == min; dividing by
# zero would fill the column with NaN/inf, so its range is replaced by 1
# (the scaled training column is then all zeros).
range_vals = (max_vals - min_vals).replace(0, 1)

X_train = (X_train - min_vals) / range_vals
X_valid = (X_valid - min_vals) / range_vals
X_test  = (X_test  - min_vals) / range_vals





Collecting pyreadr
  Downloading pyreadr-0.5.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (1.3 kB)
Downloading pyreadr-0.5.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (776 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/776.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m768.0/776.2 kB[0m [31m31.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.2/776.2 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyreadr
Successfully installed pyreadr-0.5.4


Note that there is some data leakage here, as I pre-process part of my test set with the training data when normalizing the features. I don't have the time to change that; I realised it too late.

## 2.2 Preprocess the data

In [None]:
BATCH_SIZE = 8  # number of samples per batch


def preprocess_table(table, label):
    """Cast a (features, label) pair to float32.

    Args:
        table: feature tensor (anything ``tf.cast`` accepts).
        label: the matching label tensor.

    Returns:
        Tuple ``(table, label)`` with both elements cast to ``tf.float32``.
    """
    return tf.cast(table, tf.float32), tf.cast(label, tf.float32)

def _make_dataset(features, labels, shuffle=False):
    """Build a batched, prefetched tf.data pipeline from in-memory features and labels.

    With ``shuffle=True`` the samples are shuffled (buffer of 1000) before the
    float32 cast and batching; otherwise the original order is kept.
    """
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        dataset = dataset.shuffle(1000)
    dataset = dataset.map(preprocess_table)
    dataset = dataset.batch(BATCH_SIZE)
    return dataset.prefetch(tf.data.AUTOTUNE)


# Only the training data is shuffled; validation and test keep their row order.
train_ds = _make_dataset(X_train, y_train, shuffle=True)
valid_ds = _make_dataset(X_valid, y_valid)
test_ds = _make_dataset(X_test, y_test)

# Human-readable tag attached to each dataset object
train_ds.name = "Training"
valid_ds.name = "Validation"
test_ds.name = "Test"


# 3. ANN model Training

Build an optimised ANN, tuning its hyperparameters with Hyperband.

In [None]:
def build_model(hp):
    """Build and compile the tunable dense network for the tabular predictors.

    Args:
        hp: KerasTuner hyperparameter container. It supplies the width of each
            of the three hidden layers ("units_1".."units_3"), the dropout
            rate ("dropout") and the Adam learning rate ("lr").

    Returns:
        A compiled ``keras.Model`` that maps a 3-feature input to a single
        sigmoid output named "suitability", with binary cross-entropy loss and
        an AUC metric reported as "auc".
    """
    # Search space, declared in a fixed order: layer widths, dropout, learning rate
    hidden_widths = [
        hp.Choice("units_1", [8, 16, 32, 64]),
        hp.Choice("units_2", [4, 8, 16, 32]),
        hp.Choice("units_3", [4, 8, 16]),
    ]
    dropout_rate = hp.Choice("dropout", [0.2, 0.3, 0.4, 0.5])
    learning_rate = hp.Choice("lr", [1e-2, 1e-3, 5e-4, 1e-4])

    # Network: input -> three ReLU dense layers -> dropout -> sigmoid output
    table_input = keras.Input(shape=(3,), name="table_input")
    hidden = table_input
    for width in hidden_widths:
        hidden = layers.Dense(width, activation="relu")(hidden)
    hidden = layers.Dropout(dropout_rate)(hidden)
    output = layers.Dense(1, activation="sigmoid", name="suitability")(hidden)

    model = keras.Model(inputs=table_input, outputs=output)
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=[tf.keras.metrics.AUC(name="auc")],
    )
    return model


Create the Hyperband tuner

In [None]:
# Hyperband tuner: maximises validation AUC and trains any single trial for at most 50 epochs.
tuner = kt.Hyperband(
    build_model,
    objective=kt.Objective("val_auc", direction="max"),
    max_epochs=50,
    factor=3,
    seed=42,         # fixed seed so the sampled hyperparameter trials are repeatable
    directory="hyper_param_tuning",
    project_name="ANN_table_tuning",
    overwrite=True   # start fresh
)


In [None]:
# Early stopping on validation AUC: stop after 10 epochs without improvement
# and roll the model back to its best-scoring weights.
stop_early = keras.callbacks.EarlyStopping(
    mode="max",
    monitor="val_auc",
    patience=10,
    restore_best_weights=True,
)

# Run the hyperparameter search: train on train_ds, score on valid_ds.
# NOTE(review): Hyperband appears to assign its own epoch budget per trial
# (the best trial reports `tuner/epochs: 6`), so `epochs=50` is presumably
# overridden during the search — confirm against the KerasTuner docs.
search_options = dict(
    validation_data=valid_ds,
    epochs=50,
    callbacks=[stop_early],
)
tuner.search(train_ds, **search_options)


In [None]:
# Hyperparameters of the top-ranked trial
best_hp = tuner.get_best_hyperparameters(1)[0]

print("Best hyperparameters:")
for hp_name in best_hp.values:
    hp_value = best_hp.values[hp_name]
    print(f"  {hp_name}: {hp_value}")

# Model of the top-ranked trial, as returned by the tuner
best_model = tuner.get_best_models(1)[0]


Best hyperparameters:
  units_1: 16
  units_2: 32
  units_3: 8
  dropout: 0.4
  lr: 0.0005
  tuner/epochs: 6
  tuner/initial_epoch: 0
  tuner/bracket: 2
  tuner/round: 0


  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:

# Continue training the tuner's best model for up to 50 epochs, with the same
# early-stopping rule on validation AUC; `history` holds the per-epoch metrics.
fit_options = dict(
    validation_data=valid_ds,
    epochs=50,
    callbacks=[stop_early],
)
history = best_model.fit(train_ds, **fit_options)

Epoch 1/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 66ms/step - auc: 0.6722 - loss: 0.6264 - val_auc: 0.8583 - val_loss: 0.6183
Epoch 2/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - auc: 0.7092 - loss: 0.6292 - val_auc: 0.8604 - val_loss: 0.6069
Epoch 3/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - auc: 0.6396 - loss: 0.5957 - val_auc: 0.8583 - val_loss: 0.5983
Epoch 4/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - auc: 0.6483 - loss: 0.6270 - val_auc: 0.8396 - val_loss: 0.5900
Epoch 5/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - auc: 0.6707 - loss: 0.6168 - val_auc: 0.8333 - val_loss: 0.5827
Epoch 6/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - auc: 0.7452 - loss: 0.5918 - val_auc: 0.8208 - val_loss: 0.5710
Epoch 7/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - auc: 0.7279 -

In [None]:
# Score the held-out test set; evaluate() returns the loss followed by the compiled AUC metric.
test_metrics = best_model.evaluate(test_ds)
test_loss, test_auc = test_metrics
print("\n Final Test AUC =", test_auc)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - auc: 0.5926 - loss: 0.6591 

 Final Test AUC = 0.6666666269302368


MaxTSS is not implemented in Keras, so I compute it here:

In [None]:
def compute_maxTSS(y_true, y_prob, n_thresholds=200):
    """Find the maximum True Skill Statistic (TSS) over a grid of thresholds.

    TSS = sensitivity + specificity - 1. A sample is predicted positive when
    its probability is >= the threshold.

    Args:
        y_true: binary labels, where 1 marks the positive class and any other
            value the negative class. Any shape; flattened internally.
        y_prob: predicted probabilities, one per label. A column vector such
            as the (n, 1) output of ``model.predict`` is flattened as well.
        n_thresholds: number of evenly spaced thresholds tested in [0, 1].

    Returns:
        Tuple ``(best_tss, best_threshold)``. When several thresholds reach the
        same TSS, the lowest one is returned.

    Raises:
        ValueError: if the inputs differ in length, or if ``y_true`` does not
            contain both classes (sensitivity or specificity would be undefined).
    """
    is_positive = np.asarray(y_true).ravel() == 1
    y_prob = np.asarray(y_prob, dtype=float).ravel()
    if is_positive.shape != y_prob.shape:
        raise ValueError(
            f"y_true and y_prob must have the same length, "
            f"got {is_positive.size} and {y_prob.size}."
        )

    n_pos = int(is_positive.sum())
    n_neg = is_positive.size - n_pos
    if n_pos == 0 or n_neg == 0:
        raise ValueError("TSS is undefined unless y_true contains both classes.")

    best_tss = -2  # below the minimum possible TSS of -1, so the first threshold always wins
    best_threshold = None

    for t in np.linspace(0, 1, n_thresholds):
        predicted_positive = y_prob >= t
        tp = np.count_nonzero(predicted_positive & is_positive)
        tn = np.count_nonzero(~predicted_positive & ~is_positive)

        # Both denominators are non-zero (checked above), so no epsilon is
        # needed and the score is exact.
        sensitivity = tp / n_pos
        specificity = tn / n_neg
        tss = sensitivity + specificity - 1

        # Strict ">" keeps the first (lowest) threshold on ties.
        if tss > best_tss:
            best_tss = tss
            best_threshold = t

    return best_tss, best_threshold


In [None]:
# Predicted probabilities and the matching labels. test_ds is built without
# shuffling, so both passes over it see the samples in the same order.
y_prob = best_model.predict(test_ds)
label_batches = [labels for _, labels in test_ds]
y_true = np.concatenate(label_batches)

max_tss, best_threshold = compute_maxTSS(y_true, y_prob)

print("MaxTSS:", max_tss)
print("Optimal threshold:", best_threshold)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
MaxTSS: 0.44999999990249995
Optimal threshold: 0.34673366834170855


Save my best model and manually upload it to GitHub

In [None]:
from google.colab import files

# Save the tuned model to a .keras file, then hand the file to Colab's download helper.
model_filename = "Optimised_ANN_HA_3000_bestModel.keras"
best_model.save(model_filename)
files.download(model_filename)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>