In [None]:
# === Setup: install & mount ===
!pip -q install rasterio geopandas shapely fiona scikit-learn joblib

import os
import numpy as np
import rasterio
from google.colab import drive

# Mount Google Drive at /content/drive (no forced remount).
drive.mount('/content/drive', force_remount=False)

# === Base data path (change if needed) ===
SAWAH_DIR = "/content/drive/MyDrive/Sawah2"  # replace with your working folder
# Stop immediately if the data folder does not exist.
assert os.path.exists(SAWAH_DIR), f"Folder tidak ditemukan: {SAWAH_DIR}"

# === Utilities ===
def read_raster(path):
    """Read band 1 of a raster file and return ``(array, profile)``.

    When the dataset declares a nodata value and the band is numeric, the
    band is cast to float32 and every nodata or non-finite cell is replaced
    with ``np.nan``. Without a declared nodata value the band is returned in
    its original dtype (e.g. integer class codes).

    Parameters
    ----------
    path : str
        Path of the raster file to open.

    Returns
    -------
    array : np.ndarray
        First band of the raster.
    profile : dict-like
        Copy of the dataset profile as reported by rasterio.
    """
    assert os.path.exists(path), f"Tidak ditemukan: {path}"
    with rasterio.open(path) as dataset:
        band = dataset.read(1)
        nodata_value = dataset.nodata
        profile = dataset.profile.copy()

    # NaN needs a float dtype, so the band is only converted when there is
    # a nodata value to mask out.
    if nodata_value is None or not np.issubdtype(band.dtype, np.number):
        return band, profile

    band = band.astype('float32', copy=False)
    invalid = (band == nodata_value) | (~np.isfinite(band))
    return np.where(invalid, np.nan, band), profile

def norm01(a):
    """Min-max normalise *a* to the range 0-1, ignoring missing values.

    Only finite cells take part in the min/max computation. NaN and +/-inf
    cells are treated as missing and are NaN in the output, so the missing
    data mask survives normalisation in every case.

    Parameters
    ----------
    a : array-like
        Input values of any numeric dtype; converted to float32.

    Returns
    -------
    np.ndarray
        float32 array with the same shape as *a*. Finite cells are scaled to
        [0, 1]; if all finite cells share one value they become 0. Missing
        cells are NaN. An empty or all-missing input yields an all-NaN array.
    """
    values = np.asarray(a, dtype='float32')
    finite = np.isfinite(values)

    # Start from "everything missing" so NaN cells are preserved even on the
    # degenerate paths below (previously they were silently turned into 0).
    out = np.full(values.shape, np.nan, dtype='float32')
    if not finite.any():
        return out

    valid = values[finite]
    lo = valid.min()
    hi = valid.max()
    if hi == lo:
        # Constant layer: no spread to scale by, map valid cells to 0.
        out[finite] = 0.0
        return out

    out[finite] = (valid - lo) / (hi - lo)
    return out


# **Random Forest–Based Suitability Mapping (2008–2016)**

In [None]:
"""
==============================================================
Script A — Random Forest & Suitability Mapping (2008) — 3-Channel
==============================================================

Author  : Firmaness
Date    : 2025-11-11
Version : 1.0
Purpose : Train a Random Forest model to generate 3-channel
          land-use suitability maps (Paddy Field, Built-up Land, Others)
          for the 2008 baseline year.

Environment:
    Python 3.10 (Google Colab)
    scikit-learn 1.3, numpy 1.24, pandas 2.0, rasterio 1.3,
    geopandas 0.13, shapely 2.0, matplotlib 3.7, statsmodels 0.14
"""

import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import numpy as np
import os

# Optional: required only when writing GeoTIFF multi-band output
# import rasterio

# =============================================================
# 1) LOAD DATA
# =============================================================

# Target variable (dependent variable): the 2008 land-use raster and its profile.
PL_08, PROFILE = read_raster(os.path.join(SAWAH_DIR, "PL_2008.tif"))

# Predictor factors selected from VIF analysis.
# File names are resolved relative to the folder passed to load_factors_stack().
CAT_FACTORS = ["KL.tif", "ZNT_Edit.tif"]  # categorical
CONT_FACTORS = [
    "Jalan_Arteri.tif", "Jalan_Kolektor.tif", "Jalan_Lokal.tif",
    "Distribusi_2008.tif"                   # additional factor for 2008
]

# =============================================================
# 2) CONFIGURATION FOR 3-CHANNEL AGGREGATION
#    Define class grouping (original PL_2008 / rf.classes_) → 3 groups
#    Example: 1 = Paddy field, 2 = Built-up, others = Remaining
# =============================================================

# Each key becomes one output channel; the value is either a list of class
# codes or the string "rest" (every class not claimed by a listed group).
GROUPS = {
    "Paddy_Field": [1],         # adjust based on class code in PL_2008
    "Built_Up_Land": [2],       # adjust based on built-up land code
    "Others": "rest"            # remaining classes not listed above
}

# =============================================================
# 3) UTILITY FUNCTIONS
# =============================================================

def load_factors_stack(folder):
    """
    Read every predictor raster and stack them into one array.

    Categorical layers (``CAT_FACTORS``) come first: non-finite cells are
    replaced by the sentinel -1 and the layer is stored as float32.
    Continuous layers (``CONT_FACTORS``) follow, each rescaled with
    ``norm01`` (missing cells stay NaN).

    Parameters
    ----------
    folder : str
        Directory that contains the raster files.

    Returns
    -------
    factors : np.ndarray
        Stacked layers with shape (H, W, B).
    names : list[str]
        File name of each band, in band order.
    cat_idx : list[int]
        Band indices of the categorical layers.
    """
    layers = []

    # Categorical layers occupy the leading bands.
    for fname in CAT_FACTORS:
        raw, _ = read_raster(os.path.join(folder, fname))
        layers.append(np.where(np.isfinite(raw), raw, -1).astype("float32"))
    cat_idx = list(range(len(layers)))

    # Continuous layers are appended after the categorical ones.
    for fname in CONT_FACTORS:
        raw, _ = read_raster(os.path.join(folder, fname))
        layers.append(norm01(raw))

    names = list(CAT_FACTORS) + list(CONT_FACTORS)
    return np.stack(layers, axis=-1), names, cat_idx


# Build the 2008 predictor stack, its band names and the categorical band indices.
FACTORS_2008, FACTOR_NAMES, CAT_IDX = load_factors_stack(SAWAH_DIR)

# =============================================================
# 4) RANDOM FOREST TRAINING
# =============================================================

def make_train_arrays(factors, target, cat_idx, nodata_class=255):
    """
    Flatten the rasters and keep only pixels usable for training.

    A pixel is kept when its target value is finite and differs from
    ``nodata_class``, every predictor band is finite, and no categorical
    band holds the -1 sentinel.

    Parameters
    ----------
    factors : np.ndarray
        Stacked predictor variables (H, W, B).
    target : np.ndarray
        Target land-use raster (H, W).
    cat_idx : list[int]
        Indices of categorical factors.
    nodata_class : int
        Target value that marks no-data pixels.

    Returns
    -------
    X : np.ndarray
        Predictor samples, one row per kept pixel.
    y : np.ndarray
        Integer target classes of the kept pixels.
    """
    _, _, n_bands = factors.shape
    flat_x = factors.reshape(-1, n_bands)
    flat_y = target.reshape(-1)

    # Target must be present and must not be the no-data class.
    keep = np.isfinite(flat_y) & (flat_y != nodata_class)

    # Every band must be finite ...
    keep &= np.isfinite(flat_x).all(axis=1)

    # ... and categorical bands must not carry the missing-value sentinel.
    cat_cols = [band for band in range(n_bands) if band in cat_idx]
    keep &= (flat_x[:, cat_cols] != -1).all(axis=1)

    return flat_x[keep], flat_y[keep].astype(int)


def tune_and_fit_rf(factors, target, factor_names, cat_idx, sample_size=50_000, seed=42):
    """
    Search Random Forest hyperparameters, then refit on all training pixels.

    The randomized search runs on at most ``sample_size`` randomly chosen
    training pixels; the best estimator is afterwards fitted again on the
    complete training set and a classification report on that same set is
    printed.

    Parameters
    ----------
    factors : np.ndarray
        Predictor variables (H, W, B).
    target : np.ndarray
        Target land-use classes (H, W).
    factor_names : list[str]
        Predictor variable names (accepted but not used by this function).
    cat_idx : list[int]
        Indices of categorical factors.
    sample_size : int, optional
        Maximum number of samples used for the hyperparameter search.
    seed : int, optional
        Seed for the subsample, the search and the forest.

    Returns
    -------
    rf : RandomForestClassifier
        Best estimator, refitted on the full training set.
    """
    from numpy.random import RandomState

    X_full, y_full = make_train_arrays(factors, target, cat_idx)

    # Tune on a random subsample when the training set exceeds sample_size.
    X_tune, y_tune = X_full, y_full
    if len(y_full) > sample_size:
        sampler = RandomState(seed)
        picks = sampler.choice(len(y_full), size=sample_size, replace=False)
        X_tune, y_tune = X_full[picks], y_full[picks]

    search_space = dict(
        n_estimators=[100, 150, 200, 250, 300],
        max_depth=[20, 30, 40, None],
        min_samples_split=[2, 5, 10, 20],
        min_samples_leaf=[1, 2, 4, 10],
        max_features=['sqrt', 'log2', None],
    )

    searcher = RandomizedSearchCV(
        RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=seed),
        param_distributions=search_space,
        n_iter=10,
        cv=3,
        random_state=seed,
        verbose=1,
        n_jobs=-1,
    )
    searcher.fit(X_tune, y_tune)
    print("Best parameters:", searcher.best_params_)

    # Refit the winning configuration on every valid training pixel.
    rf = searcher.best_estimator_
    rf.fit(X_full, y_full)
    print("\nRF report (full training):\n", classification_report(y_full, rf.predict(X_full)))
    print("Trained classes (rf.classes_):", rf.classes_)

    return rf


# Tune and train the 2008 model on the 2008 predictors and land-use target.
RF_08 = tune_and_fit_rf(FACTORS_2008, PL_08, FACTOR_NAMES, CAT_IDX)

# Save trained model
joblib.dump(RF_08, os.path.join(SAWAH_DIR, "RF_2008.pkl"))

# =============================================================
# 5) PREDICT PROBABILITIES & BUILD SUITABILITY MAP (ALL CLASSES)
# =============================================================

def predict_proba_batched(rf, X_flat, batch=500_000):
    """
    Predict class probabilities in row batches to limit peak memory.

    Parameters
    ----------
    rf : fitted classifier
        Object exposing ``predict_proba`` (and ``classes_``, which is only
        consulted when ``X_flat`` has no rows).
    X_flat : np.ndarray
        Samples with shape (N, B).
    batch : int, optional
        Maximum number of rows passed to ``predict_proba`` per call.

    Returns
    -------
    np.ndarray
        Probabilities with shape (N, C), rows in the same order as ``X_flat``.

    Raises
    ------
    ValueError
        If ``batch`` is not positive.
    """
    if batch <= 0:
        raise ValueError(f"batch must be a positive integer, got {batch!r}")

    n_rows = X_flat.shape[0]
    if n_rows == 0:
        # np.vstack([]) raises, so return a well-formed empty result instead.
        return np.empty((0, len(rf.classes_)), dtype=np.float64)

    chunks = [
        rf.predict_proba(X_flat[start:start + batch])
        for start in range(0, n_rows, batch)
    ]
    return np.vstack(chunks)


def build_suitability_map(rf, factors):
    """
    Predict a per-class probability ("suitability") raster for every pixel.

    Non-finite predictor values are replaced with 0 before prediction, so
    every pixel receives probabilities (the categorical -1 sentinel is left
    as it is).

    Parameters
    ----------
    rf : RandomForestClassifier
        Trained RF model.
    factors : np.ndarray
        Stacked predictor array (H, W, B).

    Returns
    -------
    suit : np.ndarray
        float32 suitability for all classes, shape (C, H, W).
    classes : np.ndarray
        Class labels in ascending order; ``suit[i]`` belongs to ``classes[i]``.
    """
    rows, cols, n_bands = factors.shape

    # Work on a copy so the caller's stack keeps its NaN cells.
    flat = factors.reshape(-1, n_bands).copy()
    flat[~np.isfinite(flat)] = 0

    proba = predict_proba_batched(rf, flat, batch=500_000)

    classes = np.array(sorted(rf.classes_))
    suit = np.zeros((len(classes), rows * cols), np.float32)

    # predict_proba columns follow rf.classes_; place each column in the
    # channel of its label within the sorted class array.
    for column, label in enumerate(rf.classes_):
        channel = np.where(classes == label)[0][0]
        suit[channel, :] = proba[:, column]

    return suit.reshape(len(classes), rows, cols), classes


# Per-class suitability (C, H, W) for 2008 plus the class code of each channel.
SUIT_2008_ALL, SUIT_CLASSES = build_suitability_map(RF_08, FACTORS_2008)

# Save full-class suitability maps
# (the class codes are stored in a separate file so channels can be mapped back to codes)
np.save(os.path.join(SAWAH_DIR, "SUIT_2008_all.npy"), SUIT_2008_ALL)
np.save(os.path.join(SAWAH_DIR, "SUIT_2008_all_classes.npy"), SUIT_CLASSES)
print("‚úÖ Suitability (all classes) saved to:", os.path.join(SAWAH_DIR, "SUIT_2008_all.npy"))

# =============================================================
# 6) AGGREGATE TO 3 CHANNELS: Paddy, Built-up, Others
# =============================================================

def aggregate_to_three_channels(suit_all, classes, groups_cfg):
    """
    Aggregate per-class suitability into land-use group channels.

    The probability of a group is the sum of the probabilities of its member
    classes. A group whose value is the string ``"rest"`` receives every
    class that is not listed in any explicit group, regardless of where the
    ``"rest"`` entry appears in ``groups_cfg``.

    Parameters
    ----------
    suit_all : np.ndarray
        Full-class suitability (C, H, W).
    classes : np.ndarray
        Land-use class code of each channel in ``suit_all``.
    groups_cfg : dict
        Group configuration, e.g.
        {"Paddy_Field": [1], "Built_Up_Land": [2], "Others": "rest"}.
        Member codes that do not occur in ``classes`` are ignored; a group
        without any matching class yields an all-zero channel.

    Returns
    -------
    suit3 : np.ndarray
        float32 aggregated suitability, shape (G, H, W) with one channel per
        entry of ``groups_cfg`` (3 for the usual configuration).
    names : list[str]
        Channel names in the same order as ``groups_cfg``.
    """
    n_classes, height, width = suit_all.shape

    # Map class code -> channel index (first occurrence wins).
    index_of = {}
    for pos, code in enumerate(np.asarray(classes).tolist()):
        index_of.setdefault(code, pos)

    # Pass 1: resolve explicit groups so that "rest" can be computed from
    # everything they claim, independent of dictionary order.
    explicit = {}
    claimed = set()
    for gname, members in groups_cfg.items():
        if isinstance(members, str) and members == "rest":
            continue
        idx = [index_of[m] for m in members if m in index_of]
        explicit[gname] = idx
        claimed.update(idx)
    rest_idx = [i for i in range(n_classes) if i not in claimed]

    # Pass 2: build the channels in configuration order.
    names, stacks = [], []
    for gname, members in groups_cfg.items():
        if isinstance(members, str) and members == "rest":
            idx = rest_idx
        else:
            idx = explicit[gname]
        if idx:
            stacks.append(np.nansum(suit_all[idx, ...], axis=0).astype(np.float32))
        else:
            stacks.append(np.zeros((height, width), dtype=np.float32))
        names.append(gname)

    suit3 = np.stack(stacks, axis=0).astype(np.float32)
    return suit3, names


# Collapse the per-class maps into the channels defined by GROUPS.
SUIT_2008_3, SUIT3_NAMES = aggregate_to_three_channels(SUIT_2008_ALL, SUIT_CLASSES, GROUPS)

# Save 3-channel suitability arrays
# (channel names are saved separately, in the same order as the channels)
np.save(os.path.join(SAWAH_DIR, "SUIT_2008_3ch.npy"), SUIT_2008_3)
np.save(os.path.join(SAWAH_DIR, "SUIT_2008_3ch_names.npy"), np.array(SUIT3_NAMES))
print("‚úÖ Suitability 3-channel (", SUIT3_NAMES, ") saved to:", os.path.join(SAWAH_DIR, "SUIT_2008_3ch.npy"))

# =============================================================
# 7) OPTIONAL — SAVE AS GEOTIFF 3-BAND STACK
# =============================================================
"""
with rasterio.open(os.path.join(SAWAH_DIR, "PL_2008.tif")) as src_ref:
    prof = src_ref.profile.copy()
    prof.update(count=3, dtype='float32', nodata=np.float32(np.nan), compress='lzw')
    outp = os.path.join(SAWAH_DIR, "SUIT_2008_3ch.tif")
    with rasterio.open(outp, 'w', **prof) as dst:
        for i in range(3):
            dst.write(SUIT_2008_3[i].astype('float32'), i + 1)
        # Save band names as metadata
        dst.update_tags(1, name=SUIT3_NAMES[0])
        dst.update_tags(2, name=SUIT3_NAMES[1])
        dst.update_tags(3, name=SUIT3_NAMES[2])
    print("üó∫Ô∏è GeoTIFF 3-band file saved:", outp)
"""

# =============================================================
# 8) NOTES
# =============================================================
# - Ensure GROUPS match the class codes in PL_2008.
# - If codes differ (e.g., 10 = Paddy_Field, 20 = Built_Up), update the GROUPS dictionary.
# - The "Others" channel automatically sums unlisted classes if set to "rest".
# - The final GeoTIFF output will have 3 bands representing each land-use group.


# **Random Forest–Based Suitability Mapping (2016–2024)**

In [None]:
# ===================== Script A — Random Forest & Suitability (2016) — 3-Channel =====================
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import numpy as np
import os

# (Optional) Only required when writing multi-band GeoTIFF output
# import rasterio

# =============================================================
# 1) LOAD DATA
# =============================================================
# Target year; used to build the target raster path and all output file names.
YEAR = 2016
PL_YR_PATH = os.path.join(SAWAH_DIR, f"PL_{YEAR}.tif")
PL_YR, PROFILE = read_raster(PL_YR_PATH)  # <-- TARGET (dependent variable)
print("Loaded:", PL_YR_PATH)

# Predictor factors selected from VIF results (keep identical to Script 2008 — only the target year differs)
# NOTE: the yearly factor file name below is hard-coded and does not follow YEAR.
CAT_FACTORS  = ["KL.tif", "ZNT_Edit.tif"]  # categorical
CONT_FACTORS = [
    "Jalan_Arteri.tif", "Jalan_Kolektor.tif", "Jalan_Lokal.tif",
    "Distribusi_2016.tif"                   # same as Script 2008 (only year differs)
]

# =============================================================
# 2) 3-CHANNEL AGGREGATION CONFIGURATION
#    Mapping of ORIGINAL CLASSES (codes in PL_YEAR / rf.classes_) → 3 groups
#    UPDATE the list below based on your legend.
#    Common example: 1=Paddy Field, 2=Built-up, others=Remaining
#    If unsure about class codes, run once to print rf.classes_.
# =============================================================
# Keys become the saved channel names: Sawah = paddy field,
# Lahan_Terbangun = built-up land, Lainnya = others.
# NOTE(review): the 2008 cell uses English keys for the same groups — confirm
# which naming downstream consumers expect.
GROUPS = {
    "Sawah": [1],                # adjust according to the paddy field class code in PL
    "Lahan_Terbangun": [2],      # adjust according to built-up land class code in PL
    "Lainnya": "rest"            # remaining classes not listed above
}

# =============================================================
# 3) UTILITY FUNCTIONS
# =============================================================

def load_factors_stack(folder):
    """
    Read every predictor raster and stack them into one array.

    Categorical layers (``CAT_FACTORS``) come first: non-finite cells are
    replaced by the sentinel -1 and the layer is stored as float32.
    Continuous layers (``CONT_FACTORS``) follow, each rescaled with
    ``norm01`` (missing cells stay NaN).

    Parameters
    ----------
    folder : str
        Directory that contains the raster files.

    Returns
    -------
    factors : np.ndarray
        Stacked layers with shape (H, W, B).
    names : list[str]
        File name of each band, in band order.
    cat_idx : list[int]
        Band indices of the categorical layers.
    """
    layers = []

    # Categorical layers occupy the leading bands.
    for fname in CAT_FACTORS:
        raw, _ = read_raster(os.path.join(folder, fname))
        layers.append(np.where(np.isfinite(raw), raw, -1).astype('float32'))
    cat_idx = list(range(len(layers)))

    # Continuous layers are appended after the categorical ones.
    for fname in CONT_FACTORS:
        raw, _ = read_raster(os.path.join(folder, fname))
        layers.append(norm01(raw))

    names = list(CAT_FACTORS) + list(CONT_FACTORS)
    return np.stack(layers, axis=-1), names, cat_idx

# Build the predictor stack for YEAR, its band names and the categorical band indices.
FACTORS_YR, FACTOR_NAMES, CAT_IDX = load_factors_stack(SAWAH_DIR)

# =============================================================
# 4) RANDOM FOREST TRAINING
# =============================================================

def make_train_arrays(factors, target, cat_idx, nodata_class=255):
    """
    Flatten the rasters and keep only pixels usable for training.

    A pixel is kept when its target value is finite and differs from
    ``nodata_class``, every predictor band is finite, and no categorical
    band holds the -1 sentinel.

    Parameters
    ----------
    factors : np.ndarray
        Stacked predictor variables (H, W, B).
    target : np.ndarray
        Target land-use raster (H, W).
    cat_idx : list[int]
        Indices of categorical factors.
    nodata_class : int
        Target value that marks no-data pixels.

    Returns
    -------
    X : np.ndarray
        Predictor samples, one row per kept pixel.
    y : np.ndarray
        Integer target classes of the kept pixels.
    """
    _, _, n_bands = factors.shape
    flat_x = factors.reshape(-1, n_bands)
    flat_y = target.reshape(-1)

    # Target must be present and must not be the no-data class.
    keep = np.isfinite(flat_y) & (flat_y != nodata_class)

    # Every band must be finite ...
    keep &= np.isfinite(flat_x).all(axis=1)

    # ... and categorical bands must not carry the missing-value sentinel.
    cat_cols = [band for band in range(n_bands) if band in cat_idx]
    keep &= (flat_x[:, cat_cols] != -1).all(axis=1)

    return flat_x[keep], flat_y[keep].astype(int)


def tune_and_fit_rf(factors, target, factor_names, cat_idx, sample_size=50_000, seed=42):
    """
    Search Random Forest hyperparameters, then refit on all training pixels.

    The randomized search runs on at most ``sample_size`` randomly chosen
    training pixels; the best estimator is afterwards fitted again on the
    complete training set and a classification report on that same set is
    printed.

    Parameters
    ----------
    factors : np.ndarray
        Predictor variables (H, W, B).
    target : np.ndarray
        Target land-use classes (H, W).
    factor_names : list[str]
        Predictor variable names (accepted but not used by this function).
    cat_idx : list[int]
        Indices of categorical factors.
    sample_size : int, optional
        Maximum number of samples used for the hyperparameter search.
    seed : int, optional
        Seed for the subsample, the search and the forest.

    Returns
    -------
    rf : RandomForestClassifier
        Best estimator, refitted on the full training set.
    """
    from numpy.random import RandomState

    X_full, y_full = make_train_arrays(factors, target, cat_idx)

    # Tune on a random subsample when the training set exceeds sample_size.
    X_tune, y_tune = X_full, y_full
    if len(y_full) > sample_size:
        sampler = RandomState(seed)
        picks = sampler.choice(len(y_full), size=sample_size, replace=False)
        X_tune, y_tune = X_full[picks], y_full[picks]

    search_space = dict(
        n_estimators=[100, 150, 200, 250, 300],
        max_depth=[20, 30, 40, None],
        min_samples_split=[2, 5, 10, 20],
        min_samples_leaf=[1, 2, 4, 10],
        max_features=['sqrt', 'log2', None],
    )

    searcher = RandomizedSearchCV(
        RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=seed),
        param_distributions=search_space,
        n_iter=10,
        cv=3,
        random_state=seed,
        verbose=1,
        n_jobs=-1,
    )
    searcher.fit(X_tune, y_tune)
    print("Best parameters:", searcher.best_params_)

    # Refit the winning configuration on every valid training pixel.
    rf = searcher.best_estimator_
    rf.fit(X_full, y_full)
    print("\nRF report (full training):\n", classification_report(y_full, rf.predict(X_full)))
    print("Trained classes (rf.classes_):", rf.classes_)
    return rf

# Tune and train the model for YEAR on that year's predictors and land-use target.
RF_YR = tune_and_fit_rf(FACTORS_YR, PL_YR, FACTOR_NAMES, CAT_IDX)

# Save trained model
joblib.dump(RF_YR, os.path.join(SAWAH_DIR, f"RF_{YEAR}.pkl"))

# =============================================================
# 5) PREDICT PROBABILITIES & BUILD SUITABILITY MAPS (ALL CLASSES)
# =============================================================

def predict_proba_batched(rf, X_flat, batch=500_000):
    """
    Predict class probabilities in row batches to limit peak memory.

    Parameters
    ----------
    rf : fitted classifier
        Object exposing ``predict_proba`` (and ``classes_``, which is only
        consulted when ``X_flat`` has no rows).
    X_flat : np.ndarray
        Samples with shape (N, B).
    batch : int, optional
        Maximum number of rows passed to ``predict_proba`` per call.

    Returns
    -------
    np.ndarray
        Probabilities with shape (N, C), rows in the same order as ``X_flat``.

    Raises
    ------
    ValueError
        If ``batch`` is not positive.
    """
    if batch <= 0:
        raise ValueError(f"batch must be a positive integer, got {batch!r}")

    n_rows = X_flat.shape[0]
    if n_rows == 0:
        # np.vstack([]) raises, so return a well-formed empty result instead.
        return np.empty((0, len(rf.classes_)), dtype=np.float64)

    chunks = [
        rf.predict_proba(X_flat[start:start + batch])
        for start in range(0, n_rows, batch)
    ]
    return np.vstack(chunks)


def build_suitability_map(rf, factors):
    """
    Predict a per-class probability ("suitability") raster for every pixel.

    Non-finite predictor values are replaced with 0 before prediction, so
    every pixel receives probabilities (the categorical -1 sentinel is left
    as it is).

    Parameters
    ----------
    rf : RandomForestClassifier
        Trained RF model.
    factors : np.ndarray
        Stacked predictor array (H, W, B).

    Returns
    -------
    suit : np.ndarray
        float32 suitability for all classes, shape (C, H, W).
    classes : np.ndarray
        Class labels in ascending order; ``suit[i]`` belongs to ``classes[i]``.
    """
    rows, cols, n_bands = factors.shape

    # Work on a copy so the caller's stack keeps its NaN cells.
    flat = factors.reshape(-1, n_bands).copy()
    flat[~np.isfinite(flat)] = 0

    proba = predict_proba_batched(rf, flat, batch=500_000)

    classes = np.array(sorted(rf.classes_))
    suit = np.zeros((len(classes), rows * cols), np.float32)

    # predict_proba columns follow rf.classes_; place each column in the
    # channel of its label within the sorted class array.
    for column, label in enumerate(rf.classes_):
        channel = np.where(classes == label)[0][0]
        suit[channel, :] = proba[:, column]

    return suit.reshape(len(classes), rows, cols), classes

# Per-class suitability (C, H, W) for YEAR plus the class code of each channel.
SUIT_YR_ALL, SUIT_CLASSES = build_suitability_map(RF_YR, FACTORS_YR)

# Save suitability maps (NPY) — all classes
# (the class codes are stored in a separate file so channels can be mapped back to codes)
np.save(os.path.join(SAWAH_DIR, f"SUIT_{YEAR}_all.npy"), SUIT_YR_ALL)
np.save(os.path.join(SAWAH_DIR, f"SUIT_{YEAR}_all_classes.npy"), SUIT_CLASSES)
print("‚úÖ Suitability (all classes) saved:", os.path.join(SAWAH_DIR, f"SUIT_{YEAR}_all.npy"))

# =============================================================
# 6) AGGREGATE INTO 3 CHANNELS: Sawah, Lahan Terbangun, Lainnya
#    Group-level probability = sum of probabilities of member classes
# =============================================================

def aggregate_to_three_channels(suit_all, classes, groups_cfg):
    """
    Aggregate per-class suitability into land-use group channels.

    The probability of a group is the sum of the probabilities of its member
    classes. A group whose value is the string ``"rest"`` receives every
    class that is not listed in any explicit group, regardless of where the
    ``"rest"`` entry appears in ``groups_cfg``.

    Parameters
    ----------
    suit_all : np.ndarray
        Full-class suitability (C, H, W).
    classes : np.ndarray
        Land-use class code of each channel in ``suit_all``.
    groups_cfg : dict
        Group configuration, mapping {"GroupName": [codes...]} or "rest".
        Member codes that do not occur in ``classes`` are ignored; a group
        without any matching class yields an all-zero channel.

    Returns
    -------
    suit3 : np.ndarray
        float32 aggregated suitability, shape (G, H, W) with one channel per
        entry of ``groups_cfg`` (3 for the usual configuration).
    names : list[str]
        Channel names in the same order as ``groups_cfg``.
    """
    n_classes, height, width = suit_all.shape

    # Map class code -> channel index (first occurrence wins).
    index_of = {}
    for pos, code in enumerate(np.asarray(classes).tolist()):
        index_of.setdefault(code, pos)

    # Pass 1: resolve explicit groups so that "rest" can be computed from
    # everything they claim, independent of dictionary order.
    explicit = {}
    claimed = set()
    for gname, members in groups_cfg.items():
        if isinstance(members, str) and members == "rest":
            continue
        idx = [index_of[m] for m in members if m in index_of]
        explicit[gname] = idx
        claimed.update(idx)
    rest_idx = [i for i in range(n_classes) if i not in claimed]

    # Pass 2: build the channels in configuration order.
    names, stacks = [], []
    for gname, members in groups_cfg.items():
        if isinstance(members, str) and members == "rest":
            idx = rest_idx
        else:
            idx = explicit[gname]
        if idx:
            stacks.append(np.nansum(suit_all[idx, ...], axis=0).astype(np.float32))
        else:
            stacks.append(np.zeros((height, width), dtype=np.float32))
        names.append(gname)

    suit3 = np.stack(stacks, axis=0).astype(np.float32)
    return suit3, names

# Collapse the per-class maps into the channels defined by GROUPS.
SUIT_YR_3, SUIT3_NAMES = aggregate_to_three_channels(SUIT_YR_ALL, SUIT_CLASSES, GROUPS)

# Save 3-channel NPY (order follows SUIT3_NAMES)
np.save(os.path.join(SAWAH_DIR, f"SUIT_{YEAR}_3ch.npy"), SUIT_YR_3)
np.save(os.path.join(SAWAH_DIR, f"SUIT_{YEAR}_3ch_names.npy"), np.array(SUIT3_NAMES))
print("‚úÖ Suitability 3-channel (", SUIT3_NAMES, ") saved:", os.path.join(SAWAH_DIR, f"SUIT_{YEAR}_3ch.npy"))

# =============================================================
# 7) (OPTIONAL) SAVE AS 3-BAND GEOTIFF (STACKED)
#    Band-1 = Sawah, Band-2 = Lahan_Terbangun, Band-3 = Lainnya
#    Activate by removing the triple quotes around the block below (and importing rasterio)
# =============================================================
"""
with rasterio.open(os.path.join(SAWAH_DIR, f"PL_{YEAR}.tif")) as src_ref:
    prof = src_ref.profile.copy()
    prof.update(count=3, dtype='float32', nodata=np.float32(np.nan), compress='lzw')
    outp = os.path.join(SAWAH_DIR, f"SUIT_{YEAR}_3ch.tif")
    with rasterio.open(outp, 'w', **prof) as dst:
        for i in range(3):
            dst.write(SUIT_YR_3[i].astype('float32'), i+1)
        # Save band names as metadata
        dst.update_tags(1, name=SUIT3_NAMES[0])
        dst.update_tags(2, name=SUIT3_NAMES[1])
        dst.update_tags(3, name=SUIT3_NAMES[2])
    print("üó∫Ô∏è GeoTIFF 3-band file saved:", outp)
"""

# =============================================================
# 8) NOTES
# - Structure, functions, and factors are identical to Script 2008.
# - The only difference is using the 2016 target map, and outputs are named according to YEAR.
# - If changing the yearly factor (e.g., Distribusi_2016.tif), manually update CONT_FACTORS to keep consistency across years.
