In [None]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, balanced_accuracy_score
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


# Notebook-wide settings: pin NumPy's legacy global RNG so stochastic steps
# repeat across runs, and let wide frames display up to 200 columns.
np.random.seed(42)
pd.options.display.max_columns = 200


In [None]:
# Locations of the YAML config and the processed model table, both resolved
# relative to the parent of the current working directory.
ROOT = Path("..")
CFG_PATH = ROOT.joinpath("configs", "config.yaml")
MODEL_PATH = ROOT.joinpath("data", "processed", "model_table.csv")

# Displayed as (config file found?, model table found?).
(CFG_PATH.exists(), MODEL_PATH.exists())


In [None]:
# Read the configuration file.
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# locale encoding, and fail early when the file is empty or is not a mapping,
# since `cfg` is indexed by key further down the notebook.
cfg = yaml.safe_load(CFG_PATH.read_text(encoding="utf-8"))
if not isinstance(cfg, dict):
    raise TypeError(
        f"Expected {CFG_PATH} to contain a YAML mapping, got {type(cfg).__name__}"
    )
cfg

In [None]:
# Load the processed model table (presumably produced by notebook 01 — confirm).
df = pd.read_csv(MODEL_PATH)

# Only the last expression of a cell is rendered, so print the shape and the
# column names explicitly instead of leaving them as a discarded expression.
print(df.shape, df.columns.tolist())

# Quick sanity check on key fields
df[["ph", "depth", "longitude", "latitude", "y"]].describe(include="all")


In [None]:
# Check the class balance of the dataset.
# Coercion turns unparseable labels into NaN, and NaN cannot be cast to int,
# so report such rows explicitly instead of letting `astype(int)` fail with
# an opaque casting error.
y_numeric = pd.to_numeric(df["y"], errors="coerce")
n_bad_labels = int(y_numeric.isna().sum())
if n_bad_labels:
    raise ValueError(
        f"{n_bad_labels} row(s) have a missing or non-numeric 'y' label; "
        "clean or drop them before modelling."
    )
df["y"] = y_numeric.astype(int)
df["y"].value_counts()


In [None]:
# Extent of the data along each coordinate axis.
lon_min = df["longitude"].min()
lon_max = df["longitude"].max()
lat_min = df["latitude"].min()
lat_max = df["latitude"].max()

# Displayed as (lon_min, lon_max, lat_min, lat_max).
lon_min, lon_max, lat_min, lat_max


In [None]:
# Spatial CV works on aggregated 'units' so that information from nearby wells
# does not end up in both train and test; build those units as square grid blocks.
GRID_SIZE = cfg["spatial_cv"]["grid_size_m"]  # block edge length from the config (noted as 5000 by default)

# A zero size divides by zero, a negative one mirrors the grid, and missing
# coordinates cannot be cast to an integer block index: fail with a clear message.
if not GRID_SIZE > 0:
    raise ValueError(
        f"spatial_cv.grid_size_m must be a positive number, got {GRID_SIZE!r}"
    )
n_missing_coords = int(df[["longitude", "latitude"]].isna().any(axis=1).sum())
if n_missing_coords:
    raise ValueError(
        f"{n_missing_coords} row(s) have a missing longitude/latitude; "
        "drop or impute them before assigning grid blocks."
    )

# NOTE(review): the config key suggests the size is in metres, while the columns
# are named longitude/latitude. If the coordinates are decimal degrees, dividing
# by a metre-scale size collapses the data into a handful of blocks.
# Confirm the coordinate units; the check below only flags the suspicious case.
if (
    GRID_SIZE > 1
    and df["longitude"].abs().max() <= 180
    and df["latitude"].abs().max() <= 90
):
    warnings.warn(
        "longitude/latitude values look like decimal degrees but the grid size "
        f"is {GRID_SIZE!r}; project the coordinates to a metric CRS or express "
        "the grid size in coordinate units."
    )

# Integer cell index along each axis, combined into one group label per well.
df["block_x"] = np.floor(df["longitude"] / GRID_SIZE).astype(int)
df["block_y"] = np.floor(df["latitude"] / GRID_SIZE).astype(int)
df["block_id"] = df["block_x"].astype(str) + "_" + df["block_y"].astype(str)

# Number of distinct blocks and the ten most populated ones.
df["block_id"].nunique(), df["block_id"].value_counts().head(10)


In [None]:
# Baseline: logistic regression on pH, depth and geology unit. It is the
# reference point before spatial features or other models are added.
num_features = ["ph", "depth"]
cat_features = ["geology_unit"]

# Numeric columns: fill gaps with the column median.
num_pipe = Pipeline(steps=[("impute", SimpleImputer(strategy="median"))])

# Categorical column: fill gaps with the most frequent value, then one-hot
# encode with unknown categories set to be ignored.
cat_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Columns not listed in either feature group are dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_features),
        ("cat", cat_pipe, cat_features),
    ],
    remainder="drop",
)

# Class-weighted logistic regression on top of the preprocessing step.
logreg = LogisticRegression(class_weight="balanced", max_iter=500)
baseline_pipe = Pipeline(steps=[("prep", preprocess), ("clf", logreg)])
