In [2]:
# Mount Google Drive at /content/drive so files under MyDrive become readable/writable.
# NOTE(review): the data/result paths defined later are relative ("drive/MyDrive/..."),
# so they presumably rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [3]:
# Turn on Colab's custom widget manager for this session.
# NOTE(review): presumably required so the tqdm.auto progress bars render as
# ipywidgets — confirm; note it runs before the ipywidgets install in the next cell.
from google.colab import output
output.enable_custom_widget_manager()

In [4]:
!pip -q install ipywidgets==8
!pip -q install tqdm tqdm-joblib

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/133.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m1.4/2.2 MB[0m [31m43.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m98.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:

# 1) Imports
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ParameterGrid
from tqdm.auto import tqdm
from tqdm_joblib import tqdm_joblib
from sklearn.metrics import classification_report, confusion_matrix

def num_candidates(grid):
    """Count the parameter combinations described by ``grid``.

    Parameters
    ----------
    grid : dict or list of dict
        A single parameter grid, or a list of grids whose candidate counts
        are added together.

    Returns
    -------
    int
        Total number of candidates; ``0`` for an empty list.
    """
    # A single (non-list) grid is measured directly.
    if not isinstance(grid, list):
        return len(ParameterGrid(grid))

    # A list of grids contributes the sum of its sub-grid sizes.
    total = 0
    for sub_grid in grid:
        total += len(ParameterGrid(sub_grid))
    return total

# Hide FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Single seed shared by NumPy's global RNG and the train/test split.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Relative paths: they resolve against the current working directory.
DATA_PATH = Path("drive/MyDrive/forestCover.csv")
RESULTS_DIR = Path("drive/MyDrive/ML Assign 2")
# exist_ok tolerates an already-existing folder; the parent folder must exist.
RESULTS_DIR.mkdir(exist_ok=True)

# 2) Load and basic cleaning
# "?" cells in the CSV are parsed as missing values (NaN).
df = pd.read_csv(DATA_PATH, na_values=["?"])
# The target is chosen positionally: whichever column is last in the file.
# It is captured before any column is dropped below.
target_col = df.columns[-1]

# swap Water_Level and Observation_ID
# NOTE(review): both columns are listed in `drop_if_present` below, so this
# swap has no effect on the final frame as written — confirm it is still needed.
if {"Water_Level", "Observation_ID"}.issubset(df.columns):
    observation = df["Water_Level"]
    df["Water_Level"] = df["Observation_ID"]
    df["Observation_ID"] = observation

# map Soil_Type1 if text labels exist
# replace() only touches "positive"/"negative"; any other value is kept and
# must be numeric (or NaN) for the float32 cast to succeed.
if "Soil_Type1" in df.columns:
    df["Soil_Type1"] = (
        df["Soil_Type1"].replace({"positive": 1, "negative": 0}).astype("float32")
    )

# drop known unusable columns if present
# Columns missing from the file are skipped instead of raising.
# NOTE(review): in-place drop mutates `df`; re-running this cell alone is a no-op.
drop_if_present = ["Water_Level", "Observation_ID", "Inclination", "Aspect"]
df.drop(columns=[c for c in drop_if_present if c in df.columns], inplace=True)

# remove obvious outliers
# Rows with a hydrology distance above 10000 are discarded; rows where the
# value is NaN compare False and are therefore kept for later imputation.
if "Horizontal_Distance_To_Hydrology" in df.columns:
    bad = df["Horizontal_Distance_To_Hydrology"] > 10000
    if int(bad.sum()) > 0:
        df = df.loc[~bad].copy()

# 3) Feature lists
# Columns treated as binary: Wilderness_Area1..4 followed by Soil_Type1..40.
binary_cols_all = (
    [f"Wilderness_Area{i}" for i in range(1, 5)]
    + [f"Soil_Type{i}" for i in range(1, 41)]
)
# Every column except the target is a feature. Binary features keep the order
# of `binary_cols_all`; all remaining features are handled as numeric.
feature_cols = [name for name in df.columns if name != target_col]
binary_cols = [name for name in binary_cols_all if name in feature_cols]
numeric_cols = [name for name in feature_cols if name not in binary_cols]

# 4) Train test split
# 80/20 split, stratified on the target so class proportions are preserved.
X = df.loc[:, feature_cols].copy()
y = df.loc[:, target_col].copy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=RANDOM_STATE,
    stratify=y,
)


In [6]:
# Positions of the binary features in the preprocessed matrix: the
# ColumnTransformer lists the numeric block before the binary block, so the
# binary columns occupy the slots right after the numeric ones.
gower_bin_idx = len(numeric_cols) + np.arange(len(binary_cols), dtype=int)

def gower_distance(u, v, *, bin_idx):
    """Gower-style distance between two feature vectors.

    Numeric positions contribute ``|u_i - v_i|`` and binary positions
    contribute ``1`` on mismatch / ``0`` on match; the result is the mean
    contribution over all positions. Numeric features are not range-scaled
    here, so callers are expected to pass already-scaled values.

    Parameters
    ----------
    u, v : array-like, 1-D, same length
        The two samples. Any numeric dtype (or plain lists) is accepted; the
        values are converted to float so unsigned-integer input cannot wrap
        around in the subtraction.
    bin_idx : array-like of int or bool
        Index positions (or boolean mask) of the binary features. May be
        empty, in which case every position is treated as numeric.

    Returns
    -------
    float
        Mean per-feature distance (``nan`` for zero-length vectors).
    """
    # No copy is made when the inputs are already float arrays.
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    # Accept lists/tuples as well as arrays; keep bool masks as masks.
    bin_idx = np.asarray(bin_idx)

    # `d` is a fresh array, so the caller's vectors are never modified.
    d = np.abs(u - v)
    if bin_idx.size:
        # Simple-matching term: any difference on a binary feature counts as 1.
        d[bin_idx] = (u[bin_idx] != v[bin_idx]).astype(float)
    return float(d.mean())

In [7]:
def dump_gridsearch(gs, X_test=None, y_test=None, name="gs"):
    """Print a summary of a fitted grid search and save its CV table.

    Parameters
    ----------
    gs : fitted search object
        Must expose ``refit``, ``cv_results_``, ``best_index_``,
        ``best_params_``, ``best_estimator_`` and (for the optional report)
        ``predict``.
    X_test, y_test : optional
        When both are given, a classification report and confusion matrix for
        ``gs.predict(X_test)`` are printed as well.
    name : str
        Prefix of the CSV written to ``RESULTS_DIR`` (module-level path).

    Returns
    -------
    None
        Everything is reported via ``print`` / ``display`` and the CSV file.
    """
    # basics
    print("refit:", gs.refit)
    print("n_splits_:", getattr(gs, "n_splits_", None))
    results = gs.cv_results_
    print("n_candidates:", len(results["params"]))
    best_idx = gs.best_index_
    print("best_index_:", best_idx)
    print("best_params_:", gs.best_params_)
    print("refit_time_:", getattr(gs, "refit_time_", None))

    # available scorers, recovered from the "mean_test_<scorer>" keys
    prefix = "mean_test_"
    scorers = sorted({key.split(prefix)[1] for key in results if key.startswith(prefix)})
    for s in scorers:
        print(f"best mean {s}:", results[f"mean_test_{s}"][best_idx])

    # full cv table (sorted by refit scorer), save to disk
    timing_cols = ["params", "mean_fit_time", "std_fit_time", "mean_score_time", "std_score_time"]
    score_cols = [f"{stat}_test_{s}" for s in scorers for stat in ("mean", "std", "rank")]
    split_suffixes = tuple(f"test_{s}" for s in scorers)
    split_cols = [
        key for key in results
        if key.startswith("split") and key.endswith(split_suffixes)
    ]
    wanted = timing_cols + score_cols + split_cols
    cols = [col for col in wanted if col in results]

    table = pd.DataFrame(results)[cols]
    # With a string refit the table follows that scorer's ranking; otherwise
    # the alphabetically first scorer is used.
    if isinstance(gs.refit, str):
        sort_key = f"rank_test_{gs.refit}"
    else:
        sort_key = f"rank_test_{scorers[0]}"
    table = table.sort_values(sort_key, ascending=True)
    display(table.head(20))
    out_csv = RESULTS_DIR / f"{name}_cv_results.csv"
    table.to_csv(out_csv, index=False)
    print("saved:", out_csv)

    # best estimator object and its full param dict
    print("\nbest_estimator_:\n", gs.best_estimator_)
    print("\nbest_estimator_.get_params():")
    best_params = gs.best_estimator_.get_params(deep=True)
    for key in sorted(best_params):
        print(f"  {key}: {best_params[key]}")

    # optional test-set report (needs both features and labels)
    if X_test is None or y_test is None:
        return
    y_pred = gs.predict(X_test)
    print("\nTest classification_report:\n", classification_report(y_test, y_pred))
    print("\nTest confusion_matrix:\n", confusion_matrix(y_test, y_pred))


In [8]:

# Numeric features: fill missing values with the column median, then rescale
# with MinMaxScaler.
numeric_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", MinMaxScaler())
])

# Binary features: fill missing values with the most frequent value; no scaling.
binary_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent"))
])

# Numeric block is listed first and binary block second; `gower_bin_idx`
# depends on exactly this ordering. Columns in neither list are dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, numeric_cols),
        ("bin", binary_pipe, binary_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# Full model: preprocessing followed by brute-force KNN. The step name "clf"
# is what the "clf__*" keys in the parameter grids refer to.
# NOTE(review): n_jobs is set here AND on every GridSearchCV that wraps this
# pipeline — possible nested parallelism / oversubscription; confirm intended.
# NOTE(review): os.cpu_count() can return None, which would presumably fall
# back to the estimator's default n_jobs; n_jobs=-1 is the usual spelling.
pipe = Pipeline([
    ("pre", preprocess),
    ("clf", KNeighborsClassifier(algorithm="brute", n_jobs=os.cpu_count()))
])

# 3-fold stratified CV used by the grid searches (seeded with 0, not RANDOM_STATE).
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# NOTE(review): this standalone classifier is not referenced by any cell
# visible here — looks unused; confirm before removing.
clf = KNeighborsClassifier(n_jobs=os.cpu_count())

In [9]:
# grid_all = [
#     {
#         "clf__n_neighbors": [3, 5, 7, 10],
#         "clf__weights": ["distance"],
#         "clf__metric": ["euclidean", "manhattan", "cosine"],
#     },
#     {
#         "clf__n_neighbors": [3, 5, 7, 10],
#         "clf__weights": ["distance"],
#         "clf__metric": [gower_distance],
#         "clf__metric_params": [{"bin_idx": gower_bin_idx}]
#     }
# ]

def _knn_grid(n_neighbors, metrics, **extra):
    """Build one parameter grid for the ``clf`` (KNN) step of the pipeline.

    Parameters
    ----------
    n_neighbors : iterable of int
        Values to try for ``clf__n_neighbors``.
    metrics : iterable
        Values to try for ``clf__metric`` (metric names or callables).
    **extra
        Additional grid entries appended after the three standard keys,
        e.g. ``clf__metric_params=[...]``.

    Returns
    -------
    dict
        A fresh grid; ``clf__weights`` is always ``["distance"]``.
    """
    grid = {
        "clf__n_neighbors": list(n_neighbors),
        "clf__weights": ["distance"],
        "clf__metric": list(metrics),
    }
    grid.update(extra)
    return grid


def _knn_grids_with_gower(k):
    """Return the two-part grid for a fixed ``k``: built-in metrics + Gower.

    The Gower part is kept as a separate sub-grid because only the callable
    metric needs ``metric_params``.
    """
    return [
        _knn_grid([k], ["euclidean", "manhattan", "cosine"]),
        _knn_grid(
            [k],
            [gower_distance],
            clf__metric_params=[{"bin_idx": gower_bin_idx}],
        ),
    ]


# Full sweep over k with the two cheapest built-in metrics.
grid_all = _knn_grid([3, 5, 7, 10], ["euclidean", "manhattan"])

# k = 3 only.
# NOTE(review): unlike grid_5/7/10 this grid has no Gower part, and "minkowski"
# with the default p presumably duplicates "euclidean" — confirm both are intended.
grid_3 = _knn_grid([3], ["euclidean", "manhattan", "cosine", "minkowski"])

# One grid per k so each search can be run (and saved) independently.
grid_5 = _knn_grids_with_gower(5)
grid_7 = _knn_grids_with_gower(7)
grid_10 = _knn_grids_with_gower(10)

# Single-candidate grid, handy as a quick smoke test of the search setup.
grid_one = _knn_grid([3], ["euclidean"])

In [10]:


def _make_grid_search(param_grid):
    """Create the notebook's standard GridSearchCV around ``pipe``.

    All searches share the same estimator (``pipe``), CV splitter (``cv``),
    scorers and parallelism settings; only the parameter grid differs.
    ``pipe`` and ``cv`` are read from the notebook namespace when the helper
    is called, so call it from this cell, right after they are defined.

    Parameters
    ----------
    param_grid : dict or list of dict
        Grid passed straight through to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        Unfitted search that refits the best candidate by balanced accuracy.
    """
    return GridSearchCV(
        pipe,
        param_grid=param_grid,
        cv=cv,
        # Three scorers are recorded; "balanced_acc" picks the winner.
        scoring={
            "f1_macro": "f1_macro",
            "balanced_acc": "balanced_accuracy",
            "mcc": "matthews_corrcoef",
        },
        refit="balanced_acc",
        n_jobs=os.cpu_count(),
        pre_dispatch="2*n_jobs",
        verbose=3,
    )


gs_one = _make_grid_search(grid_one)
gs_all = _make_grid_search(grid_all)
gs_3 = _make_grid_search(grid_3)
gs_5 = _make_grid_search(grid_5)
gs_7 = _make_grid_search(grid_7)
gs_10 = _make_grid_search(grid_10)


In [11]:
# Run the `grid_all` search on the training split only, then print the summary
# and save the CV table. No test data is passed, so no test report is printed.
# NOTE: expensive cell — the recorded cv table below shows mean_score_time of
# roughly 1700 s per fit for manhattan and 210-235 s for euclidean.
gs_all.fit(X_train, y_train)
dump_gridsearch(gs_all, name="gs KNN all")

Fitting 3 folds for each of 8 candidates, totalling 24 fits
refit: balanced_acc
n_splits_: 3
n_candidates: 8
best_index_: 4
best_params_: {'clf__metric': 'manhattan', 'clf__n_neighbors': 3, 'clf__weights': 'distance'}
refit_time_: 0.9291141033172607
best mean balanced_acc: 0.8865043785509417
best mean f1_macro: 0.8927259360971617
best mean mcc: 0.897103449841827


Unnamed: 0,params,mean_fit_time,std_fit_time,mean_score_time,std_score_time,mean_test_balanced_acc,std_test_balanced_acc,rank_test_balanced_acc,mean_test_f1_macro,std_test_f1_macro,...,rank_test_mcc,split0_test_f1_macro,split1_test_f1_macro,split2_test_f1_macro,split0_test_balanced_acc,split1_test_balanced_acc,split2_test_balanced_acc,split0_test_mcc,split1_test_mcc,split2_test_mcc
4,"{'clf__metric': 'manhattan', 'clf__n_neighbors...",1.132662,0.105709,1682.324007,0.83337,0.886504,0.000289,1,0.892726,0.000237,...,1,0.892431,0.892737,0.89301,0.886448,0.886883,0.886182,0.896983,0.896779,0.897548
5,"{'clf__metric': 'manhattan', 'clf__n_neighbors...",1.330565,0.116893,1707.24076,28.383049,0.882148,0.001239,2,0.891553,0.001101,...,2,0.893059,0.89114,0.89046,0.883811,0.881796,0.880838,0.89654,0.895182,0.896766
0,"{'clf__metric': 'euclidean', 'clf__n_neighbors...",1.06627,0.078482,210.949702,0.604171,0.881594,0.000758,3,0.887947,0.000958,...,3,0.889231,0.88693,0.88768,0.882521,0.880665,0.881597,0.89324,0.891983,0.894095
6,"{'clf__metric': 'manhattan', 'clf__n_neighbors...",1.2145,0.053379,1684.169365,1.490921,0.876108,0.001445,4,0.887889,0.00095,...,4,0.889191,0.887525,0.88695,0.878037,0.875731,0.874557,0.893454,0.892105,0.893423
1,"{'clf__metric': 'euclidean', 'clf__n_neighbors...",1.150578,0.066597,235.774982,34.237379,0.875833,0.001399,5,0.885092,0.001866,...,5,0.88679,0.885991,0.882493,0.877525,0.875875,0.874099,0.890797,0.889208,0.890334
2,"{'clf__metric': 'euclidean', 'clf__n_neighbors...",1.075237,0.149611,235.370877,35.02305,0.868709,0.001741,6,0.880593,0.001544,...,7,0.882757,0.879253,0.87977,0.871122,0.867082,0.867921,0.886636,0.884841,0.886975
7,"{'clf__metric': 'manhattan', 'clf__n_neighbors...",1.382674,0.112384,1706.44493,29.753322,0.867722,0.001647,7,0.882399,0.001333,...,6,0.884047,0.882367,0.880783,0.869945,0.867214,0.866007,0.888783,0.888271,0.888892
3,"{'clf__metric': 'euclidean', 'clf__n_neighbors...",1.288496,0.026771,211.862007,0.749721,0.859453,0.00263,8,0.874513,0.002011,...,8,0.877304,0.873591,0.872645,0.863169,0.857481,0.857708,0.881585,0.879464,0.881077


saved: drive/MyDrive/ML Assign 2/gs KNN all_cv_results.csv

best_estimator_:
 Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scale',
                                                                   MinMaxScaler())]),
                                                  ['Elevation', 'Facet',
                                                   'Slope',
                                                   'Horizontal_Distance_To_Hydrology',
                                                   'Vertical_Distance_To_Hydrology',
                                                   'Horizontal_Distance_To_Roadways',
                                                   'Hillshade_9am',
                                         

In [13]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, classification_report
)

In [14]:
# 1) Cross-validate the selected pipeline on the training split.
# The hyperparameters were chosen on this same training data, so these scores
# are a stability check rather than an independent estimate.
best_model = gs_all.best_estimator_

# NOTE: this rebinds the notebook-level `cv` (previously the 3-fold splitter
# used to build the grid searches) to a 5-fold splitter.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {
    metric: metric
    for metric in ("accuracy", "f1_macro", "precision_macro", "recall_macro")
}
cv_res = cross_validate(
    best_model,
    X_train,
    y_train,
    scoring=scoring,
    cv=cv,
)

print("Cross-validation results (mean ± std over 5 folds)")
for k in scoring:
    vals = cv_res["test_" + k]
    print(f"{k}: {vals.mean():.3f} ± {vals.std():.3f}")

Cross-validation results (mean ± std over 5 folds)
accuracy: 0.940 ± 0.000
f1_macro: 0.898 ± 0.003
precision_macro: 0.905 ± 0.003
recall_macro: 0.893 ± 0.002


In [15]:

# 2) Fit on full training and evaluate on test set
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Probabilities are only needed for ROC AUC; models without them skip it.
y_proba = None
if hasattr(best_model, "predict_proba"):
    y_proba = best_model.predict_proba(X_test)

# Macro-averaged metrics weight every class equally regardless of support.
acc = accuracy_score(y_test, y_pred)
f1m = f1_score(y_test, y_pred, average="macro")
prec_m = precision_score(y_test, y_pred, average="macro", zero_division=0)
rec_m = recall_score(y_test, y_pred, average="macro")

roc = None
if y_proba is not None:
    n_classes = len(np.unique(y_train))
    if n_classes != 2:
        # Multiclass: one-vs-rest AUC over the full probability matrix.
        roc = roc_auc_score(y_test, y_proba, multi_class="ovr")
    else:
        # Binary: AUC from the second probability column.
        roc = roc_auc_score(y_test, y_proba[:, 1])

cm = confusion_matrix(y_test, y_pred)

print("\nTest set results")
print(f"accuracy: {acc:.3f}")
print(f"f1_macro: {f1m:.3f}")
print(f"precision_macro: {prec_m:.3f}")
print(f"recall_macro: {rec_m:.3f}")
if roc is not None:
    print(f"roc_auc: {roc:.3f}")

print("\nConfusion matrix")
print(cm)

print("\nClassification report")
print(classification_report(y_test, y_pred, digits=3, zero_division=0))


Test set results
accuracy: 0.944
f1_macro: 0.905
precision_macro: 0.909
recall_macro: 0.901
roc_auc: 0.979

Confusion matrix
[[39919  2230     5     0    29     7   177]
 [ 2053 54118   130     2   207   116    32]
 [    4   126  6678    55    12   276     0]
 [    0     0    66   446     0    37     0]
 [   35   243    13     0  1595    11     1]
 [    9   126   269    23     2  3044     0]
 [  181    31     0     0     0     0  3890]]

Classification report
              precision    recall  f1-score   support

           1      0.946     0.942     0.944     42367
           2      0.952     0.955     0.953     56658
           3      0.933     0.934     0.933      7151
           4      0.848     0.812     0.830       549
           5      0.864     0.840     0.852      1898
           6      0.872     0.876     0.874      3473
           7      0.949     0.948     0.949      4102

    accuracy                          0.944    116198
   macro avg      0.909     0.901     0.905    