In [1]:
# Mount Google Drive at /content/drive; the data and results paths used
# later in this notebook point into drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

In [3]:
# Reproducibility: one seed, also applied to NumPy's global RNG.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Input CSV and output folder (relative to the current working directory).
DATA_PATH = Path("drive/MyDrive/forestCover.csv")  # set this if needed
RESULTS_DIR = Path("drive/MyDrive/ML Assign 2")
RESULTS_DIR.mkdir(exist_ok=True)  # no error if the folder already exists

In [4]:

# --- Load -----------------------------------------------------------------
# "?" is treated as a missing value; the label is taken to be the last column.
df = pd.read_csv(DATA_PATH, na_values=["?"])
target_col = df.columns[-1]

# Swap the contents of 'Water_Level' and 'Observation_ID'.
# NOTE(review): both columns are dropped a few lines below, so this swap does
# not influence the features used for modelling — confirm whether it is still
# needed.
observation = df['Water_Level']
df['Water_Level'] = df['Observation_ID']
df['Observation_ID'] = observation

# --- Encode Soil_Type1 ----------------------------------------------------
# Map "positive"/"negative" to 1/0. A per-value lookup is used instead of
# Series.replace because replace on an object column emits pandas'
# silent-downcasting FutureWarning on recent versions. Values that are not
# in the mapping pass through unchanged, exactly as they did with replace.
soil_flag_codes = {"positive": 1, "negative": 0}
df["Soil_Type1"] = (
    df["Soil_Type1"].map(lambda value: soil_flag_codes.get(value, value)).astype(int)
)

# --- Drop columns that are not used as features ---------------------------
# Only columns actually present are dropped; reassigning instead of
# inplace=True avoids mutating a frame other names may still reference.
drop_if_present = ["Water_Level", "Observation_ID", "Inclination", "Aspect"]
to_drop = [c for c in drop_if_present if c in df.columns]
df = df.drop(columns=to_drop)

feature_cols = [c for c in df.columns if c != target_col]

# --- Column groups --------------------------------------------------------
# Indicator columns: Wilderness_Area1..4 followed by Soil_Type1..40
# (same names and order as the previous hand-written list).
binary_cols = (
    [f"Wilderness_Area{i}" for i in range(1, 5)]
    + [f"Soil_Type{i}" for i in range(1, 41)]
)
_binary_lookup = set(binary_cols)  # O(1) membership tests below

numeric_cols = [c for c in feature_cols if c not in _binary_lookup]

# Boolean mask aligned with feature_cols: True where the feature is binary.
binary_mask = np.array([col in _binary_lookup for col in feature_cols], dtype=bool)

# --- Remove implausible rows ----------------------------------------------
# Rows whose horizontal distance to hydrology exceeds 10000 are discarded.
# NaN compares False here, so rows with a missing distance are kept.
if "Horizontal_Distance_To_Hydrology" in df.columns:
    bad = df["Horizontal_Distance_To_Hydrology"] > 10000
    n_bad = int(bad.sum())
    if n_bad:
        print(n_bad)  # number of rows removed
        df = df.loc[~bad].copy()

def compress_features(df, allow_sparse=True, binary_columns=None, target=None):
    """Split off the target column and shrink feature dtypes to save memory.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's frame is untouched.
    allow_sparse : bool, default True
        If True, binary columns are stored as sparse ``uint8`` arrays with
        fill value 0; if False, as dense ``uint8``.
    binary_columns : iterable of str, optional
        Names of the 0/1 indicator columns. Defaults to the notebook-level
        ``binary_cols``. Names that are not present in ``df`` are ignored
        (previously they raised ``KeyError``).
    target : str, optional
        Name of the label column. Defaults to the notebook-level
        ``target_col``.

    Returns
    -------
    (X, y) : tuple
        ``X`` is the compressed feature frame. ``y`` is the target Series, or
        ``None`` when the target column is not in ``df``.

    Notes
    -----
    If any binary column contains missing values, all binary columns are
    stored with pandas' nullable boolean dtype regardless of
    ``allow_sparse``, because ``uint8`` cannot represent NaN (the sparse
    path used to raise in that situation).
    """
    # Fall back to the notebook globals only when the caller gave nothing.
    if binary_columns is None:
        binary_columns = binary_cols
    if target is None:
        target = target_col

    df = df.copy()

    # Separate the target if it is present.
    y = None
    if target and target in df.columns:
        y = df.pop(target)

    # Binary columns that actually exist; everything else is numeric.
    present_binary = [c for c in binary_columns if c in df.columns]
    binary_lookup = set(present_binary)
    num_cols = [c for c in df.columns if c not in binary_lookup]

    # Downcast numerics to float32.
    df = df.astype({c: np.float32 for c in num_cols})

    # Binaries go to the smallest dtype that can hold them.
    if present_binary and df[present_binary].isna().any().any():
        # Missing values present: nullable boolean keeps them as <NA>.
        for c in present_binary:
            df[c] = df[c].astype(pd.BooleanDtype())
    elif allow_sparse:
        # Best when the indicators are mostly zeros.
        for c in present_binary:
            df[c] = pd.arrays.SparseArray(
                df[c].astype("uint8").to_numpy(), fill_value=0
            )
    else:
        df = df.astype({c: "uint8" for c in present_binary})

    # Compact the target: smallest signed integer dtype that fits the labels
    # (int8 for a handful of classes) without the silent wrap-around a blind
    # astype("int8") would cause; non-integer labels become categoricals.
    if y is not None:
        if pd.api.types.is_integer_dtype(y):
            y = pd.to_numeric(y, downcast="integer")
        else:
            y = y.astype("category")

    return df, y


# Separate features from the label and shrink their dtypes.
X, y = compress_features(df)

# Stratified 80/20 hold-out split. The seed comes from the config cell so the
# whole notebook is controlled by a single RANDOM_STATE value.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)

df.head()

  df["Soil_Type1"] = df["Soil_Type1"].replace({"positive": 1, "negative": 0}).astype(int)


25


Unnamed: 0,Elevation,Facet,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,3208920,415.394727,6.0,408,62,3137,225,242,151,2366,...,0,0,0,0,0,0,0,0,0,2
1,2789020,343.302186,9.0,30,2,1040,235,237,133,1804,...,0,0,0,0,0,0,0,0,0,3
2,3384615,894.23139,9.0,362,15,3113,207,225,156,295,...,0,0,0,0,0,0,0,0,0,1
3,3348150,371.346939,6.0,247,50,1207,228,240,145,2405,...,0,0,0,0,0,0,0,0,0,2
4,3061955,310.78343,11.0,170,19,1595,238,232,124,2837,...,0,0,0,0,0,0,0,0,0,2


In [6]:
# (rows, columns) of the training split produced by train_test_split.
X_train.shape

(464789, 54)

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

def dump_gridsearch(gs, X_test=None, y_test=None, name="gs"):
    """Print a report for a fitted grid search and save its CV table as CSV.

    Parameters
    ----------
    gs : fitted search object
        Must expose ``refit``, ``cv_results_``, ``best_index_``,
        ``best_params_``, ``best_estimator_`` and ``predict``.
    X_test, y_test : optional
        When both are provided, a classification report and a confusion
        matrix for ``gs.predict(X_test)`` are printed as well.
    name : str, default "gs"
        Prefix of the file written to ``RESULTS_DIR``
        (``<name>_cv_results.csv``).

    Returns
    -------
    None
    """
    # --- headline facts about the search ---
    print("refit:", gs.refit)
    print("n_splits_:", getattr(gs, "n_splits_", None))
    cv_table = gs.cv_results_
    print("n_candidates:", len(cv_table["params"]))
    print("best_index_:", gs.best_index_)
    print("best_params_:", gs.best_params_)
    print("refit_time_:", getattr(gs, "refit_time_", None))

    # --- scorer names, recovered from the "mean_test_<scorer>" keys ---
    mean_prefix = "mean_test_"
    scorers = sorted(
        {key.split(mean_prefix)[1] for key in cv_table if key.startswith(mean_prefix)}
    )
    for scorer in scorers:
        best_mean = cv_table[f"mean_test_{scorer}"][gs.best_index_]
        print(f"best mean {scorer}:", best_mean)

    # --- assemble the columns of the CV table, in display order ---
    timing_cols = [
        "params",
        "mean_fit_time",
        "std_fit_time",
        "mean_score_time",
        "std_score_time",
    ]
    summary_cols = [
        f"{stat}_test_{scorer}"
        for scorer in scorers
        for stat in ("mean", "std", "rank")
    ]
    split_suffixes = tuple(f"test_{scorer}" for scorer in scorers)
    split_cols = [
        key
        for key in cv_table.keys()
        if key.startswith("split") and key.endswith(split_suffixes)
    ]
    wanted = [
        col for col in (*timing_cols, *summary_cols, *split_cols) if col in cv_table
    ]

    # --- rank by the refit scorer (or the first scorer), show and save ---
    selected = pd.DataFrame(cv_table)[wanted]
    if isinstance(gs.refit, str):
        sort_key = f"rank_test_{gs.refit}"
    else:
        sort_key = f"rank_test_{scorers[0]}"
    ranked = selected.sort_values(sort_key, ascending=True)
    display(ranked.head(20))
    out_csv = RESULTS_DIR / f"{name}_cv_results.csv"
    ranked.to_csv(out_csv, index=False)
    print("saved:", out_csv)

    # --- the refitted estimator and every one of its parameters ---
    winner = gs.best_estimator_
    print("\nbest_estimator_:\n", winner)
    print("\nbest_estimator_.get_params():")
    for key, value in sorted(winner.get_params(deep=True).items()):
        print(f"  {key}: {value}")

    # --- optional hold-out report ---
    if X_test is None or y_test is None:
        return
    y_pred = gs.predict(X_test)
    print("\nTest classification_report:\n", classification_report(y_test, y_pred))
    print("\nTest confusion_matrix:\n", confusion_matrix(y_test, y_pred))


In [5]:
# Base learner. class_weight="balanced" is kept from the original setup; the
# seed comes from the config cell instead of a repeated literal.
clf = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    class_weight="balanced",
)

# Single-step pipeline; the "clf__" prefix in the grid below addresses it.
pipe = Pipeline([
    ("clf", clf),
])

# Search space: split criterion x cost-complexity pruning strength.
# ccp_alpha candidates are 0.0 plus 9 log-spaced values from 1e-5 to 1e-1,
# giving 2 x 10 = 20 candidates.
param_grid = {
    "clf__criterion": ["gini", "entropy"],
    "clf__ccp_alpha": np.concatenate([[0.0], np.logspace(-5, -1, 9)]),
}

In [None]:
from sklearn.metrics import classification_report, balanced_accuracy_score

# 3-fold stratified CV, shuffled with the notebook-wide seed.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# Three metrics are tracked; the final model is refit on the full training
# split using the candidate with the best balanced accuracy.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring={"bal_acc": "balanced_accuracy", "f1_macro": "f1_macro", "mcc": "matthews_corrcoef"},
    refit="bal_acc",
    cv=cv,
    n_jobs=-1,
    verbose=3
)

gs.fit(X_train, y_train)
print("Best params:", gs.best_params_)
best_model = gs.best_estimator_

# Hold-out evaluation of the refitted best estimator.
y_pred = best_model.predict(X_test)
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits




Best params: {'clf__ccp_alpha': np.float64(1e-05), 'clf__criterion': 'gini'}




Balanced accuracy: 0.8985872049918255
              precision    recall  f1-score   support

           1       0.88      0.89      0.88     42367
           2       0.92      0.87      0.90     56658
           3       0.87      0.92      0.90      7151
           4       0.80      0.83      0.81       549
           5       0.59      0.91      0.72      1898
           6       0.78      0.90      0.83      3473
           7       0.84      0.97      0.90      4102

    accuracy                           0.89    116198
   macro avg       0.81      0.90      0.85    116198
weighted avg       0.89      0.89      0.89    116198



In [None]:
# Print the full grid-search report, including the hold-out evaluation, and
# save the CV table under the default name prefix "gs".
dump_gridsearch(gs, X_test=X_test, y_test=y_test)

refit: bal_acc
n_splits_: 3
n_candidates: 20
best_index_: 2
best_params_: {'clf__ccp_alpha': np.float64(1e-05), 'clf__criterion': 'gini'}
refit_time_: 20.3207106590271
best mean bal_acc: 0.8838620104933713
best mean f1_macro: 0.8445909927869407
best mean mcc: 0.8189970340572134


Unnamed: 0,params,mean_fit_time,std_fit_time,mean_score_time,std_score_time,mean_test_bal_acc,std_test_bal_acc,rank_test_bal_acc,mean_test_f1_macro,std_test_f1_macro,...,rank_test_mcc,split0_test_bal_acc,split1_test_bal_acc,split2_test_bal_acc,split0_test_f1_macro,split1_test_f1_macro,split2_test_f1_macro,split0_test_mcc,split1_test_mcc,split2_test_mcc
2,"{'clf__ccp_alpha': 1e-05, 'clf__criterion': 'g...",20.204511,0.800866,0.366821,0.017505,0.883862,0.001498,1,0.844591,0.004318,...,4,0.884051,0.88194,0.885595,0.849362,0.838904,0.845507,0.823913,0.815573,0.817504
3,"{'clf__ccp_alpha': 1e-05, 'clf__criterion': 'e...",19.448222,0.215675,0.447056,0.121241,0.880852,0.002137,2,0.877845,0.002666,...,3,0.882565,0.877839,0.882153,0.880132,0.874106,0.879296,0.863671,0.864687,0.860486
5,"{'clf__ccp_alpha': 3.1622776601683795e-05, 'cl...",20.058663,0.175303,0.350783,0.00208,0.878914,0.001774,3,0.844412,0.002204,...,5,0.881286,0.877021,0.878435,0.847505,0.8432,0.842531,0.79205,0.794722,0.780685
1,"{'clf__ccp_alpha': 0.0, 'clf__criterion': 'ent...",16.217503,0.208418,0.379616,0.009312,0.878444,0.002487,4,0.883444,0.002313,...,1,0.880434,0.874938,0.879961,0.885222,0.880178,0.884932,0.88284,0.884376,0.880691
4,"{'clf__ccp_alpha': 3.1622776601683795e-05, 'cl...",21.089668,0.555252,0.462124,0.078918,0.876524,0.00303,5,0.782163,0.003916,...,6,0.879399,0.872335,0.87784,0.784943,0.776626,0.784921,0.726749,0.719157,0.72137
0,"{'clf__ccp_alpha': 0.0, 'clf__criterion': 'gini'}",24.859075,6.322295,0.528696,0.118935,0.869831,0.002413,6,0.877031,0.003397,...,2,0.871256,0.866434,0.871804,0.880393,0.872378,0.878321,0.877317,0.873294,0.8765
7,"{'clf__ccp_alpha': 0.0001, 'clf__criterion': '...",19.748712,1.123522,0.444827,0.120744,0.864305,0.001498,7,0.767536,0.002078,...,7,0.865844,0.862276,0.864796,0.765176,0.767202,0.770232,0.673271,0.674725,0.668725
6,"{'clf__ccp_alpha': 0.0001, 'clf__criterion': '...",21.300575,0.766107,0.332082,0.005281,0.845509,0.00446,8,0.693396,0.00931,...,8,0.847525,0.839325,0.849677,0.699122,0.680265,0.700801,0.614827,0.602739,0.610892
9,"{'clf__ccp_alpha': 0.00031622776601683794, 'cl...",20.720768,1.346342,0.36251,0.042349,0.827127,0.00052,9,0.661626,0.004759,...,9,0.827584,0.8264,0.827396,0.654904,0.664697,0.665276,0.551012,0.557064,0.559534
8,"{'clf__ccp_alpha': 0.00031622776601683794, 'cl...",20.679276,0.735381,0.32593,0.011662,0.79169,0.006429,10,0.600381,0.007581,...,10,0.794116,0.782889,0.798064,0.604521,0.589747,0.606876,0.507512,0.50295,0.510609


saved: drive/MyDrive/ML Assign 2/gs_cv_results.csv

best_estimator_:
 Pipeline(steps=[('clf',
                 DecisionTreeClassifier(ccp_alpha=np.float64(1e-05),
                                        class_weight='balanced',
                                        random_state=42))])

best_estimator_.get_params():
  clf: DecisionTreeClassifier(ccp_alpha=np.float64(1e-05), class_weight='balanced',
                       random_state=42)
  clf__ccp_alpha: 1e-05
  clf__class_weight: balanced
  clf__criterion: gini
  clf__max_depth: None
  clf__max_features: None
  clf__max_leaf_nodes: None
  clf__min_impurity_decrease: 0.0
  clf__min_samples_leaf: 1
  clf__min_samples_split: 2
  clf__min_weight_fraction_leaf: 0.0
  clf__monotonic_cst: None
  clf__random_state: 42
  clf__splitter: best
  memory: None
  steps: [('clf', DecisionTreeClassifier(ccp_alpha=np.float64(1e-05), class_weight='balanced',
                       random_state=42))]
  transform_input: None
  verbose: False





Test classification_report:
               precision    recall  f1-score   support

           1       0.88      0.89      0.88     42367
           2       0.92      0.87      0.90     56658
           3       0.87      0.92      0.90      7151
           4       0.80      0.83      0.81       549
           5       0.59      0.91      0.72      1898
           6       0.78      0.90      0.83      3473
           7       0.84      0.97      0.90      4102

    accuracy                           0.89    116198
   macro avg       0.81      0.90      0.85    116198
weighted avg       0.89      0.89      0.89    116198


Test confusion_matrix:
 [[37693  3900    15     0   108    22   629]
 [ 5055 49391   593     2  1067   444   106]
 [    1    82  6579    89    23   377     0]
 [    0     0    57   456     0    36     0]
 [   21   115    14     0  1731    15     2]
 [    0    56   261    24     9  3123     0]
 [  133     3     0     0     0     0  3966]]


In [8]:
# Additional Analysis for Decision Tree Results
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (confusion_matrix, classification_report,
                           balanced_accuracy_score, matthews_corrcoef,
                           precision_recall_fscore_support, roc_curve, auc)
from sklearn.tree import export_text, plot_tree
import scipy.stats as stats


In [9]:
# Rebuild the tree with the hyper-parameters reported as "Best params" by the
# grid search (ccp_alpha=1e-05, criterion='gini'), so this cell does not need
# the fitted search object. The seed comes from the config cell.
best_model = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    class_weight="balanced",
    ccp_alpha=1e-05,
    criterion='gini',
)

best_model.fit(X_train, y_train)



In [12]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, classification_report
)

In [13]:
# Fold count is defined once so the splitter and the printed title agree.
n_folds = 5
cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)

# Metrics to collect; keys become the "test_<key>" entries of cv_res.
scoring = {
    "accuracy": "accuracy",
    "f1_macro": "f1_macro",
    "precision_macro": "precision_macro",
    "recall_macro": "recall_macro",
}
cv_res = cross_validate(best_model, X_train, y_train, cv=cv, scoring=scoring)

print(f"Cross-validation results (mean ± std over {n_folds} folds)")
for k in scoring:
    vals = cv_res[f"test_{k}"]
    print(f"{k}: {vals.mean():.3f} ± {vals.std():.3f}")



Cross-validation results (mean ± std over 5 folds)
accuracy: 0.886 ± 0.001
f1_macro: 0.847 ± 0.003
precision_macro: 0.814 ± 0.004
recall_macro: 0.889 ± 0.003




In [14]:

# 2) Fit on full training and evaluate on test set
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test) if hasattr(best_model, "predict_proba") else None

# Label order learned by the model; predict_proba columns follow this order,
# so it is passed explicitly wherever labels must line up with columns or
# rows instead of being inferred from whatever labels occur in y_test.
class_labels = best_model.classes_

acc = accuracy_score(y_test, y_pred)
f1m = f1_score(y_test, y_pred, average="macro")
prec_m = precision_score(y_test, y_pred, average="macro", zero_division=0)
rec_m = recall_score(y_test, y_pred, average="macro")

# ROC AUC needs probabilities: positive-class column for a binary problem,
# one-vs-rest over all columns otherwise.
roc = None
if y_proba is not None:
    n_classes = len(np.unique(y_train))
    if n_classes == 2:
        roc = roc_auc_score(y_test, y_proba[:, 1])
    else:
        roc = roc_auc_score(y_test, y_proba, multi_class="ovr", labels=class_labels)

# Rows are true classes and columns predicted classes, both in class_labels order.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

print("\nTest set results")
print(f"accuracy: {acc:.3f}")
print(f"f1_macro: {f1m:.3f}")
print(f"precision_macro: {prec_m:.3f}")
print(f"recall_macro: {rec_m:.3f}")
if roc is not None:
    print(f"roc_auc: {roc:.3f}")

print("\nConfusion matrix")
print(cm)

print("\nClassification report")
print(classification_report(y_test, y_pred, digits=3, zero_division=0))




Test set results
accuracy: 0.886
f1_macro: 0.849
precision_macro: 0.812
recall_macro: 0.899
roc_auc: 0.957

Confusion matrix
[[37693  3900    15     0   108    22   629]
 [ 5055 49391   593     2  1067   444   106]
 [    1    82  6579    89    23   377     0]
 [    0     0    57   456     0    36     0]
 [   21   115    14     0  1731    15     2]
 [    0    56   261    24     9  3123     0]
 [  133     3     0     0     0     0  3966]]

Classification report
              precision    recall  f1-score   support

           1      0.879     0.890     0.884     42367
           2      0.922     0.872     0.896     56658
           3      0.875     0.920     0.897      7151
           4      0.799     0.831     0.814       549
           5      0.589     0.912     0.716      1898
           6      0.777     0.899     0.834      3473
           7      0.843     0.967     0.901      4102

    accuracy                          0.886    116198
   macro avg      0.812     0.899     0.849    