# Random Search e Nested Cross-Validation per ottimizzare gli iperparametri del classificatore Random Forest 

# Imports 

In [None]:
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
from scipy.stats import randint
from tqdm.auto import tqdm

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix, classification_report, accuracy_score,
    precision_score, recall_score, f1_score, make_scorer
)
from sklearn.model_selection import (
    StratifiedKFold, ParameterSampler, cross_validate, cross_val_predict
)

# Funzione per visualizzare la percentuale di progresso del codice

In [None]:
def p(msg):
    """Print ``msg`` to standard output and flush the stream right away.

    The explicit flush pushes each message out of the output buffer as
    soon as it is printed.
    """
    print(msg, end="\n", flush=True)

# Wrapper that prints the percentage progress of the inner folds
class KFoldProgress:
    """Cross-validation splitter wrapper that reports fold progress.

    Splitting is delegated to ``base_cv``. The first time a given fold
    position is produced, a percentage line is printed through ``p``.
    Positions already reported are remembered on the instance, so later
    ``split`` calls on the same wrapper stay silent for them.
    """

    def __init__(self, base_cv):
        """Store ``base_cv``, cache its split count and print the 0% line."""
        self.base_cv = base_cv
        self._n_splits = base_cv.get_n_splits()
        self._seen = set()
        p("      inner progress: 0%")

    def split(self, X, y=None, groups=None):
        """Yield the ``(train, validation)`` pairs from ``base_cv.split``."""
        fold_pairs = self.base_cv.split(X, y, groups)
        for position, (train_idx, valid_idx) in enumerate(fold_pairs, start=1):
            already_reported = position in self._seen
            if not already_reported:
                percent = position / self._n_splits * 100
                p(f"      inner progress: {percent:.0f}%")
                self._seen.add(position)
            yield train_idx, valid_idx

    def get_n_splits(self, *args, **kwargs):
        """Return the split count cached at construction; arguments are ignored."""
        return self._n_splits

# Caricamento dati di training 

In [None]:
# Folder that holds the input CSV files (the current working directory)
DATA_DIR = Path('.')

# Every table is indexed by its 'building_id' column
X = pd.read_csv(DATA_DIR.joinpath('train_values.csv'), index_col='building_id')
y = pd.read_csv(DATA_DIR.joinpath('train_labels.csv'), index_col='building_id')
y = y['damage_grade']  # keep only the target column, as a Series
Xt = pd.read_csv(DATA_DIR.joinpath('test_values.csv'), index_col='building_id')

try:
    sub_fmt = pd.read_csv(
        DATA_DIR.joinpath('submission_format.csv'),
        index_col='building_id',
    )
except FileNotFoundError:
    # No template file available: build an empty one aligned with the test index
    sub_fmt = pd.DataFrame(columns=['damage_grade'], index=Xt.index)

p(f"Train {X.shape}  |  Test {Xt.shape}")

# Parte di preprocessing 

In [None]:
# Group the feature columns by dtype. Columns whose dtype is in neither list
# are not handed to any transformer below.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# One-hot encode the categorical columns; leave the numeric ones untouched
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols),
    ],
)

# Nested Cross-validation per la ricerca degli iperparametri migliori 

In [None]:
# Pipeline: preprocessing step followed by the Random Forest model
base_rf = RandomForestClassifier(
    bootstrap=True,
    n_jobs=-1,
    random_state=2025,
)
pipe = Pipeline([('prep', preprocess), ('rf', base_rf)])

# Hyperparameter search space. Keys use the '<step>__<param>' form so they can
# be passed straight to pipe.set_params(); scipy `randint(a, b)` draws
# integers in [a, b).
param_dist = {
    'rf__n_estimators': randint(900, 1401),
    'rf__max_depth': [None] + list(range(60, 101, 10)),
    # Explicit grid 0.50, 0.55, ..., 0.80. The previous
    # np.arange(0.5, 0.8, 0.05) relied on a fractional step, where
    # floating-point rounding decides whether the end point is included (it
    # was, giving these same seven candidates) and leaves noise in the values.
    # linspace + round makes both the length and the values explicit.
    'rf__max_features': np.linspace(0.5, 0.8, 7).round(2).tolist(),
    'rf__min_samples_split': randint(7, 13),
    'rf__min_samples_leaf': randint(2, 6),
}

# Cross-validation settings
N_ITER, INNER_K, OUTER_K = 40, 3, 5

# Stratified, shuffled splitters with fixed seeds for the two CV levels
inner_cv = StratifiedKFold(n_splits=INNER_K, shuffle=True, random_state=2025)
outer_cv = StratifiedKFold(n_splits=OUTER_K, shuffle=True, random_state=42)

# Result containers: one list of per-fold values for every metric
metrics_per_fold = {
    metric_name: []
    for metric_name in (
        'accuracy',
        'precision_macro',
        'recall_macro',
        'f1_micro',
        'f1_macro',
    )
}
all_best_params = []

# Nested cross-validation: for every outer fold, a random search over
# `param_dist` is run on the outer-training part only (scored with the inner
# CV), the best candidate is refit on that whole part and then evaluated on
# the held-out outer-validation part.
for fold, (tr_idx, va_idx) in enumerate(outer_cv.split(X, y), 1):
    # Use the flushing helper so fold headers show up in order with the bars
    p(f"\nFold {fold}/{OUTER_K}")

    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

    best_score = -np.inf
    best_params = None

    # Seeding with the fold number gives each outer fold its own,
    # reproducible, set of N_ITER candidates.
    sampler = ParameterSampler(param_dist, n_iter=N_ITER, random_state=fold)

    for params in tqdm(sampler, desc=f"Inner search fold {fold}"):
        pipe.set_params(**params)
        # cross_val_predict comes from sklearn.model_selection (imported at
        # the top of the file). It returns one out-of-fold prediction per
        # training sample, so the candidate is scored on the pooled
        # predictions of all inner folds.
        preds = cross_val_predict(pipe, X_tr, y_tr, cv=inner_cv, n_jobs=-1)
        score = f1_score(y_tr, preds, average='micro')
        if score > best_score:
            best_score = score
            best_params = params

    if best_params is None:
        # Happens when the sampler yields no candidate (e.g. N_ITER == 0) or
        # no score compares greater than -inf; fail with a clear message
        # instead of a TypeError from set_params(**None).
        raise RuntimeError(
            f"Inner search selected no hyperparameters for fold {fold}"
        )

    # Final fit on the full outer-training part with the selected candidate
    pipe.set_params(**best_params).fit(X_tr, y_tr)
    y_pred = pipe.predict(X_va)

    # Metrics on the held-out outer-validation part
    acc = accuracy_score(y_va, y_pred)
    prec = precision_score(y_va, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_va, y_pred, average='macro', zero_division=0)
    f1_micro = f1_score(y_va, y_pred, average='micro')
    f1_macro = f1_score(y_va, y_pred, average='macro')

    p(
        f"Fold {fold} - Acc: {acc:.4f} | Prec_macro: {prec:.4f} | "
        f"Recall_macro: {rec:.4f} | F1_micro: {f1_micro:.4f} | "
        f"F1_macro: {f1_macro:.4f}"
    )

    # Store this fold's metrics and selected hyperparameters
    metrics_per_fold['accuracy'].append(acc)
    metrics_per_fold['precision_macro'].append(prec)
    metrics_per_fold['recall_macro'].append(rec)
    metrics_per_fold['f1_micro'].append(f1_micro)
    metrics_per_fold['f1_macro'].append(f1_macro)
    all_best_params.append(best_params)

# Final results: mean and standard deviation of every metric over the outer folds
print("\n=== Risultati medi sui fold esterni ===")
for metric, values in metrics_per_fold.items():
    mean, std = np.mean(values), np.std(values)
    print(f"{metric}: {mean:.4f} ± {std:.4f}")

