<a href="https://colab.research.google.com/github/geen-tech/Richter-s-Predictor-Modeling-Earthquake-Damage/blob/main/Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Random Forest Classifier
Questo notebook implementa un modello Random Forest per la previsione dei danni strutturali. Include preprocessing, validazione incrociata, training e salvataggio delle predizioni.

# Step 1: Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score


# Step 2: Progress Bar Helper

In [None]:
def progress(p):
    """Redraw the one-line progress indicator on standard output.

    Args:
        p: Completion percentage as an integer; it is rendered right-aligned
            in a three-character field.

    Each call begins with a carriage return so it overwrites the previous
    indicator in place. Once ``p`` is 100 or more, a trailing newline is
    written so later output starts on a fresh line.
    """
    out = sys.stdout
    out.write(f"\rAvanzamento: {p:3d}%")
    out.flush()
    if p < 100:
        return
    out.write("\n")

# Step 3: Load Data

In [None]:
# All input and output files are resolved against the current working directory.
BASE = Path('.').resolve()
progress(0)


def _read_indexed(filename):
    """Read a CSV located in BASE, using its 'building_id' column as the index."""
    return pd.read_csv(BASE / filename, index_col='building_id')


# Training features, training target (a single column of the labels file),
# and the test features to predict on.
X = _read_indexed('train_values.csv')
y = _read_indexed('train_labels.csv')['damage_grade']
Xt = _read_indexed('test_values.csv')
progress(10)

# Step 4: Preprocessing

In [None]:
def _add_geo_features(dst, src, eps=1e-5):
    """Append pairwise and triple interactions of the three geo-level ids.

    Args:
        dst: DataFrame that receives the new columns (modified in place).
        src: DataFrame providing 'geo_level_1_id', 'geo_level_2_id' and
            'geo_level_3_id', row-aligned with ``dst``.
        eps: Small constant added to the denominator of the ratio features
            so a zero id does not cause a division by zero.

    Columns are added in a fixed order (sum/product per pair, the triple
    product, then the ratios) so the train and test matrices always share
    the same layout.
    """
    pairs = [(1, 2), (1, 3), (2, 3)]
    geo = {i: src[f'geo_level_{i}_id'] for i in (1, 2, 3)}
    for a, b in pairs:
        dst[f'geo_sum_{a}{b}'] = geo[a] + geo[b]
        dst[f'geo_prod_{a}{b}'] = geo[a] * geo[b]
    dst['geo_prod_123'] = geo[1] * geo[2] * geo[3]
    for a, b in pairs:
        dst[f'geo_div_{a}{b}'] = geo[a] / (geo[b] + eps)


# --- Categorical columns ----------------------------------------------------
# The list of text columns is taken from the training frame; each is cast to
# pandas 'category' dtype in both frames (test only if the column exists).
cat_cols = X.select_dtypes(include='object').columns.tolist()
for c in cat_cols:
    X[c] = X[c].astype('category')
    if c in Xt.columns:
        Xt[c] = Xt[c].astype('category')

# --- Constant columns -------------------------------------------------------
# Columns with at most one distinct value in training carry no information.
nunique = X.nunique()
low_var = nunique[nunique <= 1].index.tolist()
if low_var:
    X.drop(columns=low_var, inplace=True)
    Xt.drop(columns=low_var, inplace=True, errors='ignore')
    cat_cols = [c for c in cat_cols if c not in low_var]

# --- Numeric columns: median imputation -------------------------------------
# The imputer is fitted on the training frame and reused for the test frame.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
imp = SimpleImputer(strategy='median')
X_num = pd.DataFrame(imp.fit_transform(X[num_cols]), columns=num_cols, index=X.index)
Xt_num = pd.DataFrame(imp.transform(Xt[num_cols]), columns=num_cols, index=Xt.index)

# Keep imputed but *unclipped* geo ids for the interaction features below.
# Building them from the raw frames would let any missing id propagate NaN
# into the model matrix even though the base columns were imputed.
geo_cols = [f'geo_level_{i}_id' for i in (1, 2, 3)]
X_geo = X_num[geo_cols].copy()
Xt_geo = Xt_num[geo_cols].copy()

# --- Winsorize to the training 1st/99th percentiles --------------------------
for col in num_cols:
    lower = X_num[col].quantile(0.01)
    upper = X_num[col].quantile(0.99)
    # When both percentiles coincide (e.g. a 0/1 flag that is set in fewer
    # than 1% of rows) clipping would turn the column into a constant and
    # erase its signal, so such columns are left untouched.
    if lower < upper:
        X_num[col] = X_num[col].clip(lower, upper)
        Xt_num[col] = Xt_num[col].clip(lower, upper)

# Per-row count of originally missing numeric values (taken from the raw,
# un-imputed frames).
X_num['missing_count'] = X[num_cols].isnull().sum(axis=1)
Xt_num['missing_count'] = Xt[num_cols].isnull().sum(axis=1)

# --- Geo-level interaction features -----------------------------------------
eps = 1e-5
_add_geo_features(X_num, X_geo, eps=eps)
_add_geo_features(Xt_num, Xt_geo, eps=eps)

# --- Final design matrices --------------------------------------------------
# Numeric features pass through unchanged; categoricals are one-hot encoded.
# NOTE(review): per the scikit-learn docs, with drop='first' and
# handle_unknown='ignore' a category unseen during fit encodes as all zeros,
# i.e. the same as the dropped first category - confirm this is acceptable.
preprocessor = ColumnTransformer([
    ('num', 'passthrough', X_num.columns.tolist()),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), cat_cols)
])

Xp = preprocessor.fit_transform(pd.concat([X_num, X[cat_cols]], axis=1))
Xt_p = preprocessor.transform(pd.concat([Xt_num, Xt[cat_cols]], axis=1))

progress(30)

# Step 5: Cross-Validation Metrics

In [None]:
# Shuffled, stratified 5-fold splitter with a fixed seed.
skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=2025)

# Metric name -> scorer. Each name is looked up below as 'test_<name>' in the
# cross-validation results.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_micro': make_scorer(f1_score, average='micro'),
    'precision_macro': make_scorer(precision_score, average='macro'),
    'recall_macro': make_scorer(recall_score, average='macro'),
}

# Forest configuration evaluated by cross-validation.
rf_cv_params = {
    'n_estimators': 953,
    'max_depth': None,
    'max_features': 0.5,
    'min_samples_split': 9,
    'min_samples_leaf': 3,
    'bootstrap': True,
    'n_jobs': -1,
    'random_state': 2025,
}
rf_cv = RandomForestClassifier(**rf_cv_params)

cv_results = cross_validate(rf_cv, Xp, y, scoring=scoring, cv=skf, n_jobs=-1)

# Report mean and standard deviation across folds for every metric.
print("\nCross-Validation Metrics (5-fold):")
for metric in scoring:
    scores = cv_results[f'test_{metric}']
    mean_score, std_score = scores.mean(), scores.std()
    print(f" - {metric}: {mean_score:.4f} ± {std_score:.4f}")

# Step 6: Model Training with Progress

In [None]:
# Final forest size and the number of trees added between progress updates.
n_total = 953
chunk = 100

# The forest starts empty with warm_start=True and is refitted with a growing
# n_estimators below (see scikit-learn's warm_start documentation for how
# refits extend an existing ensemble).
rf = RandomForestClassifier(
    warm_start=True,
    n_estimators=0,
    random_state=2025,
    n_jobs=-1,
    bootstrap=True,
    min_samples_leaf=3,
    min_samples_split=9,
    max_features=0.5,
    max_depth=None,
)

# Tree-count checkpoints: one every `chunk` trees, ending exactly at n_total.
checkpoints = list(range(chunk, n_total, chunk)) + [n_total]
for cum in checkpoints:
    rf.set_params(n_estimators=cum)
    rf.fit(Xp, y)
    # Training maps onto the 30%..90% span of the overall progress bar.
    progress(30 + int(cum / n_total * 60))

progress(90)


# Step 7: Predict & Save

In [None]:
# Predict on the preprocessed test matrix, then write one 'damage_grade' row
# per test building (indexed like the test frame) next to the input files.
pred = rf.predict(Xt_p)
submission = pd.DataFrame({'damage_grade': pred}, index=Xt.index)
submission.to_csv(BASE / 'RF_preprocessed.csv')

progress(100)
print("File 'RF_preprocessed.csv' salvato")
