<a href="https://colab.research.google.com/github/geen-tech/Richter-s-Predictor-Modeling-Earthquake-Damage/blob/main/LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LightGBM Classifier

Questo notebook implementa un modello LightGBM per la previsione dei danni strutturali.
Include preprocessing, bilanciamento delle classi con SMOTE, valutazione in cross-validation, training e salvataggio delle predizioni.


# Step 1: Imports



In [None]:
# Install the runtime-installed dependency before anything tries to import it.
!pip install category_encoders

import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

# Step 2: Constants

In [None]:
# Every input file is looked up inside the absolute form of the current
# working directory.
BASE_PATH = Path().resolve()
TRAINING_FEATURES_PATH = BASE_PATH.joinpath('train_values.csv')
TRAINING_LABELS_PATH = BASE_PATH.joinpath('train_labels.csv')
TEST_FEATURES_PATH = BASE_PATH.joinpath('test_values.csv')

# Step 3: Load Dataset

In [None]:
# The first column of each CSV is used as the row index.
X = pd.read_csv(TRAINING_FEATURES_PATH, index_col=0)
y_df = pd.read_csv(TRAINING_LABELS_PATH, index_col=0)

# Take the target from a column named 'label' when present; otherwise fall
# back to the first remaining column of the labels file.
if 'label' in y_df.columns:
    y = y_df['label']
else:
    y = y_df.iloc[:, 0]

X_test_final = pd.read_csv(TEST_FEATURES_PATH, index_col=0)

# Step 4: Preprocessing Pipeline

In [None]:
# Split the feature columns by dtype so each group gets its own treatment.
categorical_cols = list(X.select_dtypes(include='object').columns)
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

# Numeric features: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps transform from failing on categories
# that were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric block first, categorical block second (this fixes the output
# column order).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Step 5: LightGBM Pipeline

In [None]:
# SMOTE is a sampler: it exposes fit_resample, not transform. scikit-learn's
# Pipeline rejects such an intermediate step with a TypeError, so the pipeline
# is built with imbalanced-learn's Pipeline instead. That class accepts
# samplers and runs them only while fitting, so predict() and the validation
# folds during cross-validation are scored on un-resampled rows.
model_pipeline = ImbPipeline(steps=[
    ('preprocessing', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LGBMClassifier(random_state=42, n_jobs=-1))
])

# Step 6: Cross-Validation Evaluation

In [None]:
# Stratified, shuffled 5-fold split with a fixed seed.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Score the whole pipeline per fold with micro-averaged F1, using all cores.
scores = cross_val_score(
    estimator=model_pipeline,
    X=X,
    y=y,
    scoring='f1_micro',
    cv=cv,
    n_jobs=-1,
)

print(f"F1_micro CV Score: {scores.mean():.4f} ± {scores.std():.4f}")

# Step 7: Train Final Model

In [None]:
# Fit the full pipeline (preprocessing, SMOTE, classifier) on all of X and y.
model_pipeline.fit(X, y)

# Step 8: Predict & Save

In [None]:
# Output locations, next to the input data.
predictions_path = BASE_PATH / 'predizioni_lightgbm_pipeline.csv'
model_path = BASE_PATH / 'lightgbm_pipeline_model.pkl'

# Predict on the test features and write one 'prediction' column keyed by the
# test set's index.
predictions = model_pipeline.predict(X_test_final)
predictions_frame = pd.DataFrame(
    predictions,
    index=X_test_final.index,
    columns=['prediction'],
)
predictions_frame.to_csv(predictions_path)

# Persist the fitted pipeline so it can be reloaded without retraining.
with model_path.open('wb') as model_file:
    pickle.dump(model_pipeline, model_file)

print("Model and predictions saved.")