In [26]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

from xgboost import XGBClassifier

In [27]:
df = pd.read_csv(r"C:\Users\Administrator\Desktop\Major Project\Experiment\Flood_Prediction\balanced_flood_risk.csv")  
df.dropna(inplace=True)

print(df.head())
print(df["Flood Occurred"].value_counts())

   Rainfall (mm)  Temperature (°C)  Humidity (%)  Wind Speed (km/h)  \
0      23.463404         24.440599     74.783533          10.837124   
1      61.562503         35.052576     62.957884          11.627680   
2      10.033949         27.143099     56.138757           2.162353   
3      17.211150         34.751848     54.807533          10.942742   
4      17.275776         26.483281     37.905690           8.111575   

   Flood Occurred  
0               0  
1               0  
2               0  
3               0  
4               0  
Flood Occurred
0    2500
1    2500
Name: count, dtype: int64


In [28]:
X = df.drop("Flood Occurred", axis=1)
y = df["Flood Occurred"]

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [30]:
num_cols = X.select_dtypes(include='number').columns.tolist()

preprocessor = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler())
    ]), num_cols)
])

In [31]:
rf_pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("classifier", RandomForestClassifier(random_state=42))
])

param_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10, 20]
}

grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

y_pred_rf = grid_search.predict(X_test)
print("Best RF Parameters:", grid_search.best_params_)
print("Random Forest Performance:")
print(classification_report(y_test, y_pred_rf))

joblib.dump(grid_search.best_estimator_, r"C:\Users\Administrator\Desktop\Major Project\Experiment\Flood_Prediction\balanced_flood_risk.csv")

Best RF Parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
Random Forest Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       500
           1       1.00      1.00      1.00       500

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



['C:\\Users\\Administrator\\Desktop\\Major Project\\Experiment\\Flood_Prediction\\balanced_flood_risk.csv']

In [32]:
import joblib

# Save model
joblib.dump(rf_pipeline, r'C:\Users\Administrator\Desktop\Major Project\Experiment\Flood_Prediction\models\rf_model.pkl')

['C:\\Users\\Administrator\\Desktop\\Major Project\\Experiment\\Flood_Prediction\\models\\rf_model.pkl']