In [9]:
import pandas as pd
from calendar import monthrange
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# === Load Data ===
df = pd.read_csv("flood_features.csv")

# === Drop missing / placeholder values ===
# -999 is treated as the "no data" sentinel. It is replaced with a float NaN
# (not pd.NA): substituting pd.NA into a float column turns that column into
# object dtype, which makes later arithmetic slow and forces implicit
# conversions downstream. Any row containing a missing value is then dropped.
df = df.replace(-999, float("nan")).dropna()

# Keep only rows whose month is a valid calendar month. The explicit copy
# detaches the result from the unfiltered frame so that adding columns to
# `df` afterwards does not raise SettingWithCopyWarning.
df = df[df["month"].between(1, 12)].copy()


# === Estimate monthly total precipitation ===
# Number of days in each row's (year, month), computed in one vectorized pass:
# build the first day of the month as a timestamp and read its days_in_month
# (leap years are handled by the calendar logic in pandas).
df["days_in_month"] = pd.to_datetime(
    df[["year", "month"]].assign(day=1)
).dt.days_in_month
# Monthly total = per-row PRECTOTCORR value scaled by the month length.
# NOTE(review): this presumes PRECTOTCORR is a daily mean — confirm against
# the data source.
df["PRECTOTCORR_SUM"] = df["PRECTOTCORR"] * df["days_in_month"]

# === Add simulated flood label ===
# A month is labelled as a flood (1) when its estimated total exceeds 100,
# otherwise 0. The threshold is in the same units as PRECTOTCORR_SUM
# (presumably mm/month — TODO confirm) and can be adjusted as needed.
df["flood"] = (df["PRECTOTCORR_SUM"] > 100).astype(int)

# === Features and label ===
# NOTE(review): the "flood" label is derived from PRECTOTCORR (scaled by the
# month length), and PRECTOTCORR is also a model input below. The classifier
# can therefore largely reconstruct the label from that single feature, so
# evaluation scores will look optimistic. Consider whether PRECTOTCORR should
# remain a feature.
feature_cols = [
    "PRECTOTCORR",
    "RH2M",
    "QV2M",
    "GWETROOT",
    "GWETPROF",
    "GWETTOP",
    "CLOUD_AMT",
]
X = df[feature_cols]
y = df["flood"]

# === Train/test split based on time (train on 2010–2020, test on 2021–2024) ===
# Rows with year <= 2020 form the training set; every later year is held out.
df["is_train"] = df["year"].le(2020)
X_train, y_train = X.loc[df["is_train"]], y.loc[df["is_train"]]
X_test, y_test = X.loc[~df["is_train"]], y.loc[~df["is_train"]]

# === Train the model ===
# Random forest with default hyperparameters; the fixed seed makes the fitted
# model reproducible from run to run. fit() returns the estimator itself.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# === Evaluate ===
# Predict on the held-out years and print per-class metrics followed by the
# confusion matrix (rows: true class, columns: predicted class).
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print("✅ Classification Report:\n", report)
print("🧩 Confusion Matrix:\n", matrix)


✅ Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96       162
           1       1.00      0.87      0.93        90

    accuracy                           0.95       252
   macro avg       0.97      0.93      0.95       252
weighted avg       0.96      0.95      0.95       252

🧩 Confusion Matrix:
 [[162   0]
 [ 12  78]]
