## Train

In [3]:
import pandas as pd
import numpy as np
import joblib
from calendar import monthrange
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# === Load Data ===
# Rows containing the value -999 in any column are discarded (the value is
# treated as a missing-data sentinel). np.nan is used instead of pd.NA so that
# numeric columns stay float rather than being converted to object dtype.
df = pd.read_csv("flood_features_3.csv")
df = df.replace(-999, np.nan).dropna()

# Extract year and month from 'date' column
# (the slicing assumes a YYYYMMDD layout -- TODO confirm against the data source)
if 'date' in df.columns:
    date_str = df['date'].astype(str)
    df['year'] = date_str.str[:4].astype(int)
    df['month'] = date_str.str[4:6].astype(int)

# Fail early with a readable message if the calendar columns are unavailable.
missing_calendar_cols = {'year', 'month'} - set(df.columns)
if missing_calendar_cols:
    raise KeyError(
        f"flood_features_3.csv needs a 'date' column or explicit columns: {sorted(missing_calendar_cols)}"
    )

# Remove rows with invalid months (e.g., 13).
# .copy() makes the filtered frame independent so the column assignments below
# do not write into a slice of the unfiltered frame.
df = df[(df['month'] >= 1) & (df['month'] <= 12)].copy()

# === Calculate estimated monthly total precipitation
# monthrange() is evaluated once per distinct (year, month) pair instead of
# once per row, then looked up for every row.
year_month = list(zip(df["year"].astype(int).tolist(), df["month"].astype(int).tolist()))
days_lookup = {ym: monthrange(*ym)[1] for ym in set(year_month)}
df["days_in_month"] = [days_lookup[ym] for ym in year_month]
# Treats PRECTOTCORR as a mean daily amount for the month -- TODO confirm units.
df["PRECTOTCORR_SUM"] = df["PRECTOTCORR"] * df["days_in_month"]

# === Label as flood if monthly precip exceeds 100mm (adjust if needed)
FLOOD_PRECIP_THRESHOLD = 100
df["flood"] = (df["PRECTOTCORR_SUM"] > FLOOD_PRECIP_THRESHOLD).astype(int)

# === Feature Columns ===
# NOTE(review): the label above is computed directly from PRECTOTCORR, which is
# also a feature, so the scores printed below largely measure how well the
# forest recovers that threshold rule. Consider an independent flood label or
# dropping PRECTOTCORR from the features.
features = ["PRECTOTCORR", "RH2M", "QV2M", "GWETROOT", "GWETPROF", "GWETTOP", "CLOUD_AMT"]

# === Train/Test Split by Time
# Years up to and including LAST_TRAIN_YEAR train the model; later years test it.
LAST_TRAIN_YEAR = 2020
df["is_train"] = df["year"] <= LAST_TRAIN_YEAR
train_rows = df[df["is_train"]]
test_rows = df[~df["is_train"]]
if train_rows.empty or test_rows.empty:
    raise ValueError(
        f"Time split at {LAST_TRAIN_YEAR} produced {len(train_rows)} training rows "
        f"and {len(test_rows)} test rows; both must be non-empty."
    )
X_train = train_rows[features]
y_train = train_rows["flood"]
X_test = test_rows[features]
y_test = test_rows["flood"]

# === Train Model
# Fixed random_state keeps the fitted forest reproducible across runs.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# === Evaluate
y_pred = model.predict(X_test)
print("✅ Classification Report:\n", classification_report(y_test, y_pred))
print("🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# === Save the model
joblib.dump(model, "flood_model.pkl")
print("💾 Saved model to flood_model.pkl")


✅ Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99     13394
           1       1.00      0.94      0.97      6010

    accuracy                           0.98     19404
   macro avg       0.99      0.97      0.98     19404
weighted avg       0.98      0.98      0.98     19404

🧩 Confusion Matrix:
 [[13394     0]
 [  360  5650]]
💾 Saved model to flood_model.pkl


## Predict

In [4]:
def predict_flood(lat, lon, year, month, features_file="flood_features_3.csv", model_file="flood_model.pkl"):
    """Predict the flood class for one grid point and month.

    Looks up the row of ``features_file`` whose coordinates (rounded to four
    decimals) and year/month match the arguments, and feeds its feature
    columns to the classifier stored in ``model_file``.

    Parameters
    ----------
    lat, lon : float
        Coordinates of the point; compared after rounding to 4 decimals.
    year, month : int
        Calendar year and month to look up.
    features_file : str
        CSV holding ``lat``, ``lon``, the feature columns, and either
        ``year``/``month`` columns or a ``date`` column from which they are
        derived the same way the training cell does (first four characters
        are the year, the next two the month).
    model_file : str
        Path of the joblib-serialised model.

    Returns
    -------
    int or None
        The predicted class for the first usable matching row, or ``None``
        (after printing a message) when no row matches or every matching row
        has a missing / -999 feature value.

    Raises
    ------
    KeyError
        If the CSV provides neither ``year``/``month`` nor ``date``.
    """
    import pandas as pd
    import joblib

    features = ["PRECTOTCORR", "RH2M", "QV2M", "GWETROOT", "GWETPROF", "GWETTOP", "CLOUD_AMT"]

    df = pd.read_csv(features_file)

    # The training cell builds year/month from 'date'; do the same here when
    # the raw file does not already carry them, so the lookup cannot fail with
    # a bare KeyError on a file that training accepts.
    if "year" not in df.columns or "month" not in df.columns:
        if "date" not in df.columns:
            raise KeyError(
                f"{features_file} needs 'year' and 'month' columns or a 'date' column"
            )
        date_str = df["date"].astype(str)
        # errors="coerce" turns unparsable dates (e.g. the -999 sentinel) into
        # NaN, which simply never matches the requested year/month.
        df["year"] = pd.to_numeric(date_str.str[:4], errors="coerce")
        df["month"] = pd.to_numeric(date_str.str[4:6], errors="coerce")

    row = df[
        (df["lat"].round(4) == round(lat, 4)) &
        (df["lon"].round(4) == round(lon, 4)) &
        (df["year"] == year) &
        (df["month"] == month)
    ]

    if row.empty:
        print("⚠️ No data found for the given point/date.")
        return None

    # Training dropped rows containing the -999 sentinel, so the model was
    # never fitted on such values; refuse to predict on them.
    X = row[features].replace(-999, float("nan")).dropna()
    if X.empty:
        print("⚠️ Feature values are missing (-999) for the given point/date.")
        return None

    # NOTE(review): joblib.load unpickles the file and can execute arbitrary
    # code -- only pass model files from a trusted source.
    model = joblib.load(model_file)
    pred = model.predict(X)
    return int(pred[0])


## Batch Prediction File

In [5]:
# Reload the classifier persisted by the training cell.
model = joblib.load("flood_model.pkl")

# Score every row of the in-memory frame (relies on `df` and `features`
# defined by the training cell) and keep the result as a new column.
df["predicted_flood"] = model.predict(df[features])

# Export only the location, period and prediction columns.
df.to_csv(
    "flood_predictions.csv",
    columns=["lat", "lon", "year", "month", "predicted_flood"],
    index=False,
)
print("📤 Saved flood_predictions.csv")


📤 Saved flood_predictions.csv
