In [43]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# --------------------
# 1. Load data
# --------------------
# Read the pre-cleaned dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv("../cleaned_cloudburst_data.csv")

# Columns fed to the model. The order of this list is the column order the
# estimator is fitted on, so prediction inputs must follow it.
# NOTE(review): the recorded run of this cell reports perfect hold-out and
# cross-validation scores. That usually means one of these columns determines
# the label - check whether "CloudBurstTodayBinary" was derived from
# "Rainfall" (or another listed column) during cleaning - TODO confirm.
selected_features = [
    # rainfall / evaporation
    "Rainfall",
    "Evaporation",
    # humidity (9am / 3pm)
    "Humidity9am",
    "Humidity3pm",
    # pressure (9am / 3pm)
    "Pressure9am",
    "Pressure3pm",
    # cloud cover (9am / 3pm)
    "Cloud9am",
    "Cloud3pm",
    # temperature (9am / 3pm)
    "Temp9am",
    "Temp3pm",
    # wind speed
    "WindGustSpeed",
    "WindSpeed9am",
    "WindSpeed3pm",
    # wind direction as sin/cos pairs (presumably a cyclical encoding
    # produced by the cleaning step - verify)
    "WindDir9am_sin",
    "WindDir9am_cos",
    "WindDir3pm_sin",
    "WindDir3pm_cos",
]

# Feature matrix and target vector.
X = df.loc[:, selected_features]
y = df.loc[:, "CloudBurstTodayBinary"]

# --------------------
# 2. Train/test split
# --------------------
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# partitioning repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# --------------------
# 3. Train model
# --------------------
# Logistic regression fitted on the training partition only.
#   - solver="liblinear": the solver chosen by the original author.
#   - class_weight="balanced": weights each class inversely to its frequency
#     in y_train, so the minority class is not ignored.
#   - random_state=42: scikit-learn documents that liblinear uses the RNG to
#     shuffle the data; pinning the seed (same value as the split) keeps the
#     fitted model reproducible on Restart & Run All.
#   - max_iter=1000: upper bound on solver iterations.
# NOTE(review): the features are passed unscaled even though they appear to
# sit on very different numeric ranges; a regularised linear model is
# sensitive to that. Consider a StandardScaler + LogisticRegression pipeline
# once downstream consumers of the pickle are known - TODO confirm.
model = LogisticRegression(
    max_iter=1000,
    solver="liblinear",
    class_weight="balanced",
    random_state=42,
)
model.fit(X_train, y_train)

# --------------------
# 4. Evaluate
# --------------------
# Score the fitted model on the held-out partition and build both summaries
# before printing them.
# NOTE(review): the recorded run shows a confusion matrix with zero errors
# and 1.0000 +/- 0.0000 CV accuracy. Treat that as a sign of possible target
# leakage rather than a real result until the features have been audited.
y_pred = model.predict(X_test)
holdout_confusion = confusion_matrix(y_test, y_pred)
holdout_report = classification_report(y_test, y_pred)

print("Confusion Matrix:")
print(holdout_confusion)
print("\nClassification Report:")
print(holdout_report)

# Cross-validation accuracy
# Five folds over the full X / y (held-out rows included). cross_val_score
# refits a clone of the estimator per fold, so the fit above is not reused.
# NOTE(review): the classes are imbalanced in the recorded output, so plain
# accuracy can look better than the model is; consider also reporting
# balanced accuracy or F1.
cv_scores = cross_val_score(model, X, y, scoring="accuracy", cv=5)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"\nCross-validation accuracy: {cv_mean:.4f} ± {cv_std:.4f}")

# --------------------
# 5. Save model
# --------------------
# Persist the fitted estimator next to the notebook (path is relative to the
# working directory) and confirm where it went.
model_path = "logistic_model.pkl"
joblib.dump(model, model_path)
print(f"\n✅ Model saved as '{model_path}'")

# Testing
# Round-trip check: reload the estimator that was just written to disk and
# score one hand-written observation with it. This rebinds `model` to the
# reloaded object.
# joblib.load unpickles the file, so only load artefacts you produced.
model = joblib.load("logistic_model.pkl")

# One sample row; the keys are listed in the same order as the training
# columns.
# NOTE(review): Cloud9am / Cloud3pm are 40 / 50 here - confirm they are on the
# same scale as the training column (if cloud cover is recorded in oktas the
# valid range would be 0-8, making these values far outside the data).
sample_observation = dict(
    Rainfall=10,
    Evaporation=3,
    Humidity9am=75,
    Humidity3pm=55,
    Pressure9am=1012,
    Pressure3pm=1010,
    Cloud9am=40,
    Cloud3pm=50,
    Temp9am=20,
    Temp3pm=28,
    WindGustSpeed=25,
    WindSpeed9am=12,
    WindSpeed3pm=18,
    WindDir9am_sin=0.3,
    WindDir9am_cos=0.8,
    WindDir3pm_sin=-0.4,
    WindDir3pm_cos=0.9,
)
user_input = pd.DataFrame([sample_observation])

# Column 1 of predict_proba is the probability of the second class label
# (1.0 in the recorded classification report).
proba = model.predict_proba(user_input)[:, 1]
cloudburst_pct = proba[0] * 100
print(f"Predicted Cloudburst Probability: {cloudburst_pct:.2f}%")

Confusion Matrix:
[[12341     0]
 [    0  3943]]

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     12341
         1.0       1.00      1.00      1.00      3943

    accuracy                           1.00     16284
   macro avg       1.00      1.00      1.00     16284
weighted avg       1.00      1.00      1.00     16284


Cross-validation accuracy: 1.0000 ± 0.0000

✅ Model saved as 'logistic_model.pkl'
Predicted Cloudburst Probability: 100.00%
