<a href="https://colab.research.google.com/github/deepinderthind11/deepinderthind11/blob/main/FloodPredictionbyDeep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Kaggle: Flood Prediction

In [10]:
!pip install optuna
!pip install shap

Collecting shap
  Downloading shap-0.45.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (540 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.5/540.5 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.45.1 slicer-0.0.8


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression

# Load the dataset
data = pd.read_csv('flood.csv')

# Display the first few rows of the dataset
data.head()

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,3,8,6,6,4,4,6,2,3,2,...,10,7,4,2,3,4,3,2,6,0.45
1,8,4,5,7,7,9,1,5,5,4,...,9,2,6,2,1,1,9,1,3,0.475
2,3,10,4,1,7,5,4,7,4,9,...,7,4,4,8,6,1,8,3,6,0.515
3,4,4,2,7,3,4,1,4,6,4,...,4,2,6,6,8,8,6,6,10,0.52
4,3,7,5,2,5,8,5,2,7,5,...,7,6,5,3,3,4,4,3,4,0.475


## Data Preprocessing

In [3]:
# Split the data into features and target
X = data.drop(columns=['FloodProbability'])
y = data['FloodProbability']

# Feature engineering: Polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature selection
selector = SelectKBest(f_regression, k=20)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

## Model Building and Training with Hyperparameter Tuning using Optuna

In [4]:
import optuna
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Define objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 100, 200)
    rf_max_depth = trial.suggest_int('rf_max_depth', 10, 30)
    rf_min_samples_split = trial.suggest_int('rf_min_samples_split', 2, 10)
    rf_min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 1, 10)

    gb_n_estimators = trial.suggest_int('gb_n_estimators', 100, 200)
    gb_learning_rate = trial.suggest_float('gb_learning_rate', 0.01, 0.2)
    gb_max_depth = trial.suggest_int('gb_max_depth', 3, 6)

    # Define models with suggested hyperparameters
    rf_model = RandomForestRegressor(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        min_samples_split=rf_min_samples_split,
        min_samples_leaf=rf_min_samples_leaf,
        random_state=42
    )

    gb_model = GradientBoostingRegressor(
        n_estimators=gb_n_estimators,
        learning_rate=gb_learning_rate,
        max_depth=gb_max_depth,
        random_state=42
    )

    # Train and evaluate models
    rf_model.fit(X_train_selected, y_train)
    gb_model.fit(X_train_selected, y_train)

    rf_pred = rf_model.predict(X_test_selected)
    gb_pred = gb_model.predict(X_test_selected)

    rf_mse = mean_squared_error(y_test, rf_pred)
    gb_mse = mean_squared_error(y_test, gb_pred)

    return (rf_mse + gb_mse) / 2

# Optimize hyperparameters with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=3)

# Print best hyperparameters
print('Best hyperparameters: ', study.best_params)

[I 2024-05-19 05:06:00,012] A new study created in memory with name: no-name-b9327c19-8372-4bf0-afb4-442cb8101028
[I 2024-05-19 05:07:06,965] Trial 0 finished with value: 0.0013773814240611948 and parameters: {'rf_n_estimators': 102, 'rf_max_depth': 29, 'rf_min_samples_split': 3, 'rf_min_samples_leaf': 1, 'gb_n_estimators': 152, 'gb_learning_rate': 0.03056417720771276, 'gb_max_depth': 3}. Best is trial 0 with value: 0.0013773814240611948.
[I 2024-05-19 05:08:02,850] Trial 1 finished with value: 0.0013439622254661696 and parameters: {'rf_n_estimators': 109, 'rf_max_depth': 17, 'rf_min_samples_split': 3, 'rf_min_samples_leaf': 5, 'gb_n_estimators': 154, 'gb_learning_rate': 0.0507035290623724, 'gb_max_depth': 3}. Best is trial 1 with value: 0.0013439622254661696.
[I 2024-05-19 05:09:13,863] Trial 2 finished with value: 0.0013346262765832348 and parameters: {'rf_n_estimators': 140, 'rf_max_depth': 23, 'rf_min_samples_split': 3, 'rf_min_samples_leaf': 9, 'gb_n_estimators': 127, 'gb_learning

Best hyperparameters:  {'rf_n_estimators': 140, 'rf_max_depth': 23, 'rf_min_samples_split': 3, 'rf_min_samples_leaf': 9, 'gb_n_estimators': 127, 'gb_learning_rate': 0.11817001226503293, 'gb_max_depth': 6}


## Training the Best Model


In [7]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

# Extract best hyperparameters
best_params = study.best_params

best_rf_model = RandomForestRegressor(
    n_estimators=best_params['rf_n_estimators'],
    max_depth=best_params['rf_max_depth'],
    min_samples_split=best_params['rf_min_samples_split'],
    min_samples_leaf=best_params['rf_min_samples_leaf'],
    random_state=42
)

best_gb_model = GradientBoostingRegressor(
    n_estimators=best_params['gb_n_estimators'],
    learning_rate=best_params['gb_learning_rate'],
    max_depth=best_params['gb_max_depth'],
    random_state=42
)

# Ensure models are fitted
best_rf_model.fit(X_train_selected, y_train)
best_gb_model.fit(X_train_selected, y_train)

# Ensemble model with stacking
estimators = [
    ('rf', best_rf_model),
    ('gb', best_gb_model)
]
stacking_model = StackingRegressor(estimators=estimators, final_estimator=Ridge())
stacking_model.fit(X_train_selected, y_train)


## Model Evaluation


In [8]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Predict on the test set
y_pred = stacking_model.predict(X_test_selected)

# Evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'Test Set MSE: {mse:.4f}')
print(f'Test Set R2: {r2:.4f}')
print(f'Test Set MAE: {mae:.4f}')


Test Set MSE: 0.0013
Test Set R2: 0.4733
Test Set MAE: 0.0290
