<a href="https://colab.research.google.com/github/fact-h/LightGBM-for-flooding-prdiction/blob/main/LightGBM4flooding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Rapid Prediction Model for Urban Floods Based on a Light Gradient Boosting Machine Approach and Hydrological–Hydraulic Model](https://doi.org/10.1007/s13753-023-00465-2)

- Target: Predicting the maximum water depth of 7 sites based on the sequences of rainfall and tide level.
- ML model: [LightGBM](https://lightgbm.readthedocs.io/en/latest/)
- Inputs: Ten features of rainfall and tide level and one feature about the location of flooded sites.
- Output: Maximum water depth

## Modules import

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot as plt

%matplotlib inline

try:
    # To enable interactive mode you should install ipywidgets
    # https://github.com/jupyter-widgets/ipywidgets
    from ipywidgets import interact, SelectMultiple
    INTERACTIVE = True
except ImportError:
    INTERACTIVE = False

## Load the csv files

- `test_df.csv`
- `train_valid.csv`

`test_df.csv` is the test set. `train_valid.csv` includes the training set and validation set.


In [None]:
# Paths follow the Colab convention: upload both CSV files to `/content` first.
test_df = pd.read_csv('/content/test_df.csv')
train_valid = pd.read_csv('/content/train_valid.csv')

# `Point` is set to categorical feature, instead of one-hot encoding when training LightGBM.
# `Series.astype` returns a new Series and leaves the frame untouched, so the
# converted column has to be assigned back for the dtype change to take effect.
train_valid['Point'] = train_valid['Point'].astype('category')
test_df['Point'] = test_df['Point'].astype('category')

### Define inputs `X` and outputs `y` for the training set, validation set, and test set.


In [None]:
# With the published data there are 342 samples in the training set,
# 98 samples in the validation set, and 50 samples in the test set
# (490 samples in total).

# The validation set is 20 % of the *whole* data set (train + validation + test).
# Derive the total from the loaded frames instead of hard-coding 490 so the
# split stays correct if the CSV files change.
n_total = len(train_valid) + len(test_df)
val_split = round(0.2 * n_total)
assert 0 < val_split < len(train_valid), 'validation split must leave training rows'

# The first `val_split` rows of `train_valid` form the validation set and the
# remaining rows the training set; `.iloc` makes the positional slicing explicit.
valid_part = train_valid.iloc[:val_split]
train_part = train_valid.iloc[val_split:]

y_test = test_df['depth']                      # shape: 50 * 1
y_val = valid_part['depth']                    # shape: 98 * 1
y_train = train_part['depth']                  # shape: 342 * 1

X_test = test_df.drop(columns=['depth'])       # shape: 50 * 11
X_val = valid_part.drop(columns=['depth'])     # shape: 98 * 11
X_train = train_part.drop(columns=['depth'])   # shape: 342 * 11

##  Train the LightGBM model

In [None]:
# `lgb.record_evaluation` fills this dict with the per-iteration validation
# metrics; it is read again when plotting the training curves.
evals_result = {}
gbm = lgb.LGBMRegressor(learning_rate=0.04,
                        n_estimators=500,
                        n_jobs=4)
# Logging and early stopping are configured through callbacks only. The
# `verbose` argument of `fit()` was removed in LightGBM 4.0 and passing it
# raises a TypeError there, so it is no longer used.
gbm.fit(X_train, y_train,
        categorical_feature=['Point'],  # specify the categorical feature
        eval_set=[(X_val, y_val)],
        eval_metric=['rmse', 'l1'],
        callbacks=[lgb.early_stopping(5),  # stop after 5 rounds without improvement
                   lgb.record_evaluation(evals_result)])

New categorical_feature is ['Point']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 5 rounds.
Early stopping, best iteration is:
[460]	valid_0's rmse: 0.247512	valid_0's l2: 0.0612621	valid_0's l1: 0.18519


LGBMRegressor(learning_rate=0.04, n_estimators=500, n_jobs=4)

## Apply the one-hot encoding on the categorical feature

In [None]:
object_cols = ['Point']

def oneHot(X_train, X_val, X_test, object_cols):
  """Replace the categorical columns of all three feature sets by one-hot columns.

  The encoder is fitted on the training set only and then applied unchanged to
  the validation and test sets, so the three returned frames always share the
  same columns in the same order, even when a category is absent from one of
  the smaller sets. Categories unseen during fitting are encoded as all zeros
  (`handle_unknown='ignore'`).

  Args:
    X_train: training features (DataFrame); used to fit the encoder.
    X_val: validation features (DataFrame).
    X_test: test features (DataFrame).
    object_cols: list of the categorical column names to encode.

  Returns:
    Tuple `(OH_X_train, OH_X_valid, OH_X_test)`: the input frames with
    `object_cols` dropped and the one-hot columns appended. The new columns
    are labelled `<column>_<category>`.
  """
  # Keep the encoder's default (sparse) output and densify it below: the
  # keyword that selects dense output was renamed between scikit-learn
  # releases (`sparse` -> `sparse_output`), so neither spelling is portable.
  OH_encoder = OneHotEncoder(handle_unknown='ignore')
  OH_encoder.fit(X_train[object_cols])

  # String labels for the new columns. Integer labels mixed with the string
  # labels of the numeric columns are rejected by recent scikit-learn
  # estimators when they validate feature names.
  OH_names = [f'{col}_{cat}'
              for col, cats in zip(object_cols, OH_encoder.categories_)
              for cat in cats]

  def encode(X):
    # Reuse the row index of `X` so the concat aligns row by row.
    OH_cols = pd.DataFrame(OH_encoder.transform(X[object_cols]).toarray(),
                           columns=OH_names, index=X.index)
    # Delete the original category column and append its one-hot encoding.
    return pd.concat([X.drop(object_cols, axis=1), OH_cols], axis=1)

  return encode(X_train), encode(X_val), encode(X_test)

OH_X_train, OH_X_valid, OH_X_test = oneHot(X_train, X_val, X_test, object_cols)

## Train the Random Forest model

In [None]:
# Random forest baseline on the one-hot encoded features; the fixed seed keeps
# the fitted forest reproducible between runs.
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(X=OH_X_train, y=y_train)



RandomForestRegressor(random_state=1)

## Train the XGBoost model

In [None]:
# Early stopping is configured on the estimator itself: passing
# `early_stopping_rounds` to `fit()` is deprecated since XGBoost 1.6 and no
# longer accepted by recent releases. (Requires xgboost >= 1.6.)
xgb_model = XGBRegressor(learning_rate=0.03, n_estimators=300, n_jobs=4,
                         early_stopping_rounds=5)
# The validation set is only used to monitor the metric for early stopping.
xgb_model.fit(OH_X_train, y_train,
              eval_set=[(OH_X_valid, y_val)],
              verbose=False)



XGBRegressor(learning_rate=0.03, n_estimators=300, n_jobs=4)

## Train the Support Vector Regression model

In [None]:
# Support vector regression baseline with scikit-learn's default settings,
# fitted on the one-hot encoded training features.
SVR_model = SVR()
SVR_model.fit(X=OH_X_train, y=y_train)



SVR()

## Train the Decision Tree model

In [None]:
# Decision tree baseline on the one-hot encoded features.
# The tree builder permutes the features at every split, so ties between
# equally good splits are broken randomly; fix the seed (same value as the
# random forest) so the fitted tree and its scores are reproducible.
DT_model = DecisionTreeRegressor(random_state=1)
DT_model.fit(OH_X_train, y_train)



DecisionTreeRegressor()

## Train the k-nearest neighbors model

In [None]:
# k-nearest-neighbours baseline with scikit-learn's default settings,
# fitted on the one-hot encoded training features.
KNN_model = KNeighborsRegressor()
KNN_model.fit(X=OH_X_train, y=y_train)



KNeighborsRegressor()

## Plot the metric during the process of model training

In [None]:
def render_metric(metric_name):
    """Plot one validation metric recorded while training the LightGBM model.

    Args:
        metric_name: name of a metric stored in `evals_result`
            (for example 'rmse', 'l2' or 'l1').
    """
    lgb.plot_metric(evals_result, metric=metric_name, figsize=(10, 5))
    plt.show()

if INTERACTIVE:
    # create widget to switch between metrics; 'l1' is offered as well because
    # it is one of the metrics requested through `eval_metric` during training
    interact(render_metric, metric_name=['rmse', 'l2', 'l1'])
else:
    render_metric('rmse')

interactive(children=(Dropdown(description='metric_name', options=('rmse', 'l2'), value='rmse'), Output()), _d…

## Plot the feature importance

In [None]:
def render_plot_importance(importance_type, max_features=10,
                           ignore_zero=True, precision=3):
    """Draw the feature-importance chart of the fitted LightGBM model `gbm`.

    Args:
        importance_type: importance measure passed to LightGBM
            ('split' or 'gain' are offered by the widget below).
        max_features: maximum number of features shown in the chart.
        ignore_zero: forwarded to `lgb.plot_importance` as `ignore_zero`.
        precision: forwarded to `lgb.plot_importance` as `precision`.
    """
    plot_options = dict(importance_type=importance_type,
                        max_num_features=max_features,
                        ignore_zero=ignore_zero,
                        figsize=(12, 8),
                        precision=precision)
    lgb.plot_importance(gbm, **plot_options)
    plt.show()

if not INTERACTIVE:
    # static fallback when ipywidgets is not installed
    render_plot_importance(importance_type='split')
else:
    # create widget for interactive feature importance plot
    interact(render_plot_importance,
             importance_type=['split', 'gain'],
             max_features=(1, X_train.shape[-1]),
             precision=(0, 10))

interactive(children=(Dropdown(description='importance_type', options=('split', 'gain'), value='split'), IntSl…

## Predict and evaluate the models on the test data set
Metrics:
- RMSE
- MAPE
- SSE
- NSE
- $R^2_{adjusted}$



## Evaluate the LightGBM model

In [None]:
# evaluation function
def modelEva(y_test, y_pred, modelName, n_features=None):
  """Print and return the evaluation metrics of one model on the test set.

  RMSE and MAPE are computed after mapping the values back to raw depths;
  SSE, NSE and the adjusted R^2 are computed on the values exactly as passed in.

  Args:
    y_test: observed target values (must provide `.shape` and `.mean()`,
      e.g. a pandas Series).
    y_pred: predicted target values, same length as `y_test`.
    modelName: label used as the column name of the returned frame.
    n_features: number of predictors `k` used for the adjusted R^2. Defaults
      to the column count of the notebook-level `X_test`; pass the column
      count of the feature matrix the model was actually fitted on (e.g. the
      one-hot encoded frame) when it differs.

  Returns:
    A one-column DataFrame named `modelName` with the rows
    'RMSE', 'MAPE', 'SSE', 'NSE' and 'R2_adj'.
  """
  # Affine map from the values the models work on back to raw depths.
  # NOTE(review): presumably the std and mean used to normalise `depth` - confirm.
  depth_scale = 0.2497080940919035
  depth_offset = 0.7216249550788679
  y_pred_raw = y_pred * depth_scale + depth_offset
  y_test_raw = y_test * depth_scale + depth_offset

  # RMSE
  rmse_test_raw = mean_squared_error(y_test_raw, y_pred_raw) ** 0.5  # the square root of the MSE is the RMSE
  print(f'The RMSE of prediction is: {rmse_test_raw}')

  # MAPE
  mape = mean_absolute_percentage_error(y_test_raw, y_pred_raw)
  print(f'The MAPE of prediction is: {mape}')

  # SSE and NSE (NSE = 1 - SSE / SST)
  H_obs = y_test
  H_m = y_pred
  H_obs_mean = H_obs.mean()
  SSE = ((H_obs - H_m)**2).sum()
  SST = ((H_obs - H_obs_mean)**2).sum()
  NSE = 1 - SSE / SST
  n = y_test.shape[0]
  # Fall back to the notebook-level `X_test` only when the caller did not say
  # how many predictors the evaluated model used.
  k = X_test.shape[1] if n_features is None else n_features

  # R^2_{adjusted}
  Adjusted_R2 = 1 - (1 - NSE) * (n-1) / (n-k-1)
  print(f'The SSE of prediction is: {SSE}')
  print(f'The NSE of prediction is: {NSE}')
  print(f'The adjusted R2 of prediction is: {Adjusted_R2}')

  data = [rmse_test_raw, mape, SSE, NSE, Adjusted_R2]
  index = ['RMSE','MAPE','SSE','NSE','R2_adj']
  columns = [modelName]
  return pd.DataFrame(data=data,index=index,columns=columns)

In [None]:
# predict of LightGBM
y_pred = gbm.predict(X_test,num_iteration=gbm.best_iteration_)
modelName = 'LightGBM'
LightGBM_result = modelEva(y_test, y_pred, modelName)

The RMSE of raw prediction is: 0.03455421711024856
The MAPE of raw prediction is: 0.047455704102190424
The SSE of raw prediction is: 0.9574296657830896
The NSE of raw prediction is: 0.9854019807750426
The adjusted R2 of raw prediction is: 0.9811762383678181


## Evaluate the RF model

In [None]:
# Random forest: predict on the one-hot encoded test features, then score.
modelName = 'RF'
RF_pred = forest_model.predict(OH_X_test)
RF_result = modelEva(y_test=y_test, y_pred=RF_pred, modelName=modelName)

The RMSE of raw prediction is: 0.03977443282712665
The MAPE of raw prediction is: 0.07592471275000459
The SSE of raw prediction is: 1.2685650890382454
The NSE of raw prediction is: 0.9806580700183928
The adjusted R2 of raw prediction is: 0.9673188079621119




## Evaluate the XGBoost model

In [None]:
# XGBoost: predict on the one-hot encoded test features, then score.
modelName = 'XGBoost'
xgb_pred = xgb_model.predict(OH_X_test)
XGBoost_result = modelEva(y_test=y_test, y_pred=xgb_pred, modelName=modelName)

The RMSE of raw prediction is: 0.036647910104696875
The MAPE of raw prediction is: 0.054807523162045534
The SSE of raw prediction is: 1.0769689132535705
The NSE of raw prediction is: 0.9835793547429948
The adjusted R2 of raw prediction is: 0.9722547718071293


## Evaluate the SVR model

In [None]:
# SVR: predict on the one-hot encoded test features, then score.
modelName = 'SVR'
SVR_pred = SVR_model.predict(OH_X_test)
SVR_result = modelEva(y_test=y_test, y_pred=SVR_pred, modelName=modelName)

The RMSE of raw prediction is: 0.0562998854841833
The MAPE of raw prediction is: 0.10292485640531847
The SSE of raw prediction is: 2.5416736557002038
The NSE of raw prediction is: 0.9612468652105821
The adjusted R2 of raw prediction is: 0.9345205653558112




## Evaluate the DT model

In [None]:
# Decision tree: predict on the one-hot encoded test features, then score.
modelName = 'DT'
DT_pred = DT_model.predict(OH_X_test)
DT_result = modelEva(y_test=y_test, y_pred=DT_pred, modelName=modelName)

The RMSE of raw prediction is: 0.041395692009998805
The MAPE of raw prediction is: 0.05128257055424308
The SSE of raw prediction is: 1.3740896192561227
The NSE of raw prediction is: 0.979049127684686
The adjusted R2 of raw prediction is: 0.9646002502258487




## Evaluate the KNN model

In [None]:
# k-nearest neighbours: predict on the one-hot encoded test features, then score.
modelName = 'KNN'
KNN_pred = KNN_model.predict(OH_X_test)
KNN_result = modelEva(y_test=y_test, y_pred=KNN_pred, modelName=modelName)

The RMSE of raw prediction is: 0.10208212639358936
The MAPE of raw prediction is: 0.19249710811659665
The SSE of raw prediction is: 8.356110615427026
The NSE of raw prediction is: 0.8725936037190742
The adjusted R2 of raw prediction is: 0.7847271235253324




In [None]:
# Side-by-side comparison of all evaluated models, one column per model.
# The decision tree is included as well: it is evaluated above and also
# exported to the CSV file, so it belongs in the summary table.
pd.concat([LightGBM_result, RF_result, XGBoost_result, SVR_result, DT_result, KNN_result], axis='columns')

Unnamed: 0,LightGBM,RF,XGBoost,SVR,KNN
RMSE,0.034554,0.039774,0.036648,0.0563,0.102082
MAPE,0.047456,0.075925,0.054808,0.102925,0.192497
SSE,0.95743,1.268565,1.076969,2.541674,8.356111
NSE,0.985402,0.980658,0.983579,0.961247,0.872594
R2_adj,0.981176,0.967319,0.972255,0.934521,0.784727


### Save the ground-truth values and the predicted values

In [None]:
# The `*_raw` values only exist as locals inside `modelEva`, so they are
# recomputed here from the notebook-level predictions with the same affine
# constants that `modelEva` uses to map values back to raw depths.
# NOTE(review): presumably the std and mean used to normalise `depth` - confirm.
DEPTH_SCALE = 0.2497080940919035
DEPTH_OFFSET = 0.7216249550788679

# One entry per output column: the observations first, then every model.
# The decision-tree column is labelled 'DT' (it was mislabelled 'DF').
series_by_col = {
    'Real': y_test,
    'LGBM': y_pred,
    'RF': RF_pred,
    'XGBoost': xgb_pred,
    'SVR': SVR_pred,
    'DT': DT_pred,
    'KNN': KNN_pred,
}
cols = list(series_by_col)
# Shape (n_samples, n_columns): one row per test sample.
depth = np.column_stack([np.asarray(values) * DEPTH_SCALE + DEPTH_OFFSET
                         for values in series_by_col.values()])
df = pd.DataFrame(data=depth, columns=cols)
df.to_csv('result.csv')