# **Training and Testing Data Preparation**

## **Import Required Libraries**

In [25]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

## **Read the Data**

In [26]:
# Read the flood samples data.
# The path is a raw string so the Windows backslashes are taken literally;
# in a normal string "\R", "\F", "\D" and "\S" are invalid escape sequences.
gdf = gpd.read_file(r"D:\Research Works\Flood\Flood_Risk_Zonation_of_Maldah\Datasets\Shapefiles\Flood_Sample_Data.shp")
print(gdf.shape)
gdf.head()

(2260, 20)


Unnamed: 0,Relief_Amp,Dist_to_Ri,LULC,TWI,Rainfall,Clay_Conte,STI,TRI,TPI,SPI,NDVI,Slope,MFI,Elevation,Flood,MNDWI,Drainage_D,Lithology,Geomorphol,geometry
0,0.0,0.12973,3.0,0.541704,0.361115,0.0,0.0,0.0,0.587173,3e-06,0.50675,0.094584,0.332813,0.12,1,0.578332,0.664905,4.0,1.0,POINT (88.23513 25.05760)
1,0.061224,0.268667,3.0,0.181033,0.798696,0.714286,0.0,0.163975,0.512312,0.0,0.499448,0.021498,0.683954,0.253333,1,0.430116,0.267209,4.0,1.0,POINT (87.84919 25.19956)
2,0.183673,0.407958,3.0,0.28866,0.795741,0.690476,0.000213,0.307235,0.425566,2e-06,0.484023,0.260136,0.787325,0.333333,1,0.28613,0.290536,4.0,1.0,POINT (88.06961 25.31082)
3,0.0,0.122921,3.0,0.526838,0.369409,0.0,0.0,0.0,0.519969,2e-06,0.563438,0.027192,0.341595,0.2,1,0.445695,0.656115,4.0,1.0,POINT (88.23191 25.06224)
4,0.428571,0.012231,3.0,0.241343,0.864844,0.738095,0.000693,0.517983,0.520828,5e-06,0.598922,0.297545,0.857026,0.48,1,0.338596,0.424167,4.0,6.0,POINT (88.06493 25.25316)


## **Preprocess the Data**

In [27]:
# Inspect the column names, non-null counts and dtypes
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 2260 entries, 0 to 2259
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Relief_Amp  2260 non-null   float64 
 1   Dist_to_Ri  2260 non-null   float64 
 2   LULC        2260 non-null   float64 
 3   TWI         2260 non-null   float64 
 4   Rainfall    2260 non-null   float64 
 5   Clay_Conte  2260 non-null   float64 
 6   STI         2260 non-null   float64 
 7   TRI         2260 non-null   float64 
 8   TPI         2260 non-null   float64 
 9   SPI         2260 non-null   float64 
 10  NDVI        2260 non-null   float64 
 11  Slope       2260 non-null   float64 
 12  MFI         2260 non-null   float64 
 13  Elevation   2260 non-null   float64 
 14  Flood       2260 non-null   int64   
 15  MNDWI       2260 non-null   float64 
 16  Drainage_D  2260 non-null   float64 
 17  Lithology   2260 non-null   float64 
 18  Geomorphol  2260 non-null   float64 
 19

In [28]:
# List the current column names (they are renamed in the next cell)
gdf.columns

Index(['Relief_Amp', 'Dist_to_Ri', 'LULC', 'TWI', 'Rainfall', 'Clay_Conte',
       'STI', 'TRI', 'TPI', 'SPI', 'NDVI', 'Slope', 'MFI', 'Elevation',
       'Flood', 'MNDWI', 'Drainage_D', 'Lithology', 'Geomorphol', 'geometry'],
      dtype='object')

In [29]:
# Change the name of the columns.
# Explicit old -> new mapping for the abbreviated field names. Mapping by name
# (instead of zipping two lists by position) means a change in the column
# order or count cannot silently attach the wrong label to a column.
new_col_dict = {
    "Relief_Amp": "Relief_Amplitude",
    "Dist_to_Ri": "Dist_to_River",
    "Clay_Conte": "Clay_Content",
    "Drainage_D": "Drainage_Density",
    "Geomorphol": "Geomorphology",
}

# Full list of resulting column names, in the frame's current column order
# (columns without an entry in the mapping keep their name).
new_col_names = [new_col_dict.get(col, col) for col in gdf.columns]

# Rename in place; keys that are not present in the frame are ignored, so
# re-running this cell is harmless.
gdf.rename(columns=new_col_dict, inplace=True)

In [30]:
# Preview the first rows to confirm the new column names
gdf.head()

Unnamed: 0,Relief_Amplitude,Dist_to_River,LULC,TWI,Rainfall,Clay_Content,STI,TRI,TPI,SPI,NDVI,Slope,MFI,Elevation,Flood,MNDWI,Drainage_Density,Lithology,Geomorphology,geometry
0,0.0,0.12973,3.0,0.541704,0.361115,0.0,0.0,0.0,0.587173,3e-06,0.50675,0.094584,0.332813,0.12,1,0.578332,0.664905,4.0,1.0,POINT (88.23513 25.05760)
1,0.061224,0.268667,3.0,0.181033,0.798696,0.714286,0.0,0.163975,0.512312,0.0,0.499448,0.021498,0.683954,0.253333,1,0.430116,0.267209,4.0,1.0,POINT (87.84919 25.19956)
2,0.183673,0.407958,3.0,0.28866,0.795741,0.690476,0.000213,0.307235,0.425566,2e-06,0.484023,0.260136,0.787325,0.333333,1,0.28613,0.290536,4.0,1.0,POINT (88.06961 25.31082)
3,0.0,0.122921,3.0,0.526838,0.369409,0.0,0.0,0.0,0.519969,2e-06,0.563438,0.027192,0.341595,0.2,1,0.445695,0.656115,4.0,1.0,POINT (88.23191 25.06224)
4,0.428571,0.012231,3.0,0.241343,0.864844,0.738095,0.000693,0.517983,0.520828,5e-06,0.598922,0.297545,0.857026,0.48,1,0.338596,0.424167,4.0,6.0,POINT (88.06493 25.25316)


## **Rename the Values of the Categorical Variables**

In [31]:
# Lookup tables translating the integer class codes (starting at 1) of each
# categorical layer into readable labels. The position of a label in its
# list determines the code it is assigned.

# Geomorphology classes (codes 1-8)
geomorpholoy_dict = dict(enumerate(
    [
        "Active_Flood_Plain",
        "Embankment",
        "Older_Alluvial_Plain",
        "Older_Flood_Plain",
        "Pond",
        "River",
        "WatBod_Lake",
        "Younger_Alluvial_Plain",
    ],
    start=1,
))

# Lithology classes (codes 1-5)
lithology_dict = dict(enumerate(
    [
        "Cl_wi_S_Si_Ir_N",
        "Fe_Ox_S_Si_Cl",
        "S_Si_Gr",
        "S_Si_Cl",
        "S_Si_Cl_wi_Cal_Co",
    ],
    start=1,
))

# Land use / land cover classes (codes 1-5)
lulc_dict = dict(enumerate(
    [
        "Waterbodies",
        "Natural_Vegetation",
        "Agricultural_Field",
        "Bare_Ground",
        "Built_UP_Area",
    ],
    start=1,
))

In [32]:
# Swap the numeric class codes for their labels, one lookup table per column
category_labels = {
    "Geomorphology": geomorpholoy_dict,
    "Lithology": lithology_dict,
    "LULC": lulc_dict,
}
gdf.replace(to_replace=category_labels, inplace=True)
gdf.head()

Unnamed: 0,Relief_Amplitude,Dist_to_River,LULC,TWI,Rainfall,Clay_Content,STI,TRI,TPI,SPI,NDVI,Slope,MFI,Elevation,Flood,MNDWI,Drainage_Density,Lithology,Geomorphology,geometry
0,0.0,0.12973,Agricultural_Field,0.541704,0.361115,0.0,0.0,0.0,0.587173,3e-06,0.50675,0.094584,0.332813,0.12,1,0.578332,0.664905,S_Si_Cl,Active_Flood_Plain,POINT (88.23513 25.05760)
1,0.061224,0.268667,Agricultural_Field,0.181033,0.798696,0.714286,0.0,0.163975,0.512312,0.0,0.499448,0.021498,0.683954,0.253333,1,0.430116,0.267209,S_Si_Cl,Active_Flood_Plain,POINT (87.84919 25.19956)
2,0.183673,0.407958,Agricultural_Field,0.28866,0.795741,0.690476,0.000213,0.307235,0.425566,2e-06,0.484023,0.260136,0.787325,0.333333,1,0.28613,0.290536,S_Si_Cl,Active_Flood_Plain,POINT (88.06961 25.31082)
3,0.0,0.122921,Agricultural_Field,0.526838,0.369409,0.0,0.0,0.0,0.519969,2e-06,0.563438,0.027192,0.341595,0.2,1,0.445695,0.656115,S_Si_Cl,Active_Flood_Plain,POINT (88.23191 25.06224)
4,0.428571,0.012231,Agricultural_Field,0.241343,0.864844,0.738095,0.000693,0.517983,0.520828,5e-06,0.598922,0.297545,0.857026,0.48,1,0.338596,0.424167,S_Si_Cl,River,POINT (88.06493 25.25316)


## **Train Test Split**

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
# Separate the predictors from the "Flood" target, then hold out 30% of the
# samples for testing. The fixed random_state keeps the split reproducible.
predictors = gdf.drop(columns="Flood")
target = gdf["Flood"]
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=0
)
X_train.shape, X_test.shape

((1582, 19), (678, 19))

## **Apply One-Hot Encoding (OHE) on the 'Geomorphology', 'Lithology' and 'LULC' Columns**

In [35]:
# One-hot encode the three categorical predictors of the training split
categorical_columns = ["Geomorphology", "Lithology", "LULC"]
X_train_encoded = pd.get_dummies(data=X_train, columns=categorical_columns)
X_train_encoded.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 1582 entries, 189 to 1653
Data columns (total 33 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   Relief_Amplitude                      1582 non-null   float64 
 1   Dist_to_River                         1582 non-null   float64 
 2   TWI                                   1582 non-null   float64 
 3   Rainfall                              1582 non-null   float64 
 4   Clay_Content                          1582 non-null   float64 
 5   STI                                   1582 non-null   float64 
 6   TRI                                   1582 non-null   float64 
 7   TPI                                   1582 non-null   float64 
 8   SPI                                   1582 non-null   float64 
 9   NDVI                                  1582 non-null   float64 
 10  Slope                                 1582 non-null   float64 

In [36]:
# Apply One Hot Encoding on the testing data
X_test_encoded = pd.get_dummies(X_test, columns=["Geomorphology", "Lithology", "LULC"])

# Encoding each split on its own only creates dummy columns for the categories
# that occur in that split. Align the test frame to the training columns so
# both always share the same schema: a category missing from the test split
# becomes an all-zero column, and a category unseen in training is dropped.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
X_test_encoded.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 678 entries, 536 to 54
Data columns (total 33 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   Relief_Amplitude                      678 non-null    float64 
 1   Dist_to_River                         678 non-null    float64 
 2   TWI                                   678 non-null    float64 
 3   Rainfall                              678 non-null    float64 
 4   Clay_Content                          678 non-null    float64 
 5   STI                                   678 non-null    float64 
 6   TRI                                   678 non-null    float64 
 7   TPI                                   678 non-null    float64 
 8   SPI                                   678 non-null    float64 
 9   NDVI                                  678 non-null    float64 
 10  Slope                                 678 non-null    float64 
 1

## **Reset the Index of the Dataframe and Series**

In [37]:
# Give every split a fresh 0..n-1 index (discarding the shuffled original
# labels) so the predictors and targets line up row by row.
for split in (X_train_encoded, X_test_encoded, y_train, y_test):
    split.reset_index(drop=True, inplace=True)

## **Finalize the Training and Testing Data**

In [38]:
# Attach the target to the encoded training predictors.
# Note: training_df is another name for X_train_encoded (no copy is made),
# so the "Flood" column is added to that same object.
training_df = X_train_encoded
training_df["Flood"] = y_train

In [39]:
# Check the columns and dtypes of the assembled training frame
training_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1582 entries, 0 to 1581
Data columns (total 34 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   Relief_Amplitude                      1582 non-null   float64 
 1   Dist_to_River                         1582 non-null   float64 
 2   TWI                                   1582 non-null   float64 
 3   Rainfall                              1582 non-null   float64 
 4   Clay_Content                          1582 non-null   float64 
 5   STI                                   1582 non-null   float64 
 6   TRI                                   1582 non-null   float64 
 7   TPI                                   1582 non-null   float64 
 8   SPI                                   1582 non-null   float64 
 9   NDVI                                  1582 non-null   float64 
 10  Slope                                 1582 non-null   float64 
 

In [40]:
# Attach the target to the encoded testing predictors.
# Note: testing_df is another name for X_test_encoded (no copy is made),
# so the "Flood" column is added to that same object.
testing_df = X_test_encoded
testing_df["Flood"] = y_test

In [41]:
# Check the columns and dtypes of the assembled testing frame
testing_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 678 entries, 0 to 677
Data columns (total 34 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   Relief_Amplitude                      678 non-null    float64 
 1   Dist_to_River                         678 non-null    float64 
 2   TWI                                   678 non-null    float64 
 3   Rainfall                              678 non-null    float64 
 4   Clay_Content                          678 non-null    float64 
 5   STI                                   678 non-null    float64 
 6   TRI                                   678 non-null    float64 
 7   TPI                                   678 non-null    float64 
 8   SPI                                   678 non-null    float64 
 9   NDVI                                  678 non-null    float64 
 10  Slope                                 678 non-null    float64 
 11

In [42]:
# Preview the first rows of the training data
training_df.head()

Unnamed: 0,Relief_Amplitude,Dist_to_River,TWI,Rainfall,Clay_Content,STI,TRI,TPI,SPI,NDVI,...,Lithology_Fe_Ox_S_Si_Cl,Lithology_S_Si_Cl,Lithology_S_Si_Cl_wi_Cal_Co,Lithology_S_Si_Gr,LULC_Agricultural_Field,LULC_Bare_Ground,LULC_Built_UP_Area,LULC_Natural_Vegetation,LULC_Waterbodies,Flood
0,0.142857,0.518128,0.165104,0.587614,0.785714,0.0,0.280636,0.580849,0.0,0.700116,...,0,1,0,0,1,0,0,0,0,1
1,0.0,0.128572,0.571387,0.344053,0.690476,0.0,0.0,0.501084,5.678198e-06,0.437787,...,0,1,0,0,1,0,0,0,0,1
2,0.061224,0.164325,0.149176,0.698571,0.761905,0.0,0.177382,0.533338,0.0,0.601291,...,0,1,0,0,1,0,0,0,0,1
3,0.102041,0.433816,0.446274,0.324463,0.714286,0.000881,0.263824,0.460808,3.046819e-05,0.616554,...,0,0,0,0,1,0,0,0,0,1
4,0.122449,0.628029,0.320437,0.359155,0.738095,7e-05,0.32795,0.456793,7.681468e-07,0.5729,...,0,0,0,0,1,0,0,0,0,0


In [43]:
# Preview the first rows of the testing data
testing_df.head()

Unnamed: 0,Relief_Amplitude,Dist_to_River,TWI,Rainfall,Clay_Content,STI,TRI,TPI,SPI,NDVI,...,Lithology_Fe_Ox_S_Si_Cl,Lithology_S_Si_Cl,Lithology_S_Si_Cl_wi_Cal_Co,Lithology_S_Si_Gr,LULC_Agricultural_Field,LULC_Bare_Ground,LULC_Built_UP_Area,LULC_Natural_Vegetation,LULC_Waterbodies,Flood
0,0.163265,0.017297,0.520829,0.088173,0.666667,0.00218,0.270604,0.382802,0.000144,0.713929,...,0,1,0,0,1,0,0,0,0,1
1,0.040816,0.008649,0.570254,0.87188,0.761905,0.000146,0.138092,0.494965,1.2e-05,0.39551,...,0,1,0,0,1,0,0,0,0,0
2,0.102041,0.079737,0.170232,0.475129,0.761905,0.0,0.237181,0.495553,0.0,0.373748,...,0,0,0,1,0,0,1,0,0,0
3,0.020408,0.017297,0.567649,0.370257,0.690476,0.000337,0.094671,0.536425,2.9e-05,0.735108,...,0,1,0,0,1,0,0,0,0,1
4,0.0,0.453789,0.50604,0.605048,0.785714,0.0,0.0,0.384819,1e-06,0.459768,...,0,0,0,1,0,0,0,1,0,0


## **Export the Training and Testing Data**

In [45]:
# Output locations, built from a shared datasets directory.
# Each folder string ends with a backslash so a file name can be appended directly.
datasets_dir = "D:\\Research Works\\Flood\\Flood_Risk_Zonation_of_Maldah\\Datasets\\"
output_folder_csv = datasets_dir + "CSVs\\"
output_folder_shp = datasets_dir + "Shapefiles\\"

In [48]:
# Export as CSV files.
# The writes are currently disabled; uncomment the two lines below to
# (re)create the CSV files in output_folder_csv.
# training_df.to_csv(output_folder_csv+"Training_Data.csv")
# testing_df.to_csv(output_folder_csv+"Testing_Data.csv")

In [49]:
# Export as SHP files
# Wrap each frame in a GeoDataFrame, using its own "geometry" column as the geometry.
training_gdf = gpd.GeoDataFrame(training_df, geometry=training_df.geometry)
testing_gdf = gpd.GeoDataFrame(testing_df, geometry=testing_df.geometry)

# The writes are currently disabled; uncomment the two lines below to export.
# NOTE(review): the source shapefile's field names look truncated to 10 characters,
# so the long one-hot column names (e.g. "Geomorphology_...") would presumably be
# shortened on write and may collide — verify before enabling.
# training_gdf.to_file(output_folder_shp+"Training_Data.shp", driver="ESRI Shapefile")
# testing_gdf.to_file(output_folder_shp+"Testing_Data.shp", driver="ESRI Shapefile")