In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [4]:
# Load the raw flood readings table that the rest of the notebook works on.
flood_csv_path = './df.csv'
df = pd.read_csv(flood_csv_path)

In [5]:
# Spatial join district metadata with original df

# Load the boundary shapefile. Despite the variable name, its rows carry
# COUNTYNAME, TOWNNAME and VILLNAME, i.e. it is finer-grained than counties
# (presumably one polygon per village -- verify against the data source).
counties = gpd.read_file("./Data/coordinates/VILLAGE_NLSC_1130807.shp")

# Reproject the polygons to EPSG:3824 so they share a CRS with the points below.
# NOTE(review): to_crs() transforms coordinates and requires the shapefile to
# already declare a CRS; it does not merely "set" one.
counties = counties.to_crs(3824)

# Aggregate the POLYGON objects to the town level before using them further.
# dissolve() keeps only the FIRST row's value for every non-key column, so
# towns['VILLNAME'] is just the first village of each town and must not be
# used to label individual points.
towns = counties.dissolve(by=["COUNTYNAME", "TOWNNAME"], as_index=False)

# Create 'district' column in towns (county name followed by town name)
towns['district'] = towns['COUNTYNAME'].astype(str).str.cat(towns['TOWNNAME'].astype(str))

# Convert flood data into a GeoDataFrame of points declared in the same CRS
df['geometry'] = gpd.points_from_xy(df['Longitude'], df['Latitude'])
df_geo = gpd.GeoDataFrame(df, geometry='geometry', crs=3824)

# Locate every point in the undissolved (village-level) polygons so that the
# village name is the one that actually contains the point. County and town
# names come from the same matched row.
flood_with_location = gpd.sjoin(
    df_geo,
    counties[['COUNTYNAME', 'TOWNNAME', 'VILLNAME', 'geometry']],
    how="inner",
    predicate="within",
)

# If polygons overlap, a point can match more than once and its index label is
# repeated; the index-aligned assignments below would then raise. Keep the
# first match per point.
flood_with_location = flood_with_location[
    ~flood_with_location.index.duplicated(keep='first')
]

# Extract location names and append back to original df (aligned on index;
# points that matched no polygon get NaN here)
df['county'] = flood_with_location['COUNTYNAME']
df['town'] = flood_with_location['TOWNNAME']
df['vil'] = flood_with_location['VILLNAME']
df['district'] = df['county'].astype(str).str.cat(df['town'].astype(str))

# Merge town-level geometries into df. The inner merge also drops rows whose
# point matched no polygon, because their 'district' string matches no town.
df = df.drop(columns=['geometry'])
df = df.merge(towns[['district', 'geometry']], on='district', how='inner')

In [6]:
# Preview the first five rows after the spatial join
df.head(n=5)

Unnamed: 0,station_id,timestamp,value,Longitude,Latitude,SIUnit,county,town,vil,district,geometry
0,d83ac636-3d28-43fe-96a9-5c33dde8aebe,2022-07-21 00:08:57.2,0.001,120.691,23.9032,cm,南投縣,南投市,軍功里,南投縣南投市,"POLYGON ((120.70056 23.887, 120.70054 23.88697..."
1,d83ac636-3d28-43fe-96a9-5c33dde8aebe,2022-07-21 00:18:57.382,0.001,120.691,23.9032,cm,南投縣,南投市,軍功里,南投縣南投市,"POLYGON ((120.70056 23.887, 120.70054 23.88697..."
2,d83ac636-3d28-43fe-96a9-5c33dde8aebe,2022-07-21 00:28:58.358,0.001,120.691,23.9032,cm,南投縣,南投市,軍功里,南投縣南投市,"POLYGON ((120.70056 23.887, 120.70054 23.88697..."
3,d83ac636-3d28-43fe-96a9-5c33dde8aebe,2022-07-21 00:38:58.793,0.001,120.691,23.9032,cm,南投縣,南投市,軍功里,南投縣南投市,"POLYGON ((120.70056 23.887, 120.70054 23.88697..."
4,d83ac636-3d28-43fe-96a9-5c33dde8aebe,2022-07-21 00:48:59.713,0.001,120.691,23.9032,cm,南投縣,南投市,軍功里,南投縣南投市,"POLYGON ((120.70056 23.887, 120.70054 23.88697..."


In [None]:
# Attach the district-level area table; the inner merge keeps only rows whose
# 'district' appears in the workbook.

area = pd.read_excel('./Data/area.xlsx')
df = pd.merge(df, area, how='inner', on='district')

In [16]:
# Scaling factor: number of distinct village names per (county, town) pair

village_counts = (
    counties
    .groupby(["COUNTYNAME", "TOWNNAME"], as_index=False)["VILLNAME"]
    .nunique()
    .rename(columns={"VILLNAME": "factor"})
    # Same county+town concatenation that is used as the 'district' key in df
    .assign(district=lambda vc: vc["COUNTYNAME"] + vc["TOWNNAME"])
)

# Left merge so rows in df are kept even when no factor is available
factor_by_district = village_counts[["district", "factor"]]
df = pd.merge(df, factor_by_district, how="left", on="district")

In [17]:
# Identify flood events

# Time gap threshold and depth thresholds (depths are compared against 'value';
# the SIUnit column in the preview shows cm -- confirm it is uniform)
TIME_GAP_THRESHOLD = pd.Timedelta(hours=24)  # a longer gap within a district starts a new incident
DEPTH_THRESHOLD = 50                         # keep incidents whose average depth exceeds this
DEPTH_OUTLIER = 300                          # discard incidents whose maximum depth reaches this

# Convert to timestamp format, dropping the fractional-second suffix first.
# The dtype guard keeps the cell re-runnable: once the column is datetime the
# .str accessor would raise.
if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
    df['timestamp'] = pd.to_datetime(
        df['timestamp'].str.split('.').str[0],
        errors='coerce',
    )

# errors='coerce' turns unparseable timestamps into NaT. Such rows sort last
# within their district and would otherwise be absorbed into that district's
# final incident, skewing its min/max/mean depth, so drop them here.
df = df.dropna(subset=['timestamp'])

# Sort data by location and timestamp
df = df.sort_values(by=['district', 'timestamp'])

# Identify flood incidents: within each district (all stations pooled), a gap
# longer than TIME_GAP_THRESHOLD since the previous reading opens a new one.
# The first reading of a district has a NaT gap, which compares as False.
df['time_diff'] = df.groupby('district')['timestamp'].diff()
df['new_incident'] = df['time_diff'] > TIME_GAP_THRESHOLD

# Assign an incident group at the district level, ignoring different station IDs
df['incident_group'] = df.groupby('district')['new_incident'].cumsum()

# Assign a unique ID based on district and start time of the incident
incident_start = df.groupby(['district', 'incident_group'])['timestamp'].transform('min')
incident_keys = incident_start.astype(str) + '_' + df['district']
df['incident_id'] = pd.factorize(incident_keys)[0] + 1  # Assign unique numeric ID (1-based)

# Aggregate incidents, merging across stations in the same district.
# Descriptive columns take the first value seen in the incident.
flood_incidents = df.groupby(['district', 'incident_id']).agg(
    start_time=('timestamp', 'min'),
    end_time=('timestamp', 'max'),
    min_flood_depth=('value', 'min'),
    max_flood_depth=('value', 'max'),
    avg_flood_depth=('value', 'mean'),
    area=('area', 'first'),
    county=('county', 'first'),
    town=('town', 'first'),
    vil=('vil', 'first'),
    geometry=('geometry', 'first'),
    factor=('factor', 'first')
).reset_index()

# Keep incidents whose maximum depth is below the outlier cut-off and whose
# average depth exceeds the flood threshold
flood_incidents = flood_incidents[
    (flood_incidents['max_flood_depth'] < DEPTH_OUTLIER) &
    (flood_incidents['avg_flood_depth'] > DEPTH_THRESHOLD)
]

In [18]:
# Report how many values are missing in each column, then drop every incident
# row that has at least one missing value
missing_per_column = flood_incidents.isnull().sum()
print("Number of missing values per column before dropping:")
print(missing_per_column)

flood_incidents = flood_incidents.dropna(axis=0, how="any")

Number of missing values per column before dropping:
district           0
incident_id        0
start_time         0
end_time           0
min_flood_depth    0
max_flood_depth    0
avg_flood_depth    0
area               1
county             1
town               1
vil                1
geometry           1
factor             1
dtype: int64


In [19]:
# Preview the first five retained incidents
flood_incidents.head(n=5)

Unnamed: 0,district,incident_id,start_time,end_time,min_flood_depth,max_flood_depth,avg_flood_depth,area,county,town,vil,geometry,factor
274,嘉義市東區,275,2022-10-01 09:00:00,2022-10-01 10:27:48,294.2,295.1,294.9,30155600.0,嘉義市,東區,仁義里,"POLYGON ((120.45899 23.4542, 120.45889 23.4541...",39.0
468,嘉義縣六腳鄉,469,2020-03-21 20:09:29,2020-03-21 20:09:29,77.3,77.3,77.3,62261900.0,嘉義縣,六腳鄉,古林村,"POLYGON ((120.28176 23.49113, 120.28066 23.491...",25.0
470,嘉義縣六腳鄉,471,2020-04-13 21:26:57,2020-04-13 21:26:57,75.7,75.7,75.7,62261900.0,嘉義縣,六腳鄉,古林村,"POLYGON ((120.28176 23.49113, 120.28066 23.491...",25.0
473,嘉義縣六腳鄉,474,2020-05-06 22:38:20,2020-05-06 22:38:20,78.0,78.0,78.0,62261900.0,嘉義縣,六腳鄉,古林村,"POLYGON ((120.28176 23.49113, 120.28066 23.491...",25.0
534,嘉義縣六腳鄉,535,2023-08-30 07:40:03,2023-08-30 12:01:43,7.9,287.4,231.4,62261900.0,嘉義縣,六腳鄉,古林村,"POLYGON ((120.28176 23.49113, 120.28066 23.491...",25.0


In [20]:
# Write the incident table to disk as CSV, without the DataFrame index
incidents_csv_path = "flood_incidents.csv"
flood_incidents.to_csv(incidents_csv_path, index=False)