In [12]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [148]:
# Load the flood records table. Later cells read its 'Longitude', 'Latitude',
# 'timestamp' and 'value' columns.
flood_csv_path = './df.csv'
df = pd.read_csv(flood_csv_path)

In [149]:
# Spatial join district metadata with original df
#
# Reads : df (must provide 'Longitude' and 'Latitude' columns).
# Writes: df['geometry'], df['county'], df['town'], df['vil'],
#         df['full_address'], df['area_sqm'].
# Also leaves `counties`, `flood_gdf` and `flood_with_county` in the namespace.
# Every result is written by plain column assignment (no merge), so the cell
# can be re-run without duplicating rows or producing suffixed columns.

# Load the district boundary shapefile
county_shapefile_path = "./Data/coordinates/VILLAGE_NLSC_1130807.shp"
counties = gpd.read_file(county_shapefile_path)

# Obtain the area of each district (in square meters).
# The area must be measured in a projected CRS whose unit is the metre; measuring
# it after reprojecting to EPSG:4326 would yield square degrees instead.
# EPSG:3826 is TWD97 / TM2 zone 121.
# NOTE(review): assumes the boundaries lie on or near Taiwan's main island --
# confirm whether outlying islands need a different zone.
AREA_CRS = "EPSG:3826"
counties["area_sqm"] = counties.to_crs(AREA_CRS).geometry.area

# Convert flood data into a GeoDataFrame (vectorised point construction)
df['geometry'] = gpd.points_from_xy(df['Longitude'], df['Latitude'])
flood_gdf = gpd.GeoDataFrame(
    # Drop any 'area_sqm' left over from a previous run so the join below does
    # not rename the columns to 'area_sqm_left' / 'area_sqm_right'.
    df.drop(columns='area_sqm', errors='ignore'),
    geometry='geometry',
    crs="EPSG:4326",
)

# Ensure CRS of counties matches flood data
counties = counties.to_crs(flood_gdf.crs)

# Perform spatial join to get district names (and the polygon area) per record
flood_with_county = gpd.sjoin(flood_gdf, counties, how="left", predicate="within")

# A point that falls inside more than one polygon is returned once per match,
# which repeats its index label. Keep the first match so the result aligns
# one-to-one with df (assumes df's own index is unique, as read_csv produces).
flood_with_county = flood_with_county[~flood_with_county.index.duplicated(keep='first')]

# Extract district attributes and append them back to the original df.
# Records that matched no polygon get NaN in all of these columns.
df['county'] = flood_with_county['COUNTYNAME']
df['town'] = flood_with_county['TOWNNAME']
df['vil'] = flood_with_county['VILLNAME']
df['full_address'] = flood_with_county['Name']
df['area_sqm'] = flood_with_county['area_sqm']

In [155]:
# Group flood records into incidents per district and summarise each incident.
#
# Reads : df with 'timestamp', 'full_address', 'value' and 'area_sqm' columns.
# Writes: df['timestamp'] (as datetime), df['time_diff'], df['new_incident'],
#         df['incident_group'], df['incident_id']; df is re-sorted.
# Result: flood_incidents -- one row per (full_address, incident_id) that
#         passes the depth filters below.

# Time gap threshold and depth threshold
TIME_GAP_THRESHOLD = pd.Timedelta(hours=2)  # a longer gap between readings starts a new incident
DEPTH_THRESHOLD = 50    # incidents are kept only if their average depth exceeds this
DEPTH_OUTLIER = 500     # incidents with min/avg/max depth at or above this are dropped
# NOTE(review): both depth limits are in the same unit as the 'value' column --
# the unit is not visible here, confirm against the data source.

# Ensure timestamp format.
# On the first run the column holds strings: strip everything after the first '.'
# (fractional seconds) and parse. On a re-run the column is already datetime64
# and has no .str accessor, so only truncate to whole seconds; this keeps the
# cell re-runnable.
if pd.api.types.is_datetime64_any_dtype(df['timestamp']):
    df['timestamp'] = df['timestamp'].dt.floor('s')
else:
    df['timestamp'] = pd.to_datetime(df['timestamp'].str.split('.').str[0])

# Sort data by location and timestamp
df = df.sort_values(by=['full_address', 'timestamp'])

# Identify flood incidents: within each district, measure the gap to the
# previous reading. All readings in a district are pooled; the grouping key is
# 'full_address' only.
df['time_diff'] = df.groupby(['full_address'])['timestamp'].diff()
df['new_incident'] = (df['time_diff'] > TIME_GAP_THRESHOLD).fillna(False)

# Assign an incident group at the district level: a running count of
# "new incident" flags, so the counter restarts for every district
df['incident_group'] = df.groupby(['full_address'])['new_incident'].cumsum()

# Assign a unique ID based on full_address and start time of the incident.
# Rows without a full_address yield a missing key and therefore get ID 0.
incident_keys = df.groupby(['full_address', 'incident_group'])['timestamp'].transform('min').astype(str) + '_' + df['full_address']
df['incident_id'] = pd.factorize(incident_keys)[0] + 1  # Assign unique numeric ID

# Aggregate incidents per district.
# groupby drops rows whose full_address is missing, so records that matched no
# district do not appear in the summary.
flood_incidents = df.groupby(['full_address', 'incident_id']).agg(
    start_time=('timestamp', 'min'),
    end_time=('timestamp', 'max'),
    min_flood_depth=('value', 'min'),
    max_flood_depth=('value', 'max'),
    avg_flood_depth=('value', 'mean'),
    area_sqm=('area_sqm', 'first')
).reset_index()

# Keep incidents whose depths are all below the outlier limit and whose
# average depth exceeds the reporting threshold
flood_incidents = flood_incidents[
    (flood_incidents['avg_flood_depth'] < DEPTH_OUTLIER) &
    (flood_incidents['min_flood_depth'] < DEPTH_OUTLIER) &
    (flood_incidents['max_flood_depth'] < DEPTH_OUTLIER) &
    (flood_incidents['avg_flood_depth'] > DEPTH_THRESHOLD)
]

# Sort incidents by district
flood_incidents = flood_incidents.sort_values(by=['full_address'])

In [156]:
# Show the filtered incident table as this cell's output
flood_incidents

Unnamed: 0,full_address,incident_id,start_time,end_time,min_flood_depth,max_flood_depth,avg_flood_depth,area_sqm
66,嘉義市東區安業里,67,2022-10-01 09:00:00,2022-10-01 10:27:48,294.2,295.1,294.900000,5.474204e+05
75,嘉義市東區林森里,76,2021-02-26 17:07:49,2021-02-26 17:32:49,311.9,312.0,311.916667,3.629155e+05
76,嘉義市東區林森里,77,2021-03-02 11:17:50,2021-03-03 14:46:54,312.5,313.1,312.848338,3.629155e+05
114,嘉義市東區頂寮里,115,2021-02-25 16:27:42,2021-02-25 16:27:42,313.2,313.2,313.200000,3.076773e+05
115,嘉義市東區頂寮里,116,2021-02-26 17:07:42,2021-02-26 17:32:42,313.0,313.0,313.000000,3.076773e+05
...,...,...,...,...,...,...,...,...
23335,高雄市美濃區泰安里,23336,2020-09-09 02:55:02,2020-09-09 13:21:51,143.0,156.0,147.344828,6.934116e+05
23408,高雄市路竹區頂寮里,23409,2021-03-12 14:25:02,2021-03-12 17:56:04,9.0,199.0,94.222222,2.712282e+06
23409,高雄市路竹區頂寮里,23410,2021-05-05 16:06:31,2021-05-05 16:18:04,198.0,200.0,199.000000,2.712282e+06
23442,高雄市鳥松區鳥松里,23443,2021-01-28 00:06:05,2021-01-28 10:46:05,68.0,96.0,81.953846,9.333304e+06


In [158]:
# Write the filtered incident table to a CSV file, leaving out the row index
incidents_csv_path = "flood_incidents.csv"
flood_incidents.to_csv(incidents_csv_path, index=False)