In [12]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [181]:
df = pd.read_csv('./df.csv')

In [183]:
# Spatial join district metadata with original df

# Load the county shapefile
county_shapefile_path = "./Data/coordinates/VILLAGE_NLSC_1130807.shp"
counties = gpd.read_file(county_shapefile_path) 

# Convert flood data into a GeoDataFrame
df['geometry'] = df.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
flood_gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")

# Ensure CRS of counties matches flood data
counties = counties.to_crs(flood_gdf.crs)

# Perform spatial join to get county names
flood_with_county = gpd.sjoin(flood_gdf, counties, how="left", predicate="within")

# Extract district name and append back to original df
df['county'] = flood_with_county['COUNTYNAME']
df['town'] = flood_with_county['TOWNNAME']
df['vil'] = flood_with_county['VILLNAME']
df['full_address'] = flood_with_county['Name']

# Obtain the area of each district (in square meters)
counties = counties.to_crs(epsg=3857)
counties["area_sqm"] = counties.geometry.area

# Merge district areas with flood incidents data
df = df.merge(counties.rename(columns={'Name': 'full_address'})[['full_address', 'area_sqm']], on='full_address', how='left')

In [186]:
# Time gap threshold and depth threshold
TIME_GAP_THRESHOLD = pd.Timedelta(hours=2)
DEPTH_THRESHOLD = 10
DEPTH_OUTLIER = 600  

# Ensure timestamp format
df['timestamp'] = df['timestamp'].str.split('.').str[0] 
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# Sort data by location and timestamp
df = df.sort_values(by=['full_address', 'timestamp'])

# Identify Flood Incidents (Group by full_address and timestamp while considering station_id)
df['time_diff'] = df.groupby(['full_address'])['timestamp'].diff()
df['new_incident'] = (df['time_diff'] > TIME_GAP_THRESHOLD).fillna(False)

# Assign an incident group at the district level, ignoring different station IDs
df['incident_group'] = df.groupby(['full_address'])['new_incident'].cumsum()

# Assign a unique ID based on full_address and start time of the incident
incident_keys = df.groupby(['full_address', 'incident_group'])['timestamp'].transform('min').astype(str) + '_' + df['full_address']
df['incident_id'] = pd.factorize(incident_keys)[0] + 1  # Assign unique numeric ID

# Aggregate incidents, merging across stations in the same district
flood_incidents = df.groupby(['full_address', 'incident_id']).agg(
    start_time=('timestamp', 'min'),
    end_time=('timestamp', 'max'),
    min_flood_depth=('value', 'min'),
    max_flood_depth=('value', 'max'),
    avg_flood_depth=('value', 'mean'),
    area_sqm=('area_sqm', 'first'),
    county=('county', 'first'),
    town=('town', 'first'),
    vil=('vil', 'first'),
).reset_index()

# Filter incidents with outlier flood depth and exceeding the threshold
flood_incidents = flood_incidents[
    (flood_incidents['avg_flood_depth'] < DEPTH_OUTLIER) &
    (flood_incidents['min_flood_depth'] < DEPTH_OUTLIER) &
    (flood_incidents['max_flood_depth'] < DEPTH_OUTLIER) &
    (flood_incidents['avg_flood_depth'] > DEPTH_THRESHOLD)
]

# Sort incidents by district
flood_incidents = flood_incidents.sort_values(by=['full_address'])

In [187]:
flood_incidents

Unnamed: 0,full_address,incident_id,start_time,end_time,min_flood_depth,max_flood_depth,avg_flood_depth,area_sqm,county,town,vil
14,嘉義市東區太平里,15,2021-06-01 02:10:00,2021-06-01 02:30:00,5.2,15.0,10.966667,7.223208e+05,嘉義市,東區,太平里
15,嘉義市東區太平里,16,2021-06-12 16:50:00,2021-06-12 17:10:00,6.5,24.9,13.400000,7.223208e+05,嘉義市,東區,太平里
23,嘉義市東區太平里,24,2022-08-04 16:05:00,2022-08-04 17:00:00,6.4,30.7,23.366667,7.223208e+05,嘉義市,東區,太平里
38,嘉義市東區安寮里,39,2022-08-04 16:02:00,2022-08-04 17:30:00,6.9,37.8,31.425000,3.269794e+05,嘉義市,東區,安寮里
36,嘉義市東區安寮里,37,2022-06-29 13:44:00,2022-06-29 14:24:00,5.0,33.5,24.887500,3.269794e+05,嘉義市,東區,安寮里
...,...,...,...,...,...,...,...,...,...,...,...
14029,高雄市鳳山區忠義里,14030,2021-09-05 18:49:00,2021-09-05 18:59:00,5.0,16.0,10.500000,3.031024e+05,高雄市,鳳山區,忠義里
14052,高雄市鼓山區厚生里,14053,2021-03-22 15:53:00,2021-03-22 15:53:00,22.0,22.0,22.000000,2.491359e+05,高雄市,鼓山區,厚生里
14053,高雄市鼓山區厚生里,14054,2021-03-25 13:21:00,2021-03-25 15:01:00,180.0,191.0,185.272727,2.491359e+05,高雄市,鼓山區,厚生里
14056,高雄市鼓山區綠川里,14057,2022-11-10 13:08:00,2022-11-10 13:08:00,14.5,14.5,14.500000,1.109590e+05,高雄市,鼓山區,綠川里


In [188]:
# Export as csv
flood_incidents.to_csv("flood_incidents.csv", index=False)