In [12]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [3]:
# Read the input table that every later cell operates on
df_csv_path = './df.csv'
df = pd.read_csv(df_csv_path)

In [22]:
# Spatial join district data with original df
#
# Each row of `df` is turned into a point from its Longitude/Latitude columns and
# matched against the polygons of the shapefile; the matching polygon's name
# attributes are then copied back onto `df`.

# Convert flood data into a GeoDataFrame.
# This has to happen BEFORE the shapefile is reprojected, because the reprojection
# below reads flood_gdf.crs (the original order raised NameError on a fresh kernel).
# points_from_xy builds all points in one vectorized call instead of a row-wise apply.
df['geometry'] = gpd.points_from_xy(df['Longitude'], df['Latitude'])
flood_gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")

# Load the county shapefile
county_shapefile_path = "./Data/coordinates/VILLAGE_NLSC_1130807.shp"
counties = gpd.read_file(county_shapefile_path)

# Ensure CRS of counties matches flood data
counties = counties.to_crs(flood_gdf.crs)

# Perform spatial join to get county names
flood_with_county = gpd.sjoin(flood_gdf, counties, how="left", predicate="within")

# A point that falls inside more than one polygon comes back as several rows that
# share one index label; such a result cannot be aligned onto df. Keep the first
# match per original row so the assignments below are always well defined.
flood_with_county = flood_with_county[~flood_with_county.index.duplicated(keep='first')]

# Extract district name and append back to original df
# (assignment aligns on the index; rows with no containing polygon get NaN)
df['county'] = flood_with_county['COUNTYNAME']
df['town'] = flood_with_county['TOWNNAME']
df['vil'] = flood_with_county['VILLNAME']
df['full_address'] = flood_with_county['Name']

In [None]:
# Step 1: Ensure timestamp is in datetime format
# Parse only when the column is not datetime yet: the .str accessor raises on a
# datetime column, so without this guard the cell fails when it is run a second time.
if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
    # Keep only the text before the first '.' (presumably drops fractional seconds
    # — verify against the raw data), then parse.
    df['timestamp'] = pd.to_datetime(df['timestamp'].str.split('.').str[0])

In [42]:
# Display the working frame for inspection.
# NOTE(review): the saved output of this cell lists time_diff / new_incident /
# incident_id, which are only created by the incident-detection cell further down,
# so the output was presumably captured after an out-of-order run — confirm with
# Restart Kernel -> Run All.
df

Unnamed: 0,station_id,timestamp,value,station_name,Longitude,Latitude,SIUnit,geometry,county,town,vil,full_address,time_diff,new_incident,incident_id
529759,43e92759-bbdd-401e-b8fe-d45a66ada9b9,2021-03-23 15:58:55,9.6,東區北門里長榮街與成仁街口,120.451004,23.483536,cm,POINT (120.451004 23.483536),嘉義市,東區,北門里,嘉義市東區北門里,NaT,False,0
529760,43e92759-bbdd-401e-b8fe-d45a66ada9b9,2021-03-23 16:00:00,9.6,東區北門里長榮街與成仁街口,120.451004,23.483536,cm,POINT (120.451004 23.483536),嘉義市,東區,北門里,嘉義市東區北門里,0 days 00:01:05,False,0
555090,43e92759-bbdd-401e-b8fe-d45a66ada9b9,2021-05-20 15:56:31,7.0,東區北門里長榮街與成仁街口,120.451004,23.483536,cm,POINT (120.451004 23.483536),嘉義市,東區,北門里,嘉義市東區北門里,57 days 23:56:31,True,1
412080,43e92759-bbdd-401e-b8fe-d45a66ada9b9,2021-07-24 14:51:13,6.5,東區北門里長榮街與成仁街口,120.451004,23.483536,cm,POINT (120.451004 23.483536),嘉義市,東區,北門里,嘉義市東區北門里,64 days 22:54:42,True,1
24130,43e92759-bbdd-401e-b8fe-d45a66ada9b9,2022-06-17 11:04:30,7.6,東區北門里長榮街與成仁街口,120.451004,23.483536,cm,POINT (120.451004 23.483536),嘉義市,東區,北門里,嘉義市東區北門里,327 days 20:13:17,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32219,404195f6-2b8e-48da-9afa-bc98985ee157,2022-12-17 12:00:00,3.3,龍水里馬卡道路,120.281530,22.646430,cm,POINT (120.28153 22.64643),高雄市,鼓山區,龍水里,高雄市鼓山區龍水里,0 days 00:08:00,False,0
32220,404195f6-2b8e-48da-9afa-bc98985ee157,2022-12-17 12:10:00,3.3,龍水里馬卡道路,120.281530,22.646430,cm,POINT (120.28153 22.64643),高雄市,鼓山區,龍水里,高雄市鼓山區龍水里,0 days 00:10:00,False,0
32221,404195f6-2b8e-48da-9afa-bc98985ee157,2022-12-17 12:20:00,3.0,龍水里馬卡道路,120.281530,22.646430,cm,POINT (120.28153 22.64643),高雄市,鼓山區,龍水里,高雄市鼓山區龍水里,0 days 00:10:00,False,0
32222,404195f6-2b8e-48da-9afa-bc98985ee157,2022-12-17 23:07:00,3.0,龍水里馬卡道路,120.281530,22.646430,cm,POINT (120.28153 22.64643),高雄市,鼓山區,龍水里,高雄市鼓山區龍水里,0 days 10:47:00,True,1


In [138]:
# Parameters for incident detection and filtering
TIME_GAP_THRESHOLD = pd.Timedelta(hours=2)  # a longer gap between readings opens a new incident
DEPTH_THRESHOLD = 50  # incidents are kept only when their mean depth exceeds this
DEPTH_OUTLIER = 500  # Define this based on your data

# Put readings in chronological order inside each district so diff() compares neighbours
df = df.sort_values(by=['full_address', 'timestamp'])

# Gap to the previous reading of the same district (station_id is not part of the grouping)
gap_to_previous = df.groupby(['full_address'])['timestamp'].diff()
df['time_diff'] = gap_to_previous
df['new_incident'] = gap_to_previous.gt(TIME_GAP_THRESHOLD).fillna(False)

# Running count of incident starts per district = incident number within that district
df['incident_group'] = df.groupby(['full_address'])['new_incident'].cumsum()

# Globally unique numeric ID derived from "<incident start time>_<full_address>"
incident_start = df.groupby(['full_address', 'incident_group'])['timestamp'].transform('min')
incident_keys = incident_start.astype(str) + '_' + df['full_address']
incident_codes = pd.factorize(incident_keys)[0]
df['incident_id'] = incident_codes + 1

# One summary row per (district, incident), pooling all stations of the district
flood_incidents = (
    df.groupby(['full_address', 'incident_id'])
    .agg(
        start_time=pd.NamedAgg(column='timestamp', aggfunc='min'),
        end_time=pd.NamedAgg(column='timestamp', aggfunc='max'),
        min_flood_depth=pd.NamedAgg(column='value', aggfunc='min'),
        max_flood_depth=pd.NamedAgg(column='value', aggfunc='max'),
        avg_flood_depth=pd.NamedAgg(column='value', aggfunc='mean'),
    )
    .reset_index()
)

# Keep incidents whose mean/min/max depth are all below the outlier cut-off
# and whose mean depth is above the reporting threshold
depth_columns = ['avg_flood_depth', 'min_flood_depth', 'max_flood_depth']
below_outlier = flood_incidents[depth_columns].lt(DEPTH_OUTLIER).all(axis=1)
above_threshold = flood_incidents['avg_flood_depth'].gt(DEPTH_THRESHOLD)
flood_incidents = flood_incidents[below_outlier & above_threshold]

# Order the result by district
flood_incidents = flood_incidents.sort_values('full_address')

In [139]:
# Display the filtered incident table (one row per full_address / incident_id pair).
flood_incidents

Unnamed: 0,full_address,incident_id,start_time,end_time,min_flood_depth,max_flood_depth,avg_flood_depth
66,嘉義市東區安業里,67,2022-10-01 09:00:00,2022-10-01 10:27:48,294.2,295.1,294.900000
75,嘉義市東區林森里,76,2021-02-26 17:07:49,2021-02-26 17:32:49,311.9,312.0,311.916667
76,嘉義市東區林森里,77,2021-03-02 11:17:50,2021-03-03 14:46:54,312.5,313.1,312.848338
114,嘉義市東區頂寮里,115,2021-02-25 16:27:42,2021-02-25 16:27:42,313.2,313.2,313.200000
115,嘉義市東區頂寮里,116,2021-02-26 17:07:42,2021-02-26 17:32:42,313.0,313.0,313.000000
...,...,...,...,...,...,...,...
23335,高雄市美濃區泰安里,23336,2020-09-09 02:55:02,2020-09-09 13:21:51,143.0,156.0,147.344828
23408,高雄市路竹區頂寮里,23409,2021-03-12 14:25:02,2021-03-12 17:56:04,9.0,199.0,94.222222
23409,高雄市路竹區頂寮里,23410,2021-05-05 16:06:31,2021-05-05 16:18:04,198.0,200.0,199.000000
23442,高雄市鳥松區鳥松里,23443,2021-01-28 00:06:05,2021-01-28 10:46:05,68.0,96.0,81.953846


In [140]:
# Export as csv
flood_incidents.to_csv("flood_incidents.csv", index=False)