In [7]:
import os
import pandas as pd
import requests
import zipfile
from io import StringIO

In [16]:
def extract_and_process_data(root_folder):
    all_data = []
    
    # Traverse through the nested folder structure
    for year in os.listdir(root_folder):
        year_path = os.path.join(root_folder, year)
        if not os.path.isdir(year_path):
            continue
        
        for month in os.listdir(year_path):
            month_path = os.path.join(year_path, month)
            if not os.path.isdir(month_path):
                continue
            
            first_zip = True
            for file in os.listdir(month_path):
                if file.endswith(".zip"):
                    zip_path = os.path.join(month_path, file)
                    if first_zip:
                        print(f"Processing Year/Month: {month}")
                        first_zip = False
                    
                    with zipfile.ZipFile(zip_path, 'r') as z:
                        for filename in z.namelist():
                            with z.open(filename) as f:
                                df = pd.read_csv(f, delimiter=',')
                                
                                # Filter the required data
                                df_filtered = df[
                                    (df["PQ_unit"] == "cm") & 
                                    (df["PQ_name"] == "淹水深度") & 
                                    (df["value"] > 0)
                                ][["station_id", "timestamp", "value"]]
                                
                                all_data.append(df_filtered)
    
    # Combine all data into a single DataFrame
    final_df = pd.concat(all_data, ignore_index=True)
    return final_df

root_folder = './Data/raw_data'
records = extract_and_process_data(root_folder)

Processing Year/Month: 202207
Processing Year/Month: 202209
Processing Year/Month: 202208
Processing Year/Month: 202201
Processing Year/Month: 202206
Processing Year/Month: 202212
Processing Year/Month: 202203
Processing Year/Month: 202204
Processing Year/Month: 202205
Processing Year/Month: 202202
Processing Year/Month: 202211
Processing Year/Month: 202210
Processing Year/Month: 201907
Processing Year/Month: 201909
Processing Year/Month: 201908
Processing Year/Month: 201906
Processing Year/Month: 201901
Processing Year/Month: 201912
Processing Year/Month: 201904
Processing Year/Month: 201903
Processing Year/Month: 201902
Processing Year/Month: 201905
Processing Year/Month: 201911
Processing Year/Month: 201910
Processing Year/Month: 202112
Processing Year/Month: 202107
Processing Year/Month: 202109
Processing Year/Month: 202108
Processing Year/Month: 202101
Processing Year/Month: 202106
Processing Year/Month: 202111
Processing Year/Month: 202110
Processing Year/Month: 202103
Processing

  final_df = pd.concat(all_data, ignore_index=True)


In [70]:
# Check for duplicates in records
num_duplicates = records.duplicated().sum()
print(f"Number of duplicate rows in sensors: {num_duplicates}")
print("Duplicate rows in sensors:")
records[records.duplicated()]

# Drop duplicates in records
records = records.drop_duplicates()

Number of duplicate rows in sensors: 0
Duplicate rows in sensors:


In [93]:
records

Unnamed: 0,station_id,timestamp,value
0,beace064-358c-42a3-8f2a-31144f714be8,2022-07-02 18:41:24,4.100000
1,beace064-358c-42a3-8f2a-31144f714be8,2022-07-02 18:47:24,4.300000
2,beace064-358c-42a3-8f2a-31144f714be8,2022-07-02 18:53:24,4.100000
3,beace064-358c-42a3-8f2a-31144f714be8,2022-07-02 18:56:24,4.100000
4,9e7e1b42-afba-4875-93b9-5a3b9dc401a8,2022-07-02 18:10:43,4.600000
...,...,...,...
2496052,a2015019-1ecc-48a2-938d-393a289e7a9a,2020-12-11 10:05:17.262,1.022019
2496099,ae9eb0c9-a435-4b9a-b5e2-08d00c43b231,2020-12-11 07:30:00,1.500928
2496100,ae9eb0c9-a435-4b9a-b5e2-08d00c43b231,2020-12-11 08:50:00,1.516509
2496237,039025b9-0b8e-4683-b143-29cfaa9a86bc,2020-12-11 09:41:31,2.200000


In [94]:
sensors = pd.read_csv('./Data/sensors.csv')
sensors = sensors.drop(columns=['ciOrgname', 'OrgName', 'Code', 'PQ_id', 'FullName', 'Description', 'ciCategory', 'CategoryInfos_Name', 
                                'Address', 'PQ_name'])
sensors.drop(sensors[sensors['SIUnit  '] != 'cm'].index, inplace=True)

In [95]:
# Check for duplicates in sensors
num_duplicates = sensors.duplicated().sum()
print(f"Number of duplicate rows in sensors: {num_duplicates}")
print("Duplicate rows in sensors:")
sensors[sensors.duplicated()]

# Drop duplicates in sensors
sensors = sensors.drop_duplicates()

Number of duplicate rows in sensors: 19
Duplicate rows in sensors:


In [96]:
sensors

Unnamed: 0,station_id,station_name,Longitude,Latitude,SIUnit
0,648c0721-9ae3-4a3b-9007-31dd06a5f293,CYC102 朴子市新寮里新寮社區,120.241250,23.450130,cm
1,b320d298-d3aa-4954-874a-79696f550efa,CYC105 東石鄉西崙村栗仔崙(磚仔窯),120.188995,23.428696,cm
2,c7c0c173-be6c-4fd2-b743-921d987e7330,CYC121 太保市埤鄉里埤麻腳社區,120.392550,23.483112,cm
3,bc5af470-def9-4712-95da-8cc29c35fd60,CYC104 東石鄉鰲鼓村四股社區,120.160995,23.508854,cm
4,54c2b021-edc6-418f-bff5-ec96067b24e6,CYC127 水上鄉內溪村民生社區,120.433920,23.441912,cm
...,...,...,...,...,...
1975,0e5ee0a0-81e2-461f-be9e-697135b3ce4c,清水區五權南路99號,120.559340,24.275707,cm
1976,7dcd3796-ffc5-4a7c-bbe3-276a905a71b9,大甲區順天路‧中山路一段,120.627330,24.350716,cm
1977,fb2ecdb0-c037-4907-95e2-9cf02d76c84c,CYC101 朴子市永和里福山社區,120.229220,23.452310,cm
1978,cf450c95-de25-4724-8ae2-4beb1baa1e29,南屯區文山路與精科東路交叉口,120.606240,24.145490,cm


In [111]:
# Left-join dataframes on station_id
df = records.merge(sensors, on='station_id', how='left')
df

Unnamed: 0,station_id,timestamp,value,station_name,Longitude,Latitude,SIUnit
0,beace064-358c-42a3-8f2a-31144f714be8,2022-07-02 18:41:24,4.100000,CYC005 布袋鎮岑海里B站,120.15474,23.379100,cm
1,beace064-358c-42a3-8f2a-31144f714be8,2022-07-02 18:47:24,4.300000,CYC005 布袋鎮岑海里B站,120.15474,23.379100,cm
2,beace064-358c-42a3-8f2a-31144f714be8,2022-07-02 18:53:24,4.100000,CYC005 布袋鎮岑海里B站,120.15474,23.379100,cm
3,beace064-358c-42a3-8f2a-31144f714be8,2022-07-02 18:56:24,4.100000,CYC005 布袋鎮岑海里B站,120.15474,23.379100,cm
4,9e7e1b42-afba-4875-93b9-5a3b9dc401a8,2022-07-02 18:10:43,4.600000,CYC013 布袋鎮岑海里A站,120.14970,23.379630,cm
...,...,...,...,...,...,...,...
2241003,a2015019-1ecc-48a2-938d-393a289e7a9a,2020-12-11 10:05:17.262,1.022019,瓦磘村_保生宮,120.35272,23.652205,cm
2241004,ae9eb0c9-a435-4b9a-b5e2-08d00c43b231,2020-12-11 07:30:00,1.500928,靖興里_鄰近茄苳腳平交道,120.47167,23.642970,cm
2241005,ae9eb0c9-a435-4b9a-b5e2-08d00c43b231,2020-12-11 08:50:00,1.516509,靖興里_鄰近茄苳腳平交道,120.47167,23.642970,cm
2241006,039025b9-0b8e-4683-b143-29cfaa9a86bc,2020-12-11 09:41:31,2.200000,,,,


In [112]:
# Drop NAs
print("Number of missing values per column before dropping:")
print(df.isna().sum())  # Counts NaN values for each column

print("\nTotal missing values in DataFrame:", df.isna().sum().sum())

df = df.dropna(subset=['Longitude', 'Latitude'])

Number of missing values per column before dropping:
station_id           0
timestamp            0
value                0
station_name    456852
Longitude       456852
Latitude        456852
SIUnit          456852
dtype: int64

Total missing values in DataFrame: 1827408


In [113]:
df

Unnamed: 0,station_id,timestamp,value,station_name,Longitude,Latitude,SIUnit
0,beace064-358c-42a3-8f2a-31144f714be8,2022-07-02 18:41:24,4.100000,CYC005 布袋鎮岑海里B站,120.15474,23.379100,cm
1,beace064-358c-42a3-8f2a-31144f714be8,2022-07-02 18:47:24,4.300000,CYC005 布袋鎮岑海里B站,120.15474,23.379100,cm
2,beace064-358c-42a3-8f2a-31144f714be8,2022-07-02 18:53:24,4.100000,CYC005 布袋鎮岑海里B站,120.15474,23.379100,cm
3,beace064-358c-42a3-8f2a-31144f714be8,2022-07-02 18:56:24,4.100000,CYC005 布袋鎮岑海里B站,120.15474,23.379100,cm
4,9e7e1b42-afba-4875-93b9-5a3b9dc401a8,2022-07-02 18:10:43,4.600000,CYC013 布袋鎮岑海里A站,120.14970,23.379630,cm
...,...,...,...,...,...,...,...
2241001,5abec613-f581-4bf9-b41c-dc0619c950c4,2020-12-11 11:30:00,1.545227,長南村_和平街,120.30970,23.654995,cm
2241002,5abec613-f581-4bf9-b41c-dc0619c950c4,2020-12-11 15:03:18.016,1.504207,長南村_和平街,120.30970,23.654995,cm
2241003,a2015019-1ecc-48a2-938d-393a289e7a9a,2020-12-11 10:05:17.262,1.022019,瓦磘村_保生宮,120.35272,23.652205,cm
2241004,ae9eb0c9-a435-4b9a-b5e2-08d00c43b231,2020-12-11 07:30:00,1.500928,靖興里_鄰近茄苳腳平交道,120.47167,23.642970,cm


In [114]:
# Number of unique stations
df['station_id'].value_counts()

station_id
1ae437e3-0b0c-4334-85ca-ae1f7c5c52eb    26984
e23170a1-6dbd-4272-bdff-deb308e26fd9    26796
acd07d9c-a3c2-4295-8023-3055c27a96b5    26224
0a72687b-3b06-41f5-b1d2-6b283f016f2b    25798
4958ecff-7a87-4630-ad46-bc7698269fb6    25718
                                        ...  
ae35af86-5ad0-4fb6-a913-ddb5c221ed96        1
c7c0c173-be6c-4fd2-b743-921d987e7330        1
932ba2df-f554-4db4-8520-9c8ac66d9e72        1
5e5f6ecf-0710-435b-94b3-ac5a261eb16c        1
0b5cefd1-ca04-43aa-ab68-cc9688a90db5        1
Name: count, Length: 1032, dtype: int64

In [117]:
# Distribution of flood depth
df['value'].describe()

count    1.784156e+06
mean     8.678600e+00
std      9.521959e+02
min      1.000000e-04
25%      5.000000e-04
50%      1.170000e-02
75%      4.500000e+00
max      6.038774e+05
Name: value, dtype: float64

In [120]:
# Export to csv
df.to_csv("df.csv", index=False)