In [None]:
import pandas as pd
import numpy as np
import os 

from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import MinMaxScaler

mpl.rcParams["figure.dpi"] = 100

In [None]:
# Gather every raw water-data CSV path and order the paths lexicographically,
# so later positional slicing of the list is deterministic.
water_csv_pattern = "../data/raw/water_data/*.csv"
w_list = glob(water_csv_pattern)
w_list.sort()
print(w_list)

In [None]:
# Build one hourly-averaged frame per raw water-data CSV, then stack them into
# `df_hourly_hanriver`.
data = []

# Only the files from position 4 up to (but excluding) the last one of the
# sorted list are processed.
for csv_path in w_list[4:-1]:

    tmp = pd.read_csv(csv_path)
    # Blank cells are stored as a single space in the raw files; treat them as missing.
    tmp = tmp.replace(" ", np.nan)
    tmp = tmp.drop(columns='fw_1018680')
    # Let parsing fail loudly here: the `.dt` accessors below need a real
    # datetime column anyway, so silently keeping strings would only move the
    # error further down.
    tmp['ymdhm'] = pd.to_datetime(tmp['ymdhm'])

    # Sentinel / out-of-range readings are converted to missing values.
    tmp['swl'] = tmp['swl'].replace(0, np.nan)
    tmp['sfw'] = tmp['sfw'].replace([-0.01, 0], np.nan)
    # `Series.mask` is used instead of `tmp[col].loc[cond] = ...`: the chained
    # form assigns through a temporary object and is not guaranteed to update
    # `tmp` (it does nothing under pandas Copy-on-Write).
    tmp['ecpc'] = tmp['ecpc'].mask(tmp['ecpc'] > 200)
    tmp['tototf'] = tmp['tototf'].mask((tmp['tototf'] < 0) | (tmp['tototf'] > 20000))

    # Average all rows that share the same (year, month, day, hour), then fill
    # remaining gaps by linear interpolation down the rows.
    stamp = tmp['ymdhm'].dt
    tmp_grouped = tmp.groupby([stamp.year, stamp.month, stamp.day, stamp.hour]).mean()
    tmp_grouped = tmp_grouped.interpolate(method='linear', axis=0)

    # Expose the four grouping levels as ordinary columns.
    tmp_grouped["year"] = tmp_grouped.index.get_level_values(0)
    tmp_grouped["month"] = tmp_grouped.index.get_level_values(1)
    tmp_grouped["day"] = tmp_grouped.index.get_level_values(2)
    tmp_grouped["hour"] = tmp_grouped.index.get_level_values(3)

    # reset_index returns a new frame; the result must be assigned for the
    # grouped MultiIndex to actually be dropped.
    tmp_grouped = tmp_grouped.reset_index(drop=True)

    data.append(tmp_grouped)


# Stack the per-file frames under one fresh, unique RangeIndex.
df_hourly_hanriver = pd.concat(data, ignore_index=True)

In [None]:
# Keep only the water measurements and the calendar key columns, in this order.
hanriver_columns = [
    'tide_level', 'swl', 'inf', 'sfw', 'ecpc', 'tototf',
    'wl_1018662', 'fw_1018662',
    'wl_1018680',
    'wl_1018683', 'fw_1018683',
    'wl_1019630', 'fw_1019630',
    'year', 'month', 'day', 'hour',
]
df_hourly_hanriver = df_hourly_hanriver.loc[:, hanriver_columns]

In [None]:
# Load the raw rainfall CSVs and aggregate them to hourly means.
path_rf = '../data/raw/rf_data/'

rf_list = os.listdir(path_rf)
rf_list_py = [file for file in rf_list if file.endswith('.csv')]

# Read each selected file once and concatenate in a single call; growing a
# DataFrame with pd.concat inside a loop re-copies all previous rows on every
# iteration. Only positions 4 .. second-to-last of the sorted list are used.
rain_df = pd.concat(
    [pd.read_csv(os.path.join(path_rf, name)) for name in sorted(rf_list_py)[4:-1]],
    ignore_index=True,
)

# Let parsing fail loudly: the `.dt` accessors below require a datetime column.
rain_df['ymdhm'] = pd.to_datetime(rain_df['ymdhm'])

# Average all rows sharing the same (year, month, day, hour), then fill gaps by
# linear interpolation down the rows.
stamp = rain_df['ymdhm'].dt
rain_grouped = rain_df.groupby([stamp.year, stamp.month, stamp.day, stamp.hour]).mean()
rain_grouped = rain_grouped.interpolate(method='linear', axis=0)

# Expose the four grouping levels as ordinary columns and drop the MultiIndex.
rain_grouped["year"] = rain_grouped.index.get_level_values(0)
rain_grouped["month"] = rain_grouped.index.get_level_values(1)
rain_grouped["day"] = rain_grouped.index.get_level_values(2)
rain_grouped["hour"] = rain_grouped.index.get_level_values(3)
rain_grouped = rain_grouped.reset_index(drop=True)

# Calendar keys first, then the three rainfall series.
rain_grouped = rain_grouped.loc[:, ['year', 'month', 'day', 'hour', 'rf_10184100', 'rf_10184110', 'rf_10184140']]
rain_grouped.describe()


In [None]:
# Left-join the hourly water data onto the hourly rainfall rows by calendar
# key (every rainfall row is kept), then persist the merged table.
merge_keys = ['year', 'month', 'day', 'hour']
df_merged = pd.merge(
    rain_grouped,
    df_hourly_hanriver,
    how='left',
    on=merge_keys,
)
df_merged.to_csv("../data/df_merged.csv", index=False)

In [None]:
# Shift the calendar features so they start at zero (month - 5, day - 1).
# NOTE: this rewrites df_merged in place, so running this cell a second time
# without re-running the merge cell shifts the values again.
df_merged["month"] = df_merged["month"] - 5
df_merged["day"] = df_merged["day"] - 1

time_cols = ['year', 'month', 'day', 'hour']

min_max_scaler = MinMaxScaler()

### min-max
# Every column after the first four is scaled.
# NOTE(review): the scaler is fitted on all rows, including the 2021 rows that
# become the test set below -- confirm this is intended.
fitted = min_max_scaler.fit(df_merged.iloc[:, 4:])
array_scaled_merged = min_max_scaler.transform(df_merged.iloc[:, 4:])

is_test = (df_merged['year'] == 2021).to_numpy()

# Train split: every year except 2021. The time columns are re-indexed from 0
# so they line up row-for-row with the freshly built frame of scaled values;
# pd.concat(axis=1) aligns on the index, not on position.
train_scaled_array = array_scaled_merged[~is_test, :]
df_train_total = pd.concat(
    [df_merged.loc[~is_test, time_cols].reset_index(drop=True), pd.DataFrame(train_scaled_array)],
    axis=1,
)

df_train_total.to_csv("../data/df_train_total.csv", index=False)

# Test split: 2021 only, built the same way.
test_scaled_array = array_scaled_merged[is_test, :]
df_test_total = pd.concat(
    [df_merged.loc[is_test, time_cols].reset_index(drop=True), pd.DataFrame(test_scaled_array)],
    axis=1,
)

df_test_total.to_csv("../data/df_test_total.csv", index=False)

### Distribution-shift case

In [None]:
# For each year, write a within-year split: rows with month in {0, 1} form the
# train set and rows with month in {2, 3} form the test set. The scaler is
# fitted on the train rows only and then applied to the whole year.
# NOTE(review): assumes df_merged["month"] has already been shifted by -5 by
# the preceding cell -- confirm the cells are run in order.
for y in [2016, 2017, 2018, 2019, 2020, 2021]:
    tmp_merged = df_merged.loc[df_merged["year"] == y]

    # Compute each row mask once; as plain boolean arrays they can index both
    # the DataFrame and the scaled NumPy array positionally.
    train_mask = tmp_merged["month"].isin([0, 1]).to_numpy()
    test_mask = tmp_merged["month"].isin([2, 3]).to_numpy()
    time_cols = ['year', 'month', 'day', 'hour']

    min_max_scaler = MinMaxScaler()
    ### min-max
    # Every column after the first four is scaled.
    fitted = min_max_scaler.fit(tmp_merged.loc[train_mask].iloc[:, 4:])
    array_scaled_merged_ds = min_max_scaler.transform(tmp_merged.iloc[:, 4:])

    train_scaled_array = array_scaled_merged_ds[train_mask, :]
    df_train_total = pd.concat(
        [tmp_merged.loc[train_mask, time_cols].reset_index(drop=True), pd.DataFrame(train_scaled_array)],
        axis=1,
    )

    df_train_total.to_csv("../data/df_train_total_ds_{}.csv".format(str(y)), index=False)

    test_scaled_array = array_scaled_merged_ds[test_mask, :]
    df_test_total = pd.concat(
        [tmp_merged.loc[test_mask, time_cols].reset_index(drop=True), pd.DataFrame(test_scaled_array)],
        axis=1,
    )

    df_test_total.to_csv("../data/df_test_total_ds_{}.csv".format(str(y)), index=False)

    # Unscaled rows of both splits, saved for reference.
    tmp_merged.loc[train_mask | test_mask, :].to_csv("../data/df_merged_ds_{}.csv".format(str(y)), index=False)
