# Advanced preprocessing

# Table of Contents
- Load data
- Merge duplicated rows in `fcst` by using mean
- Interpolate fcst
- Convert dtype of 'time' in `energy`
- Merge fcst, obs, and energy
- Fill missing values in using ffill

## Load data
- from basic_processing.ipynb

In [1]:
import pickle
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt

In [2]:
with open('pickles/dangjin_fcst_data.pkl','rb') as f:
    dangjin_fcst = pickle.load(f)
with open('pickles/dangjin_obs_data.pkl','rb') as f:
    dangjin_obs = pickle.load(f)
with open('pickles/ulsan_fcst_data.pkl','rb') as f:
    ulsan_fcst = pickle.load(f)
with open('pickles/ulsan_obs_data.pkl', 'rb') as f:
    ulsan_obs = pickle.load(f)

energy = pd.read_csv(glob('../original_dataset/energy.csv')[0])

## Merge duplicated rows in `fcst` by using mean

In [121]:
print(dangjin_fcst['time'].shape[0], dangjin_fcst['time'].unique().shape[0])
print(ulsan_fcst['time'].shape[0], ulsan_fcst['time'].unique().shape[0])

162208 8788
162208 8788


In [131]:
dangjin_fcst_unique = dangjin_fcst.groupby('time').mean()
ulsan_fcst_unique = ulsan_fcst.groupby('time').mean()

dangjin_fcst_unique = dangjin_fcst_unique.reset_index() # because time was set to index
ulsan_fcst_unique = ulsan_fcst_unique.reset_index()

In [132]:
dangjin_fcst_unique

Unnamed: 0,time,Temperature,Humidity,WindSpeed,WindDirection,Cloud
0,2018-03-01 15:00:00,0.0,60.0,7.300000,309.0,2.0
1,2018-03-01 18:00:00,-2.0,60.0,7.100000,314.0,1.0
2,2018-03-01 21:00:00,-2.0,60.0,6.966667,324.0,1.0
3,2018-03-02 00:00:00,-2.0,52.5,6.250000,336.0,1.0
4,2018-03-02 03:00:00,-3.4,55.0,5.200000,339.0,1.0
...,...,...,...,...,...,...
8783,2021-03-03 12:00:00,7.0,40.0,3.200000,187.0,1.0
8784,2021-03-03 15:00:00,8.0,40.0,4.500000,217.0,1.0
8785,2021-03-03 18:00:00,5.0,55.0,2.200000,210.0,1.0
8786,2021-03-03 21:00:00,1.0,80.0,1.900000,164.0,1.0


## Interpolate fcst
- code from @김진수
- bug?: last row deleted

In [89]:
# code from @김진수
# linear interpolation

def interpolation(df):

    df_copy = df.copy()
    var_names = df.columns

    total_s = list()
    time_list = list()
    
    for var_name in var_names:
        s = list()
        for i in range(df_copy.shape[0] - 1):
            timedeltas = df_copy["time"][i+1] - df_copy["time"][i]
            n_intervals = int(timedeltas / np.timedelta64(1, "h"))

            for j in range(n_intervals):
        
                if var_name == "time":
                    time_stamps = df_copy["time"][i] + timedeltas * j / n_intervals
                    time_list.append(time_stamps)
                else:
                    add_ = df_copy[var_name][i] + (df_copy[var_name][i+1] - df_copy[var_name][i]) / n_intervals * j
                    s.append(add_)

        if var_name == "time":
            time_list = np.array(time_list).reshape(-1,1)
            total_s.append(time_list)
        else:
            s = np.array(s).reshape(-1,1)
            total_s.append(s)

    total_s = np.array(total_s).T.reshape(-1, len(var_names))
    df_converted = pd.DataFrame(total_s, columns = var_names)

    return df_converted

In [143]:
dangjin_fcst_interpolated = interpolation(dangjin_fcst_unique)
ulsan_fcst_interpolated = interpolation(ulsan_fcst_unique)
dangjin_obs_interpolated = interpolation(dangjin_obs)
ulsan_obs_interpolated = interpolation(ulsan_obs)

## Convert dtype of 'time' in `energy`
- 'time' of `energy` is 'object'. 
- However, `pd.to_datetime` doesn't accepts '24:00:00' as an hour. Instead accepts '00:00:00'.
- Therefore, custom function is needed.

In [184]:
def to_datetime_for_energy(date_str):
    if date_str[11:13] == '24':
        date_str = date_str[0:11] + '00' + date_str[13:]
        return pd.to_datetime(date_str) + dt.timedelta(days=1)

    else:
        return pd.to_datetime(date_str)

print(to_datetime_for_energy('2018-03-01 24:00:00'))

2018-03-02 00:00:00


In [185]:
energy['time'] = energy['time'].apply(to_datetime_for_energy)

## Merge fcst, obs, and energy
- inner merge on 'time'

In [199]:
dangjin_merged_ = pd.merge(dangjin_fcst_interpolated, dangjin_obs_interpolated, on='time', suffixes=('_obs','_fcst'))
dangjin_merged = pd.merge(dangjin_merged_, energy.loc[:,['time','dangjin_floating','dangjin_warehouse','dangjin']], on='time')

ulsan_merged_ = pd.merge(ulsan_fcst_interpolated, ulsan_obs_interpolated, on='time', suffixes=('_obs','_fcst'))
ulsan_merged = pd.merge(ulsan_merged_, energy.loc[:,['time','ulsan']], on='time')

display(dangjin_merged)
display(ulsan_merged)

Unnamed: 0,time,Temperature_obs,Humidity_obs,WindSpeed_obs,WindDirection_obs,Cloud_obs,Temperature_fcst,Humidity_fcst,WindSpeed_fcst,WindDirection_fcst,Cloud_fcst,dangjin_floating,dangjin_warehouse,dangjin
0,2018-03-01 15:00:00,0,60,7.3,309,2,0.4,56,5,320,,641.0,496.0,672
1,2018-03-01 16:00:00,-0.666667,60,7.23333,310.667,1.66667,0.3,56,6,290,,536.0,391.0,546
2,2018-03-01 17:00:00,-1.33333,60,7.16667,312.333,1.33333,-0.5,59,4.6,320,,348.0,271.0,364
3,2018-03-01 18:00:00,-2,60,7.1,314,1,-1.3,62,5.4,290,,134.0,80.0,110
4,2018-03-01 19:00:00,-2,60,7.05556,317.333,1,-1.7,63,4.4,320,,11.0,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25611,2021-01-31 18:00:00,6.2,76,3.34,181,3.4,7.8,70,2,200,8,10.0,6.0,0
25612,2021-01-31 19:00:00,6.13333,77.9683,3.46476,178.667,3.6,6.7,75,1.5,200,8,0.0,0.0,0
25613,2021-01-31 20:00:00,6.06667,79.9365,3.58952,176.333,3.8,6.2,77,0.8,200,8,0.0,0.0,0
25614,2021-01-31 21:00:00,6,81.9048,3.71429,174,4,5.3,82,0.7,230,8,0.0,0.0,0


Unnamed: 0,time,Temperature_obs,Humidity_obs,WindSpeed_obs,WindDirection_obs,Cloud_obs,Temperature_fcst,Humidity_fcst,WindSpeed_fcst,WindDirection_fcst,Cloud_fcst,ulsan
0,2018-03-01 15:00:00,8,20,14,298,2,9.9,19,7.2,290,5,318
1,2018-03-01 16:00:00,6.66667,20,10.7667,298,2,9.2,17,6.7,290,1,258
2,2018-03-01 17:00:00,5.33333,20,7.53333,298,2,8.3,17,5.1,290,2,160
3,2018-03-01 18:00:00,4,20,4.3,298,2,6.6,19,2.3,360,,30
4,2018-03-01 19:00:00,3.66667,23.3333,3.52222,302.111,2,5.6,18,1.2,200,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
25611,2021-01-31 18:00:00,9.2,59,2.74,204.6,3.4,9.3,43,3.4,180,0,8
25612,2021-01-31 19:00:00,8.81587,62.5079,2.98063,206.686,3.47302,8.8,50,2.5,200,5,0
25613,2021-01-31 20:00:00,8.43175,66.0159,3.22127,208.771,3.54603,8.7,49,3.9,200,1,0
25614,2021-01-31 21:00:00,8.04762,69.5238,3.4619,210.857,3.61905,8.4,51,2.4,230,7,0


## Fill missing values in using ffill
- fillna with bfill(backward fill)
- if bfill is not available, use ffill(forward fill) instead

In [None]:
# keep the original
dangjin_merged_with_na = dangjin_merged.copy()
ulsan_merged_with_na = ulsan_merged.copy()

In [211]:
# check missing values
print(dangjin_merged_with_na.isna().sum())
print(ulsan_merged_with_na.isna().sum())

time                     0
Temperature_obs          0
Humidity_obs             0
WindSpeed_obs            0
WindDirection_obs        0
Cloud_obs                0
Temperature_fcst        47
Humidity_fcst           43
WindSpeed_fcst          44
WindDirection_fcst      44
Cloud_fcst            4086
dangjin_floating        24
dangjin_warehouse       48
dangjin                  0
dtype: int64
time                     0
Temperature_obs          0
Humidity_obs             0
WindSpeed_obs            0
WindDirection_obs        0
Cloud_obs                0
Temperature_fcst         7
Humidity_fcst            2
WindSpeed_fcst           2
WindDirection_fcst       2
Cloud_fcst            1257
ulsan                    0
dtype: int64


In [208]:
# fill
dangjin_merged = dangjin_merged.fillna(method='bfill')
ulsan_merged = ulsan_merged.fillna(method='bfill')

In [218]:
with open('pickles/dangjin_merged.pkl','wb') as f:
    pickle.dump(dangjin_merged,f)
with open('pickles/ulsan_merged.pkl','wb') as f:
    pickle.dump(ulsan_merged,f)