# Basic preprocessing

# Table of Contents
- Load data
- Change column name
- Calculate the timestamp being forecasted in `fcst`
- Change column name of obs_data
- Final overview & save in pickle

## Load data

In [43]:
# load data
import pandas as pd
from glob import glob
import pickle

# Folder that holds the raw CSV files, relative to this notebook
path = '../original_dataset/'

# Gather every CSV in that folder and put the paths in alphabetical order
files = glob(path+'*.csv')
files.sort()

files

['../original_dataset\\dangjin_fcst_data.csv',
 '../original_dataset\\dangjin_obs_data.csv',
 '../original_dataset\\energy.csv',
 '../original_dataset\\sample_submission.csv',
 '../original_dataset\\site_info.csv',
 '../original_dataset\\ulsan_fcst_data.csv',
 '../original_dataset\\ulsan_obs_data.csv']

In [38]:
# Read every input by its explicit file name instead of by its position in the
# sorted `files` listing: positional indices silently point at a different file
# as soon as a CSV is added to or removed from the data folder.
dangjin_fcst_data = pd.read_csv(path + 'dangjin_fcst_data.csv')
dangjin_obs_data = pd.read_csv(path + 'dangjin_obs_data.csv')
ulsan_fcst_data = pd.read_csv(path + 'ulsan_fcst_data.csv')
ulsan_obs_data = pd.read_csv(path + 'ulsan_obs_data.csv')

energy = pd.read_csv(path + 'energy.csv')

## Change column name
1. obs_data의 '지점'과 '지점명'은 drop
2. Column name을 fcst_data를 기준으로 맞춘다.

In [7]:
# drop & re-order
# Only the timestamp and the five weather columns are kept, in the order used
# for the rename in the next step; every other column is discarded.
dangjin_obs_data = dangjin_obs_data[
    ['일시','기온(°C)','습도(%)','풍속(m/s)','풍향(16방위)','전운량(10분위)']
]
ulsan_obs_data = ulsan_obs_data[
    ['일시','기온(°C)','습도(%)','풍속(m/s)','풍향(16방위)','전운량(10분위)']
]

In [8]:
# change column names
# Borrow the column names of the forecast frame; 'forecast' is taken out of the
# list so that six names remain for the six observation columns.
new_column_names = dangjin_fcst_data.columns.tolist()
new_column_names.remove('forecast')

# The names are assigned by position, so the observation columns must already
# be in the same order as the remaining forecast columns.
dangjin_obs_data.columns = ulsan_obs_data.columns = new_column_names
dangjin_obs_data

Unnamed: 0,Forecast time,Temperature,Humidity,WindSpeed,WindDirection,Cloud
0,2018-03-01 00:00,3.1,96.0,3.6,340.0,
1,2018-03-01 01:00,2.8,97.0,0.7,140.0,
2,2018-03-01 02:00,2.6,95.0,3.2,320.0,
3,2018-03-01 03:00,2.0,97.0,1.9,230.0,
4,2018-03-01 04:00,2.2,97.0,2.1,180.0,
...,...,...,...,...,...,...
25621,2021-01-31 19:00,6.7,75.0,1.5,200.0,8.0
25622,2021-01-31 20:00,6.2,77.0,0.8,200.0,8.0
25623,2021-01-31 21:00,5.3,82.0,0.7,230.0,8.0
25624,2021-01-31 22:00,5.8,77.0,1.0,200.0,8.0


## Calculate the timestamp being forecasted in `fcst`
: fcst_data에서 `Forecast time`과 `forecast`를 더해 예측 대상이 되는 timestamp를 새로운 column(`time`)으로 추가한다.  

1. Convert the dtype to datetime with `pd.to_datetime`
2. Map a timestamp-adding function to the timestamp column

In [15]:
# Parse the 'Forecast time' column of all four frames with pd.to_datetime
for frame in (dangjin_fcst_data, dangjin_obs_data, ulsan_fcst_data, ulsan_obs_data):
    frame['Forecast time'] = pd.to_datetime(frame['Forecast time'])

In [16]:
# calculate
def to_date(x):
    """Return a ``pd.DateOffset`` that spans ``x`` hours.

    Adding the returned offset to a timestamp shifts it ``x`` hours forward.
    """
    hour_offset = pd.DateOffset(hours=x)
    return hour_offset

# Forecasted timestamp = 'Forecast time' + 'forecast' hours.
# pd.to_timedelta converts the whole 'forecast' column in one vectorised call,
# instead of creating one pd.DateOffset object per row through .map, and it
# accepts integer as well as float hour values.
dangjin_fcst_data['time'] = dangjin_fcst_data['Forecast time'] + pd.to_timedelta(dangjin_fcst_data['forecast'], unit='h')
ulsan_fcst_data['time'] = ulsan_fcst_data['Forecast time'] + pd.to_timedelta(ulsan_fcst_data['forecast'], unit='h')



In [17]:
# drop the original columns
# The two source columns are removed by name rather than with `iloc[:, 2:]`:
# a positional slice strips two more columns every time the cell is re-run,
# whereas a second run of this version fails loudly with a KeyError.
dangjin_fcst_data = dangjin_fcst_data.drop(columns=['Forecast time', 'forecast'])
ulsan_fcst_data = ulsan_fcst_data.drop(columns=['Forecast time', 'forecast'])

In [18]:
# re-order: 'time' first, followed by the weather variables
# Selecting by name keeps the cell idempotent; the positional form
# `iloc[:, [-1,0,1,2,3,4]]` rotates the columns again on every re-run.
fcst_column_order = ['time', 'Temperature', 'Humidity', 'WindSpeed', 'WindDirection', 'Cloud']
dangjin_fcst_data = dangjin_fcst_data.loc[:, fcst_column_order]
ulsan_fcst_data = ulsan_fcst_data.loc[:, fcst_column_order]

In [19]:
# Inspect the result: 'time' should be the first column, followed by the weather variables
dangjin_fcst_data

Unnamed: 0,time,Temperature,Humidity,WindSpeed,WindDirection,Cloud
0,2018-03-01 15:00:00,0.0,60.0,7.3,309.0,2.0
1,2018-03-01 18:00:00,-2.0,60.0,7.1,314.0,1.0
2,2018-03-01 21:00:00,-2.0,60.0,6.7,323.0,1.0
3,2018-03-02 00:00:00,-2.0,55.0,6.7,336.0,1.0
4,2018-03-02 03:00:00,-4.0,55.0,5.5,339.0,1.0
...,...,...,...,...,...,...
162203,2021-03-03 12:00:00,7.0,40.0,3.2,187.0,1.0
162204,2021-03-03 15:00:00,8.0,40.0,4.5,217.0,1.0
162205,2021-03-03 18:00:00,5.0,55.0,2.2,210.0,1.0
162206,2021-03-03 21:00:00,1.0,80.0,1.9,164.0,1.0


## Change column name of obs_data
: from 'Forecast time' to 'time'  
This is convenient because 'time' is the column name in 'energy.csv'.  

In [20]:
# Give the observation frames the same timestamp column name ('time') as the forecast frames
dangjin_obs_data = dangjin_obs_data.rename({'Forecast time':'time'}, axis='columns')
ulsan_obs_data = ulsan_obs_data.rename({'Forecast time':'time'}, axis='columns')

## Final overview & save in pickle

In [21]:
# Final overview: Dangjin forecast data
dangjin_fcst_data

Unnamed: 0,time,Temperature,Humidity,WindSpeed,WindDirection,Cloud
0,2018-03-01 15:00:00,0.0,60.0,7.3,309.0,2.0
1,2018-03-01 18:00:00,-2.0,60.0,7.1,314.0,1.0
2,2018-03-01 21:00:00,-2.0,60.0,6.7,323.0,1.0
3,2018-03-02 00:00:00,-2.0,55.0,6.7,336.0,1.0
4,2018-03-02 03:00:00,-4.0,55.0,5.5,339.0,1.0
...,...,...,...,...,...,...
162203,2021-03-03 12:00:00,7.0,40.0,3.2,187.0,1.0
162204,2021-03-03 15:00:00,8.0,40.0,4.5,217.0,1.0
162205,2021-03-03 18:00:00,5.0,55.0,2.2,210.0,1.0
162206,2021-03-03 21:00:00,1.0,80.0,1.9,164.0,1.0


In [22]:
# Final overview: Dangjin observation data
dangjin_obs_data

Unnamed: 0,time,Temperature,Humidity,WindSpeed,WindDirection,Cloud
0,2018-03-01 00:00:00,3.1,96.0,3.6,340.0,
1,2018-03-01 01:00:00,2.8,97.0,0.7,140.0,
2,2018-03-01 02:00:00,2.6,95.0,3.2,320.0,
3,2018-03-01 03:00:00,2.0,97.0,1.9,230.0,
4,2018-03-01 04:00:00,2.2,97.0,2.1,180.0,
...,...,...,...,...,...,...
25621,2021-01-31 19:00:00,6.7,75.0,1.5,200.0,8.0
25622,2021-01-31 20:00:00,6.2,77.0,0.8,200.0,8.0
25623,2021-01-31 21:00:00,5.3,82.0,0.7,230.0,8.0
25624,2021-01-31 22:00:00,5.8,77.0,1.0,200.0,8.0


In [23]:
# Final overview: Ulsan forecast data
ulsan_fcst_data

Unnamed: 0,time,Temperature,Humidity,WindSpeed,WindDirection,Cloud
0,2018-03-01 15:00:00,8.0,20.0,14.0,298.0,2.0
1,2018-03-01 18:00:00,4.0,20.0,4.3,298.0,2.0
2,2018-03-01 21:00:00,3.0,30.0,1.9,309.0,2.0
3,2018-03-02 00:00:00,0.0,40.0,1.5,318.0,2.0
4,2018-03-02 03:00:00,-1.0,45.0,1.8,308.0,2.0
...,...,...,...,...,...,...
162203,2021-03-03 12:00:00,9.0,45.0,3.1,83.0,3.0
162204,2021-03-03 15:00:00,9.0,45.0,3.0,111.0,3.0
162205,2021-03-03 18:00:00,8.0,55.0,2.2,122.0,3.0
162206,2021-03-03 21:00:00,6.0,65.0,0.9,131.0,3.0


In [24]:
# Final overview: Ulsan observation data
ulsan_obs_data

Unnamed: 0,time,Temperature,Humidity,WindSpeed,WindDirection,Cloud
0,2018-03-01 00:00:00,8.2,98.0,3.9,340.0,10.0
1,2018-03-01 01:00:00,7.0,97.0,4.1,320.0,10.0
2,2018-03-01 02:00:00,6.5,80.0,5.9,290.0,
3,2018-03-01 03:00:00,6.2,79.0,4.6,320.0,3.0
4,2018-03-01 04:00:00,6.7,73.0,4.5,320.0,1.0
...,...,...,...,...,...,...
25627,2021-01-31 19:00:00,8.8,50.0,2.5,200.0,5.0
25628,2021-01-31 20:00:00,8.7,49.0,3.9,200.0,1.0
25629,2021-01-31 21:00:00,8.4,51.0,2.4,230.0,7.0
25630,2021-01-31 22:00:00,9.4,51.0,3.3,230.0,8.0


In [25]:
# pickle the data
# Each preprocessed frame is written to its own .pkl file in the working directory.
for pkl_name, frame in (
    ('dangjin_fcst_data.pkl', dangjin_fcst_data),
    ('dangjin_obs_data.pkl', dangjin_obs_data),
    ('ulsan_fcst_data.pkl', ulsan_fcst_data),
    ('ulsan_obs_data.pkl', ulsan_obs_data),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(frame, f)

In [26]:
# how to load pickle (for future works)
# NOTE: pickle.load can execute arbitrary code, so only open pickle files you
# trust (here: the file written by this notebook).
with open('dangjin_fcst_data.pkl','rb') as f:
    loaded_dangjin_fcst_data = pickle.load(f)
# Display the reloaded frame to check the round trip.
loaded_dangjin_fcst_data

Unnamed: 0,time,Temperature,Humidity,WindSpeed,WindDirection,Cloud
0,2018-03-01 15:00:00,0.0,60.0,7.3,309.0,2.0
1,2018-03-01 18:00:00,-2.0,60.0,7.1,314.0,1.0
2,2018-03-01 21:00:00,-2.0,60.0,6.7,323.0,1.0
3,2018-03-02 00:00:00,-2.0,55.0,6.7,336.0,1.0
4,2018-03-02 03:00:00,-4.0,55.0,5.5,339.0,1.0
...,...,...,...,...,...,...
162203,2021-03-03 12:00:00,7.0,40.0,3.2,187.0,1.0
162204,2021-03-03 15:00:00,8.0,40.0,4.5,217.0,1.0
162205,2021-03-03 18:00:00,5.0,55.0,2.2,210.0,1.0
162206,2021-03-03 21:00:00,1.0,80.0,1.9,164.0,1.0
