In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import pickle

## Load and preprocess train data

In [19]:
# load data
with open('../witt_preprocessing/pickles/dangjin_merged.pkl','rb') as f:
    dangjin_data = pickle.load(f)
with open('../witt_preprocessing/pickles/ulsan_merged.pkl','rb') as f:
    ulsan_data = pickle.load(f)


# preprocess data for modeling

# time as index
dangjin_data.set_index('time', inplace=True)
ulsan_data.set_index('time', inplace=True)

# drop fcst features (one-weather structure)
dangjin_data.drop(columns=['Temperature_fcst', 'Humidity_fcst','Cloud_fcst','Wind_X_fcst','Wind_Y_fcst'], inplace=True)
ulsan_data.drop(columns=['Temperature_fcst', 'Humidity_fcst','Cloud_fcst','Wind_X_fcst','Wind_Y_fcst'], inplace=True)

# dangjin - sum target values
dangjin_data['dangjin_sum'] = dangjin_data['dangjin'] + dangjin_data['dangjin_floating'] + dangjin_data['dangjin_warehouse']
dangjin_data.drop(columns=['dangjin','dangjin_floating','dangjin_warehouse'], inplace=True)

# dropna
dangjin_data.dropna(inplace=True)


display(dangjin_data)
display(ulsan_data)

Unnamed: 0_level_0,Temperature_obs,Humidity_obs,Cloud_obs,Wind_X_obs,Wind_Y_obs,Day_cos,Day_sin,Year_cos,Year_sin,dangjin_sum
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-03-02 00:00:00,-2.000000,50.000000,2.763537,5.298564,-136.663512,1.000000e+00,0.000000,0.999407,0.034422,0.0
2018-03-02 01:00:00,-2.333333,51.666667,2.763537,5.088343,-131.604586,9.659258e-01,0.258819,0.999407,0.034422,0.0
2018-03-02 02:00:00,-2.666667,53.333333,2.763537,4.878123,-126.545661,8.660254e-01,0.500000,0.999407,0.034422,0.0
2018-03-02 03:00:00,-3.000000,55.000000,2.763537,4.667902,-121.486735,7.071068e-01,0.707107,0.999407,0.034422,0.0
2018-03-02 04:00:00,-3.333333,56.666667,2.763537,4.468414,-95.331116,5.000000e-01,0.866025,0.999407,0.034422,0.0
...,...,...,...,...,...,...,...,...,...,...
2021-01-31 18:00:00,6.000000,75.000000,7.961345,-3.374657,-22.789567,-1.836970e-16,-1.000000,0.860961,0.508671,16.0
2021-01-31 19:00:00,6.000000,76.666667,7.961345,-3.521363,-3.549990,2.588190e-01,-0.965926,0.860961,0.508671,0.0
2021-01-31 20:00:00,6.000000,78.333333,7.961345,-3.668069,15.689587,5.000000e-01,-0.866025,0.860961,0.508671,0.0
2021-01-31 21:00:00,6.000000,80.000000,7.961345,-3.814776,34.929164,7.071068e-01,-0.707107,0.860961,0.508671,0.0


Unnamed: 0_level_0,Temperature_obs,Humidity_obs,Cloud_obs,Wind_X_obs,Wind_Y_obs,ulsan,Day_cos,Day_sin,Year_cos,Year_sin
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-03-02 00:00:00,0.000000,40.000000,3.591006,1.010290,-202.011846,0,1.000000e+00,0.000000,0.999407,0.034422
2018-03-02 01:00:00,-0.333333,41.666667,3.591006,0.964412,-221.844994,0,9.659258e-01,0.258819,0.999407,0.034422
2018-03-02 02:00:00,-0.666667,43.333333,3.591006,0.918535,-241.678143,0,8.660254e-01,0.500000,0.999407,0.034422
2018-03-02 03:00:00,-1.000000,45.000000,3.591006,0.872657,-261.511292,0,7.071068e-01,0.707107,0.999407,0.034422
2018-03-02 04:00:00,-1.000000,45.000000,2.967691,0.917287,-258.350002,0,5.000000e-01,0.866025,0.999407,0.034422
...,...,...,...,...,...,...,...,...,...,...
2021-01-31 18:00:00,9.000000,55.000000,8.638197,-2.472747,-61.185365,8,-1.836970e-16,-1.000000,0.860961,0.508671
2021-01-31 19:00:00,8.666667,60.000000,8.638197,-2.629993,-75.790243,0,2.588190e-01,-0.965926,0.860961,0.508671
2021-01-31 20:00:00,8.333333,65.000000,8.638197,-2.787240,-90.395122,0,5.000000e-01,-0.866025,0.860961,0.508671
2021-01-31 21:00:00,8.000000,70.000000,8.638197,-2.944486,-105.000000,0,7.071068e-01,-0.707107,0.860961,0.508671


## Load and preprocess test data (public LB)

In [3]:
sample_submission = pd.read_csv('../original_dataset/sample_submission.csv')

# 2월 일자만
sample_submission_feb = sample_submission.loc[sample_submission['time'].str.contains('2021-02')]
sample_submission_feb

Unnamed: 0,time,dangjin_floating,dangjin_warehouse,dangjin,ulsan
0,2021-02-01 01:00:00,0,0,0,0
1,2021-02-01 02:00:00,0,0,0,0
2,2021-02-01 03:00:00,0,0,0,0
3,2021-02-01 04:00:00,0,0,0,0
4,2021-02-01 05:00:00,0,0,0,0
...,...,...,...,...,...
667,2021-02-28 20:00:00,0,0,0,0
668,2021-02-28 21:00:00,0,0,0,0
669,2021-02-28 22:00:00,0,0,0,0
670,2021-02-28 23:00:00,0,0,0,0


In [4]:
# load
with open('../witt_preprocessing/pickles/dangjin_fcst.pkl','rb') as f:
    dangjin = pickle.load(f)
with open('../witt_preprocessing/pickles/ulsan_fcst.pkl','rb') as f:
    ulsan = pickle.load(f)

# index according to sample submission
start, end  = pd.Timestamp('2021-02-01 01:00:00'), pd.Timestamp('2021-03-01-00:00:00') # end == 2021-02-28 24:00:00

# slice test data
dangjin.set_index('time', inplace=True)
dangjin_feb = dangjin.loc[start:end,:]
print(dangjin_feb.shape)

ulsan.set_index('time', inplace=True)
ulsan_feb = ulsan.loc[start:end,:]
print(ulsan_feb.shape)

(672, 9)
(672, 9)


## train and predict - dangjin

In [20]:
# input-target split
x_cols = ['Temperature_obs', 'Humidity_obs','Cloud_obs','Wind_X_obs','Wind_Y_obs','Day_cos','Day_sin','Year_cos','Year_sin']
y_cols = ['dangjin_sum']

x = dangjin_data.loc[:,x_cols]
y = dangjin_data.loc[:,y_cols]

# train
model = RandomForestRegressor(criterion='mae', verbose=2, n_jobs=-1)
model.fit(x,y)

# predict
dangjin_predict = model.predict(dangjin_feb)

  model.fit(x,y)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
building tree 1 of 100building tree 2 of 100building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100


building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.9min
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building 

In [168]:
# input-target split
x_cols = ['Temperature_obs', 'Humidity_obs','Cloud_obs','Wind_X_obs','Wind_Y_obs','Day_cos','Day_sin','Year_cos','Year_sin']
y_cols = ['ulsan']

x = ulsan_data.loc[:,x_cols]
y = ulsan_data.loc[:,y_cols]

# train
model = RandomForestRegressor(criterion='mae', verbose=2, n_jobs=-1)
model.fit(x,y)

# predict
ulsan_predict = model.predict(ulsan_feb)

  model.fit(x,y)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.8min
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building 

## sample_submission.csv

In [43]:
# sum predict (상관없음; 평가식 참고)
predict = dangjin_predict + ulsan_predict
sample_submission.iloc[:predict.shape[0],1] = predict
sample_submission

Unnamed: 0,time,dangjin_floating,dangjin_warehouse,dangjin,ulsan
0,2021-02-01 01:00:00,0.14,0,0,0
1,2021-02-01 02:00:00,0.07,0,0,0
2,2021-02-01 03:00:00,0.00,0,0,0
3,2021-02-01 04:00:00,0.17,0,0,0
4,2021-02-01 05:00:00,0.22,0,0,0
...,...,...,...,...,...
1387,2021-07-08 20:00:00,0.00,0,0,0
1388,2021-07-08 21:00:00,0.00,0,0,0
1389,2021-07-08 22:00:00,0.00,0,0,0
1390,2021-07-08 23:00:00,0.00,0,0,0


In [44]:
sample_submission.to_csv('rf_single-timestep.csv', index=False)
pd.read_csv('rf_single-timestep.csv')

Unnamed: 0,time,dangjin_floating,dangjin_warehouse,dangjin,ulsan
0,2021-02-01 01:00:00,0.14,0,0,0
1,2021-02-01 02:00:00,0.07,0,0,0
2,2021-02-01 03:00:00,0.00,0,0,0
3,2021-02-01 04:00:00,0.17,0,0,0
4,2021-02-01 05:00:00,0.22,0,0,0
...,...,...,...,...,...
1387,2021-07-08 20:00:00,0.00,0,0,0
1388,2021-07-08 21:00:00,0.00,0,0,0
1389,2021-07-08 22:00:00,0.00,0,0,0
1390,2021-07-08 23:00:00,0.00,0,0,0
