In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

## Load and preprocess train data

In [6]:
# Read the merged training frames for both sites.
# NOTE: pickle.load can execute arbitrary code - only open pickles produced by our own pipeline.
with open('../witt_preprocessing/pickles/dangjin_merged.pkl','rb') as f:
    dangjin_data = pickle.load(f)
with open('../witt_preprocessing/pickles/ulsan_merged.pkl','rb') as f:
    ulsan_data = pickle.load(f)


# Shape the frames for modelling.

# use the 'time' column as the index
dangjin_data = dangjin_data.set_index('time')
ulsan_data = ulsan_data.set_index('time')

# dangjin: collapse the three target columns into a single 'dangjin_sum' column
# (plain `+` is kept so a NaN in any of the three propagates to the sum)
dangjin_data = (
    dangjin_data
    .assign(dangjin_sum=lambda d: d['dangjin'] + d['dangjin_floating'] + d['dangjin_warehouse'])
    .drop(columns=['dangjin','dangjin_floating','dangjin_warehouse'])
)

# keep only the rows whose target is not exactly 0
dangjin_data = dangjin_data.loc[dangjin_data['dangjin_sum'].ne(0)]
ulsan_data = ulsan_data.loc[ulsan_data['ulsan'].ne(0)]

## Load and preprocess test data (public LB)

In [40]:
sample_submission = pd.read_csv('../original_dataset/sample_submission.csv')

# keep only the rows whose 'time' string contains '2021-02' (the February timestamps)
sample_submission_feb = sample_submission[sample_submission['time'].str.contains('2021-02')]
sample_submission_feb

Unnamed: 0,time,dangjin_floating,dangjin_warehouse,dangjin,ulsan
0,2021-02-01 01:00:00,0,0,0,0
1,2021-02-01 02:00:00,0,0,0,0
2,2021-02-01 03:00:00,0,0,0,0
3,2021-02-01 04:00:00,0,0,0,0
4,2021-02-01 05:00:00,0,0,0,0
...,...,...,...,...,...
667,2021-02-28 20:00:00,0,0,0,0
668,2021-02-28 21:00:00,0,0,0,0
669,2021-02-28 22:00:00,0,0,0,0
670,2021-02-28 23:00:00,0,0,0,0


In [8]:
# Read the forecast frames used as test inputs.
# NOTE: pickle.load can execute arbitrary code - only open pickles produced by our own pipeline.
with open('../witt_preprocessing/pickles/dangjin_fcst.pkl','rb') as f:
    dangjin = pickle.load(f)
with open('../witt_preprocessing/pickles/ulsan_fcst.pkl','rb') as f:
    ulsan = pickle.load(f)

# time window matching the February part of the sample submission
start = pd.Timestamp('2021-02-01 01:00:00')
end = pd.Timestamp('2021-03-01-00:00:00') # end == 2021-02-28 24:00:00

# index each frame by 'time', then take the rows from start through end
dangjin = dangjin.set_index('time')
dangjin_feb = dangjin.loc[start:end]
print(dangjin_feb.shape)

ulsan = ulsan.set_index('time')
ulsan_feb = ulsan.loc[start:end]
print(ulsan_feb.shape)

(672, 9)
(672, 9)


## Config
### X
- obs보다 fcst가 좋다
- Wind_X와 Wind_Y는 제외하는 것이 좋다
- Temperature는 포함하는 것이 좋다
- Year_sin은 포함하는 것이 좋다
### Hp
- min_samples_leaf = 4

In [57]:
# Input columns: three weather variables (with an '_obs' or '_fcst' suffix)
# followed by the four cyclical time encodings.
x_cols_obs = [f'{name}_obs' for name in ('Temperature', 'Humidity', 'Cloud')] + ['Day_cos', 'Day_sin', 'Year_cos', 'Year_sin']
x_cols_fcst = [f'{name}_fcst' for name in ('Temperature', 'Humidity', 'Cloud')] + ['Day_cos', 'Day_sin', 'Year_cos', 'Year_sin']

# Target column(s) per site.
y_dangjin_cols = ['dangjin_sum']
y_ulsan_cols = ['ulsan']

# Seed passed as random_state to the models.
RS = 518

## for public LB

In [10]:
def predict(model, train_data, x_cols_train, y_cols_train, test_data, x_cols_test):
    """Fit ``model`` on the training frame and predict on the test frame.

    Features are handed to the model as positional NumPy arrays, so the train
    and test frames may name the same features differently (for example
    ``Temperature_fcst`` vs ``Temperature``) as long as ``x_cols_train`` and
    ``x_cols_test`` list them in the same order.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    train_data : pandas.DataFrame
        Frame containing the training feature and target columns.
    x_cols_train : list of str
        Feature columns to read from ``train_data``.
    y_cols_train : list of str
        Target column(s) to read from ``train_data``.
    test_data : pandas.DataFrame
        Frame containing the test feature columns.
    x_cols_test : list of str
        Feature columns to read from ``test_data``, in the same order as
        ``x_cols_train``.

    Returns
    -------
    tuple
        ``(predictions, model)``: whatever ``model.predict`` returns for the
        test features, and the fitted model.

    Raises
    ------
    ValueError
        If ``x_cols_train`` and ``x_cols_test`` differ in length.
    """
    if len(x_cols_train) != len(x_cols_test):
        raise ValueError(
            'x_cols_train and x_cols_test must list the same number of features '
            '(got {} and {})'.format(len(x_cols_train), len(x_cols_test))
        )

    # print
    print('='*50)
    print('x_cols_train: ', x_cols_train)
    print('y_cols_train: ', y_cols_train)
    print('x_cols_test: ', x_cols_test)
    print('='*50)

    # input-target split (as plain arrays: features are matched by position, not by name)
    x = train_data.loc[:, x_cols_train].to_numpy()
    y = train_data.loc[:, y_cols_train].to_numpy()

    # a single target column is passed as a 1-D vector; a column-vector y makes
    # scikit-learn emit a DataConversionWarning on fit
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    # fit
    model.fit(x, y)

    # predict
    test_x = test_data.loc[:, x_cols_test].to_numpy()
    predictions = model.predict(test_x)

    return predictions, model

In [11]:
# feature columns to read from the February test frames
x_cols_feb = ['Temperature', 'Humidity', 'Cloud','Day_cos', 'Day_sin', 'Year_cos', 'Year_sin']

# show which columns each test frame offers
for frame in (dangjin_feb, ulsan_feb):
    print(frame.columns)

Index(['Temperature', 'Humidity', 'Cloud', 'Wind_X', 'Wind_Y', 'Day_cos',
       'Day_sin', 'Year_cos', 'Year_sin'],
      dtype='object')
Index(['Temperature', 'Humidity', 'Cloud', 'Wind_X', 'Wind_Y', 'Day_cos',
       'Day_sin', 'Year_cos', 'Year_sin'],
      dtype='object')


In [48]:
# 'absolute_error' is the current name of the MAE split criterion; the old alias 'mae'
# was deprecated in scikit-learn 1.0 and removed in 1.2 (only scikit-learn < 1.0 needs 'mae').
model = RandomForestRegressor(criterion='absolute_error', verbose=0, n_jobs=-1, random_state=RS, n_estimators=100, min_samples_leaf=4)

# train on the dangjin training frame (forecast features) and predict the February test frame
dangjin_predict, dangjin_model = predict(model, dangjin_data, x_cols_fcst, y_dangjin_cols, dangjin_feb, x_cols_feb)

x_cols_train:  ['Temperature_fcst', 'Humidity_fcst', 'Cloud_fcst', 'Day_cos', 'Day_sin', 'Year_cos', 'Year_sin']
y_cols_train:  ['dangjin_sum']
x_cols_test:  ['Temperature', 'Humidity', 'Cloud', 'Day_cos', 'Day_sin', 'Year_cos', 'Year_sin']
  model.fit(x,y)


In [49]:
# 'absolute_error' is the current name of the MAE split criterion; the old alias 'mae'
# was deprecated in scikit-learn 1.0 and removed in 1.2 (only scikit-learn < 1.0 needs 'mae').
model = RandomForestRegressor(criterion='absolute_error', verbose=0, n_jobs=-1, random_state=RS, n_estimators=100, min_samples_leaf=4)

# train on the ulsan training frame (forecast features) and predict the February test frame
ulsan_predict, ulsan_model = predict(model, ulsan_data, x_cols_fcst, y_ulsan_cols, ulsan_feb, x_cols_feb)

x_cols_train:  ['Temperature_fcst', 'Humidity_fcst', 'Cloud_fcst', 'Day_cos', 'Day_sin', 'Year_cos', 'Year_sin']
y_cols_train:  ['ulsan']
x_cols_test:  ['Temperature', 'Humidity', 'Cloud', 'Day_cos', 'Day_sin', 'Year_cos', 'Year_sin']
  model.fit(x,y)


## sample_submission.csv

In [50]:
def to_submission(dangjin_predict, ulsan_predict, submission=None):
    """Write the summed site predictions into the submission frame.

    The element-wise sum of the two prediction vectors is stored in the first
    ``len(predictions)`` rows of the column at position 1 of the frame. All
    other rows and columns are left untouched.

    Parameters
    ----------
    dangjin_predict, ulsan_predict : array-like of float
        Prediction vectors of identical shape.
    submission : pandas.DataFrame, optional
        Frame to fill. Defaults to the notebook-level ``sample_submission``.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was filled (it is modified in place).

    Raises
    ------
    ValueError
        If the two vectors differ in shape or hold more rows than the frame.
    """
    if submission is None:
        submission = sample_submission

    dangjin_arr = np.asarray(dangjin_predict, dtype=float)
    ulsan_arr = np.asarray(ulsan_predict, dtype=float)
    if dangjin_arr.shape != ulsan_arr.shape:
        raise ValueError(
            'prediction shapes differ: {} vs {}'.format(dangjin_arr.shape, ulsan_arr.shape)
        )

    # sum (doesn't matter)
    total = dangjin_arr + ulsan_arr
    n_rows = total.shape[0]
    if n_rows > len(submission):
        raise ValueError(
            'got {} predictions for a frame with {} rows'.format(n_rows, len(submission))
        )

    # The target column may hold integers; upcast it explicitly so the float
    # predictions are not written into an integer column (an implicit upcast
    # on setitem is deprecated in recent pandas).
    target_col = submission.columns[1]
    if n_rows and submission[target_col].dtype.kind in 'iub':
        submission[target_col] = submission[target_col].astype(float)

    # add
    submission.iloc[:n_rows, 1] = total

    return submission

In [None]:
def below_320(df, threshold=320):
    """Raise every ``dangjin_floating`` value below ``threshold`` up to ``threshold``.

    Values at or above the threshold, and missing values, are left unchanged.
    The column keeps its dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``dangjin_floating`` column. It is modified in place.
    threshold : number, default 320
        Lower bound applied to the column.

    Returns
    -------
    pandas.DataFrame
        The same frame object, for chaining.
    """
    # vectorised lower clip instead of a per-element Python callback
    df['dangjin_floating'] = df['dangjin_floating'].clip(lower=threshold)
    return df

In [56]:
# output file for this run
PATH = 'rf_min-leaf-4_320_ver2.csv'

# Fill the submission frame with the summed predictions, then floor
# 'dangjin_floating' at 320. The floor is applied to every row of the frame,
# not only to the rows that received a prediction.
submission_org = to_submission(dangjin_predict, ulsan_predict)
submission_320 = below_320(submission_org)

# write without the index column
submission_320.to_csv(PATH, index=False)

# read the file back to check what was written
pd.read_csv(PATH)

Unnamed: 0,time,dangjin_floating,dangjin_warehouse,dangjin,ulsan
0,2021-02-01 01:00:00,320.0,0,0,0
1,2021-02-01 02:00:00,320.0,0,0,0
2,2021-02-01 03:00:00,320.0,0,0,0
3,2021-02-01 04:00:00,320.0,0,0,0
4,2021-02-01 05:00:00,320.0,0,0,0
...,...,...,...,...,...
1387,2021-07-08 20:00:00,320.0,0,0,0
1388,2021-07-08 21:00:00,320.0,0,0,0
1389,2021-07-08 22:00:00,320.0,0,0,0
1390,2021-07-08 23:00:00,320.0,0,0,0


## Playing with already-submitted data

In [65]:
# load an earlier submission file from the working directory for further adjustment
k = pd.read_csv('rf_min-leaf-4_320.csv')

In [67]:
def add_20(df, increment=20, threshold=320):
    """Add ``increment`` to every ``dangjin_floating`` value strictly above ``threshold``.

    Values at or below the threshold, and missing values, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``dangjin_floating`` column. It is modified in place.
    increment : number, default 20
        Amount added to the values above the threshold.
    threshold : number, default 320
        Only values strictly greater than this are shifted.

    Returns
    -------
    pandas.DataFrame
        The same frame object, for chaining.
    """
    values = df['dangjin_floating']
    # vectorised: replace only the entries above the threshold with their shifted value
    df['dangjin_floating'] = values.mask(values > threshold, values + increment)
    return df

In [69]:
# shift the values above 320 by 20 and save; the helper defined in this notebook is add_20
# (there is no add_100), which also matches the output file name
add_20(k).to_csv('rf_min-leaf-4_320_add20.csv',index=False)