In [55]:
import pandas as pd
import numpy as np
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
import joblib
import copy
import warnings
warnings.filterwarnings('ignore')

In [56]:
total = pd.read_csv('./make_data/1_final_train.csv')
test = pd.read_csv('./make_data/2_final_test.csv') 
submission = pd.read_csv('data/sample_submission.csv')
weather_10 = pd.read_csv("./make_data/0D_weather_10_float.csv")
ensemble = copy.deepcopy(submission)

In [57]:
def gas_nmae(true_df, pred_df):
    true = true_df.iloc[:,0].to_numpy()
    pred = pred_df.iloc[:,0].to_numpy()
    score = np.mean((np.abs(true-pred))/true)
    return score

### 1,2,3월만 학습에 사용하기 위해 추출

In [58]:
total = total[(total.month==1)|(total.month==2)|(total.month==3)]

### 업체별 train, test 세트를 리스트로 만들기

In [59]:
train_list=[]
for chain in range(7):
    train0 = total[total.s_chain==chain]
    test0 = test[test.s_chain==chain]
    tlist = [train0, test0]
    train_list.append(tlist)    

### LGBM 모델

In [60]:
train_years = [2013,2014,2015,2016,2017]
val_years = [2018]
idx = 0
sco_list = []

for train_df, test_df in train_list:
    
    train = train_df[train_df['year'].isin(train_years)]
    val = train_df[train_df['year'].isin(val_years)]

    features = ['time','weekday', 'no_working', 'temp', 'lunMonth', 'lunDay', 'month', '일사']
    
    train_x = train[features]
    train_y = train['s_qty']
    val_x = val[features]
    val_y = val['s_qty']
    test_x = test_df[features]

    train_x = train_x.reset_index(drop=True)
    train_y = train_y.reset_index(drop=True)
    val_x = val_x.reset_index(drop=True)
    val_y = val_y.reset_index(drop=True)
    test_x = test_x.reset_index(drop=True)
    
    d_train = lgb.Dataset(train_x, train_y)
    d_val = lgb.Dataset(val_x, val_y)

    params = {
        'objective': 'regression',
        'metric':'mae',
        'max_depth' : 10,
        'seed':42
    }

    model = lgb.train(params, d_train, 500, d_val, verbose_eval=50, early_stopping_rounds=10)
    joblib.dump(model, f'./models/lgbm_model_{idx}.pkl')
    pred2 = model.predict(val_x)
    pred_df = pd.DataFrame(pred2)
    valy_df = pd.DataFrame(val_y)
    nmae_score = gas_nmae(valy_df, pred_df)
    sco_list.append(nmae_score)
    preds = model.predict(test_x)
    submission.iloc[0+(2160*idx):2160+(2160*idx), 1] = preds
    idx += 1
    
print(sco_list)
print('score =', np.array(sco_list).mean())

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 334
[LightGBM] [Info] Number of data points in the train set: 10824, number of used features: 8
[LightGBM] [Info] Start training from score 1922.714531
Training until validation scores don't improve for 10 rounds
[50]	valid_0's l1: 133.513
Early stopping, best iteration is:
[62]	valid_0's l1: 132.282
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 334
[LightGBM] [Info] Number of data points in the train set: 10824, number of used features: 8
[LightGBM] [Info] Start training from score 1652.850405
Training until validation scores don't improve for 10 rounds
[50]	valid_0's l1: 141.4
Early stopping, best iteration is:
[61]	valid_0's l1: 139.574
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can s

In [61]:
ensemble['lgbm'] = submission['공급량']

### XGB 모델

In [62]:
train_years = [2013,2014,2015,2016,2017]
val_years = [2018]
idx = 0
sco_list = []

for train_df, test_df in train_list:
    
    train = train_df[train_df['year'].isin(train_years)]
    val = train_df[train_df['year'].isin(val_years)]

    features = ['time','weekday', 'no_working', 'temp', 'lunMonth', 'lunDay', 'month', '일사']

    train_x = train[features]
    train_y = train['s_qty']
    val_x = val[features]
    val_y = val['s_qty']
    test_x = test_df[features]

    train_x = train_x.reset_index(drop=True)
    train_y = train_y.reset_index(drop=True)
    val_x = val_x.reset_index(drop=True)
    val_y = val_y.reset_index(drop=True)
    test_x = test_x.reset_index(drop=True)

    dtrain = xgb.DMatrix(data=train_x, label = train_y)
    dval = xgb.DMatrix(data=val_x, label = val_y)
    wlist = [(dtrain, 'train'), (dval,'eval')]

    params = {
        'learning_rate': 0.05, 
        'objective': 'reg:squarederror',
        'metric':'mae', 
        'seed':42
    }

    model = xgb.train( params, dtrain, 2000, evals=wlist, verbose_eval=200, early_stopping_rounds=100)
    joblib.dump(model, f'./models/xgb_model_{idx}.pkl')
    val_x2 = xgb.DMatrix(val_x)
    pred2 = model.predict(val_x2)
    pred_df = pd.DataFrame(pred2)
    valy_df = pd.DataFrame(val_y)
    nmae_score = gas_nmae(valy_df, pred_df)
    sco_list.append(nmae_score)
    test_x = xgb.DMatrix(test_x)
    preds = model.predict(test_x)
    submission.iloc[0+(2160*idx):2160+(2160*idx), 1] = preds
    idx += 1
    
print(sco_list)
print(np.array(sco_list).mean())

Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:1908.75781	eval-rmse:2056.78516
[200]	train-rmse:92.51829	eval-rmse:182.37749
[229]	train-rmse:88.49469	eval-rmse:183.29411
Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:1642.56238	eval-rmse:1831.23413
[200]	train-rmse:75.11555	eval-rmse:193.22826
[255]	train-rmse:69.36401	eval-rmse:195.57648
Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost c

In [63]:
ensemble['xgb'] = submission['공급량']

### ExtraTree 모델

In [64]:
train_years = [2013,2014,2015,2016,2017]
val_years = [2018]
idx = 0
sco_list = []

for train_df, test_df in train_list:
    
    train = train_df[train_df['year'].isin(train_years)]
    val = train_df[train_df['year'].isin(val_years)]

    features = ['time','weekday', 'no_working', 'temp', 'lunMonth', 'lunDay', 'month', '일사']

    train_x = train[features]
    train_y = train['s_qty']
    val_x = val[features]
    val_y = val['s_qty']
    test_x = test_df[features]

    train_x = train_x.reset_index(drop=True)
    train_y = train_y.reset_index(drop=True)
    val_x = val_x.reset_index(drop=True)
    val_y = val_y.reset_index(drop=True)
    test_x = test_x.reset_index(drop=True)

    model = ExtraTreesRegressor(max_depth=12, n_estimators=130)
    model.fit(train_x, train_y)
    joblib.dump(model, f'./models/ET_model_{idx}.pkl')
    pred2 = model.predict(val_x)

    pred_df = pd.DataFrame(pred2)
    valy_df = pd.DataFrame(val_y)
    nmae_score = gas_nmae(valy_df, pred_df)
    sco_list.append(nmae_score)
    preds = model.predict(test_x)
    print(type(preds))
    submission.iloc[0+(2160*idx):2160+(2160*idx), 1] = preds
    idx += 1
    
print(sco_list)
print(np.array(sco_list).mean())

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[0.07234832780755159, 0.07550939226977922, 0.09328408451340006, 0.06844908710062438, 0.08223177774072636, 0.0966670699094651, 0.07023285789493533]
0.07981751389092599


In [65]:
ensemble['extree'] = submission['공급량']

### RandomForest 모델

In [66]:
train_years = [2013,2014,2015,2016,2017]
val_years = [2018]
idx = 0
sco_list = []

for train_df, test_df in train_list:
    
    train = train_df[train_df['year'].isin(train_years)]
    val = train_df[train_df['year'].isin(val_years)]

    features = ['time','weekday', 'no_working', 'temp', 'lunMonth', 'lunDay', 'month', '일사']

    train_x = train[features]
    train_y = train['s_qty']
    val_x = val[features]
    val_y = val['s_qty']
    test_x = test_df[features]

    train_x = train_x.reset_index(drop=True)
    train_y = train_y.reset_index(drop=True)
    val_x = val_x.reset_index(drop=True)
    val_y = val_y.reset_index(drop=True)
    test_x = test_x.reset_index(drop=True)
    
    model = RandomForestRegressor(max_depth=12, n_estimators=100, random_state=0)
    model.fit(train_x, train_y)
    joblib.dump(model, f'./models/rf_model_{idx}.pkl')
    pred2 = model.predict(val_x)

    pred_df = pd.DataFrame(pred2)
    valy_df = pd.DataFrame(val_y)
    nmae_score = gas_nmae(valy_df, pred_df)
    sco_list.append(nmae_score)
    preds = model.predict(test_x)
    submission.iloc[0+(2160*idx):2160+(2160*idx), 1] = preds
    idx += 1
    
print(sco_list)
print(np.array(sco_list).mean())

[0.07128998129225687, 0.079127065062904, 0.09457833951692249, 0.07217597776345915, 0.08651107790749729, 0.09777222363860316, 0.07212822211155143]
0.08194041247045634


In [67]:
ensemble['rf'] = submission['공급량']

### 모델 결과 조합

In [68]:
submission['공급량'] = (ensemble['extree'] + ensemble['rf'] + ensemble['lgbm'] + ensemble['xgb']) / 4

In [69]:
submission

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,2318.512307
1,2019-01-01 02 A,2019.532236
2,2019-01-01 03 A,2003.164477
3,2019-01-01 04 A,2053.181453
4,2019-01-01 05 A,2115.151596
...,...,...
15115,2019-03-31 20 H,392.147604
15116,2019-03-31 21 H,392.665583
15117,2019-03-31 22 H,372.319881
15118,2019-03-31 23 H,320.386734


In [70]:
submission.to_csv('./submit/3_submission_ensemble.csv', index=False)