In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor   # 앙상블(의사결정트리 확장판)
from sklearn.ensemble import GradientBoostingRegressor

import xgboost as xgb
import lightgbm as lgb

In [2]:
# Read the training table and the table that will be predicted on.
train = pd.read_csv("../CSV/train_data.csv")
test = pd.read_csv("../CSV/pred_test.csv")

# Print both shapes on one line as a quick check that the load worked.
shape_report = "train shape : {}, test shape : {}"
print(shape_report.format(train.shape, test.shape))

train shape : (368088, 12), test shape : (15120, 12)


In [3]:
# Preview the first three rows of the training frame.
train.head(3)

Unnamed: 0,연월일,시간,일시,year,month,day,hour,weekday,구분,구분_int,공급량,기온(°C)
0,2013-01-01,1,2013-01-01 00:00:00,2013,1,1,0,1,A,0,2497.129,-8.3
1,2013-01-01,2,2013-01-01 01:00:00,2013,1,1,1,1,A,0,2363.265,-8.5
2,2013-01-01,3,2013-01-01 02:00:00,2013,1,1,2,1,A,0,2258.505,-8.4


### 다양한 모델 비교
* RandomForestRegressor
* GradientBoostingRegressor
* XGBRegressor (xgboost)
* LGBMRegressor (lightgbm)

(LinearRegression, DecisionTreeRegressor는 import만 되어 있고 아래 비교에는 포함되지 않음)

In [4]:
def nmae(true_df, pred_df):
    """Return the normalized mean absolute error between two id-keyed frames.

    Both frames are read positionally: column 0 is the row id and column 1 is
    the value. Predictions whose id does not occur in ``true_df`` are dropped,
    and both frames are sorted by id before comparison, so the incoming row
    order does not matter.

    Parameters
    ----------
    true_df : pandas.DataFrame
        Ground truth; column 0 = id, column 1 = observed value.
    pred_df : pandas.DataFrame
        Predictions; column 0 = id, column 1 = predicted value.

    Returns
    -------
    float
        ``mean(|true - pred| / |true|)`` over the matched ids. For strictly
        positive targets this equals ``mean(|true - pred| / true)``. A true
        value of exactly 0 yields ``inf``/``nan`` through NumPy division.

    Raises
    ------
    ValueError
        If, after filtering, the prediction ids do not line up one-to-one
        with the ids in ``true_df`` (missing or duplicated predictions).
    """
    # Keep only predictions whose id is part of the evaluation set.
    wanted_ids = true_df.iloc[:, 0]
    matched = pred_df[pred_df.iloc[:, 0].isin(wanted_ids)]

    # Sort both sides by id so rows pair up position by position.
    pred_sorted = matched.sort_values(by=[matched.columns[0]], ascending=[True])
    true_sorted = true_df.sort_values(by=[true_df.columns[0]], ascending=[True])

    # Without this check a length mismatch is either broadcast silently
    # (a single matched prediction) or fails with an opaque shape error.
    true_ids = true_sorted.iloc[:, 0].to_numpy()
    pred_ids = pred_sorted.iloc[:, 0].to_numpy()
    if not np.array_equal(true_ids, pred_ids):
        raise ValueError(
            "nmae: prediction ids do not match true ids "
            "({} true rows vs {} matched prediction rows)".format(
                len(true_ids), len(pred_ids)
            )
        )

    true = true_sorted.iloc[:, 1].to_numpy()
    pred = pred_sorted.iloc[:, 1].to_numpy()

    # Normalise by the magnitude of the target so negative targets cannot
    # produce negative per-row errors that cancel out in the mean.
    score = np.mean(np.abs(true - pred) / np.abs(true))

    return score

In [5]:
# Fit each candidate regressor on nine train/test splits (10% .. 90% held
# out) and record R^2, MAE, MSE, RMSE and NMAE for every (model, split) run.
start_time = time.time()

X = train[["year", "month", "day", "hour", "weekday", "구분_int", "기온(°C)"]]
y = train["공급량"]

# Display names, in the same order as the estimators in `models`.
model_list = ["RandomForestRegressor",
              "GradientBoostingRegressor",
              "xgboost",
              "lightgbm"]

models = [RandomForestRegressor(n_jobs=-1, random_state=37),
          GradientBoostingRegressor(random_state=37),
          xgb.XGBRegressor(),
          lgb.LGBMRegressor()]

# One entry per run; these parallel lists are combined into a DataFrame later.
test_size = []
train_score = []
test_score = []
MAE = []
MSE = []
RMSE = []
NMAE = []
model_name = []

for name, model in zip(model_list, models):
    for i in range(1, 10):
        # `i` is the held-out share in tenths; the fixed seed keeps the
        # splits identical across models.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=i / 10, random_state=77
        )
        print("model :", name)
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        # nmae() expects (id, value) frames: discard the original row labels
        # and expose a fresh 0..n-1 position as the id column.
        true_y = pd.DataFrame({"true_y": y_test}).reset_index(drop=True).reset_index()
        pred_y = pd.DataFrame({"pred_y": pred}).reset_index()

        # Compute every metric exactly once; scoring re-predicts the whole
        # split, so repeating it for print and append doubles the cost.
        r2_train = model.score(X_train, y_train)
        r2_test = model.score(X_test, y_test)
        mae = mean_absolute_error(y_test, pred)
        mse = mean_squared_error(y_test, pred)
        rmse = np.sqrt(mse)
        nmae_score = nmae(true_y, pred_y)

        # Coefficient of determination
        print("학습용 : {}, 테스트용 : {}".format(10 - i, i))
        print("학습용 데이터 결정계수: {:.3f}".format(r2_train))
        print("테스트 데이터 결정계수: {:.3f}".format(r2_test))

        # Error metrics
        print("MAE : {:.6f}".format(mae))
        print("MSE : {:.6f}".format(mse))
        print("RMSE : {:.6f}".format(rmse))
        print("NMAE : {:.6f}".format(nmae_score))
        print()

        test_size.append(i)
        train_score.append(r2_train)
        test_score.append(r2_test)
        MAE.append(mae)
        MSE.append(mse)
        RMSE.append(rmse)
        NMAE.append(nmae_score)
        model_name.append(name)

print("실행 시간 : {:.3f}".format(time.time() - start_time))

model : RandomForestRegressor
학습용 : 9, 테스트용 : 1
학습용 데이터 결정계수: 0.999
테스트 데이터 결정계수: 0.991
MAE : 43.284478
MSE : 8134.913455
RMSE : 90.193755
NMAE : 0.065845

model : RandomForestRegressor
학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.999
테스트 데이터 결정계수: 0.991
MAE : 44.606335
MSE : 7742.173601
RMSE : 87.989622
NMAE : 0.073360

model : RandomForestRegressor
학습용 : 7, 테스트용 : 3
학습용 데이터 결정계수: 0.999
테스트 데이터 결정계수: 0.990
MAE : 46.479649
MSE : 8875.827463
RMSE : 94.211610
NMAE : 0.081462

model : RandomForestRegressor
학습용 : 6, 테스트용 : 4
학습용 데이터 결정계수: 0.999
테스트 데이터 결정계수: 0.989
MAE : 48.295092
MSE : 9712.169562
RMSE : 98.550340
NMAE : 0.085867

model : RandomForestRegressor
학습용 : 5, 테스트용 : 5
학습용 데이터 결정계수: 0.998
테스트 데이터 결정계수: 0.988
MAE : 50.271398
MSE : 10040.060500
RMSE : 100.200102
NMAE : 0.093086

model : RandomForestRegressor
학습용 : 4, 테스트용 : 6
학습용 데이터 결정계수: 0.998
테스트 데이터 결정계수: 0.987
MAE : 53.116626
MSE : 10797.856883
RMSE : 103.912737
NMAE : 0.107326

model : RandomForestRegressor
학습용 : 3, 테스트용 : 7
학습용 데이터 결정계수

In [7]:
# The per-run lists are combined column-wise below, so print each length
# first; a mismatch would be visible here before the frame is built.
length_report = [
    ("test_size length : ", test_size),
    ("train_score length :", train_score),
    ("test_score length :", test_score),
    ("MAE length :", MAE),
    ("MSE length :", MSE),
    ("RMSE length :", RMSE),
    ("NMAE length :", NMAE),
    ("model_name length :", model_name),
]
for label, values in length_report:
    print(label, len(values))

# One row per (model, split) run.
data_dict = {
    "model": model_name,
    "test_size": test_size,
    "train_score": train_score,
    "test_score": test_score,
    "MAE": MAE,
    "MSE": MSE,
    "RMSE": RMSE,
    "NMAE": NMAE,
}

# Show the two R^2 columns as percentages.
df = pd.DataFrame(data_dict).assign(
    train_score=lambda frame: frame["train_score"] * 100,
    test_score=lambda frame: frame["test_score"] * 100,
)
df

test_size length :  36
train_score length : 36
test_score length : 36
MAE length : 36
MSE length : 36
RMSE length : 36
NMAE length : 36
model_name length : 36


Unnamed: 0,model,test_size,train_score,test_score,MAE,MSE,RMSE,NMAE
0,RandomForestRegressor,1,99.872255,99.064258,43.284478,8134.913455,90.193755,0.065845
1,RandomForestRegressor,2,99.865293,99.101323,44.606335,7742.173601,87.989622,0.07336
2,RandomForestRegressor,3,99.860587,98.968354,46.479649,8875.827463,94.21161,0.081462
3,RandomForestRegressor,4,99.860972,98.869149,48.295092,9712.169562,98.55034,0.085867
4,RandomForestRegressor,5,99.849374,98.826529,50.271398,10040.0605,100.200102,0.093086
5,RandomForestRegressor,6,99.833871,98.738483,53.116626,10797.856883,103.912737,0.107326
6,RandomForestRegressor,7,99.812347,98.641897,56.475974,11644.63174,107.910295,0.128491
7,RandomForestRegressor,8,99.785148,98.479088,61.081844,13066.034828,114.306758,0.158208
8,RandomForestRegressor,9,99.738458,98.157422,69.026887,15830.116293,125.81779,0.230166
9,GradientBoostingRegressor,1,95.390802,95.308636,142.563236,40784.580645,201.951927,0.978683


### 최종 모델 선택

In [12]:
# Fit the model used for the submission on an 80/20 split and print the same
# metrics as the comparison loop.
# NOTE(review): this cell fits LGBMRegressor, but the submission file written
# further down is named "...RandomForest_model.csv" - confirm which model is
# meant to be the final one.
feature_cols = ["year", "month", "day", "hour", "weekday", "구분_int", "기온(°C)"]
X = train[feature_cols]
y = train["공급량"]

i = 2  # held-out share in tenths

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=i / 10, random_state=77
)

model = lgb.LGBMRegressor()
model.fit(X_train, y_train)
pred = model.predict(X_test)

# nmae() expects (id, value) frames; use the 0..n-1 row position as the id.
true_y = pd.DataFrame({"true_y": y_test}).reset_index(drop=True).reset_index()
pred_y = pd.DataFrame({"pred_y": pred}).reset_index()

# Coefficient of determination
print("학습용 : {}, 테스트용 : {}".format(10 - i, i))
print("학습용 데이터 결정계수: {:.3f}".format(model.score(X_train, y_train)))
print("테스트 데이터 결정계수: {:.3f}".format(model.score(X_test, y_test)))

# Error metrics (RMSE is derived from the MSE computed once)
mse = mean_squared_error(y_test, pred)
print("MAE : {:.6f}".format(mean_absolute_error(y_test, pred)))
print("MSE : {:.6f}".format(mse))
print("RMSE : {:.6f}".format(np.sqrt(mse)))
print("NMAE : {:.6f}".format(nmae(true_y, pred_y)))

# model.feature_importances_ can be inspected here if needed.

학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.999
테스트 데이터 결정계수: 0.991
MAE : 44.61232698225977
MSE : 7742.093434334801
RMSE :  87.9891665737027
NMAE :  0.07338350169713619


In [13]:
# Load the sample submission file; its 공급량 column is overwritten with
# predictions in a later cell.
sample_submission_path = '../CSV/가스공급량 수요예측 모델개발 data/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)

In [14]:
# Preview the first three rows of the submission frame as loaded.
submission.head(3)

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0


In [15]:
# Select the seven feature columns from the test table, predict with the
# fitted model, and store the predictions in the submission's 공급량 column.
# The assignment is positional: row k of `test` fills row k of `submission`.
feature_cols = ["year", "month", "day", "hour", "weekday", "구분_int", "기온(°C)"]
test_x = test[feature_cols]
pred = model.predict(test_x)
submission["공급량"] = pred

In [16]:
# Display the prediction array produced for the test table.
pred

array([1860.31467, 1747.67289, 1539.22908, ...,  255.04951,  202.48655,
        196.61685])

In [17]:
# First three rows of the submission frame after 공급량 was filled in.
submission.head(3)

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,1860.31467
1,2019-01-01 02 A,1747.67289
2,2019-01-01 03 A,1539.22908


In [18]:
# Last three rows of the submission frame after 공급량 was filled in.
submission.tail(3)

Unnamed: 0,일자|시간|구분,공급량
15117,2019-03-31 22 H,255.04951
15118,2019-03-31 23 H,202.48655
15119,2019-03-31 24 H,196.61685


In [19]:
# Write the filled submission without the pandas row index.
# NOTE(review): the file name says "RandomForest", but the cell that fits
# `model` above uses lgb.LGBMRegressor - confirm the name matches the model.
submission_path = "../CSV/Submission/05_7_columns_RandomForest_model.csv"
submission.to_csv(submission_path, index=False)

In [26]:
# Train a LightGBM booster through the native lgb.train API on nine
# train/test splits (10% .. 90% held out) and record MAE, MSE, RMSE and NMAE.
start_time = time.time()

X = train[["year", "month", "day", "hour", "weekday", "구분_int", "기온(°C)"]]
y = train["공급량"]

# One entry per split. train_score / test_score / model_name are reset but
# stay empty: R^2 is not recorded in this cell.
test_size = []
train_score = []
test_score = []
MAE = []
MSE = []
RMSE = []
NMAE = []
model_name = []

# Booster settings are the same for every split, so build them once.
# Other knobs tried earlier and left at their LightGBM defaults here:
# boosting_type="dart", sub_feature, num_leaves, min_data.
params = {
    "objective": "regression",
    "metric": "mse",
    "seed": 42,
    "learning_rate": 0.1,
    "max_depth": 16,
}

for i in range(1, 10):
    # `i` is the held-out share in tenths.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=i / 10, random_state=77
    )
    d_train = lgb.Dataset(X_train, y_train)
    d_val = lgb.Dataset(X_test, y_test)

    # NOTE(review): the hold-out split is also the early-stopping validation
    # set, so the metrics below are measured on data that influenced the
    # stopping point; a separate validation split would be a cleaner estimate.
    #
    # Early stopping and periodic logging are passed as callbacks; the
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments are no
    # longer accepted by lgb.train in LightGBM 4.x.
    model = lgb.train(
        params,
        d_train,
        num_boost_round=500,
        valid_sets=[d_val],
        callbacks=[
            lgb.early_stopping(stopping_rounds=10),
            lgb.log_evaluation(period=20),
        ],
    )
    pred = model.predict(X_test)

    # nmae() expects (id, value) frames; use the 0..n-1 row position as the id.
    true_y = pd.DataFrame({"true_y": y_test}).reset_index(drop=True).reset_index()
    pred_y = pd.DataFrame({"pred_y": pred}).reset_index()

    # Compute each metric once and reuse it for both printing and recording.
    mae = mean_absolute_error(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    rmse = np.sqrt(mse)
    nmae_score = nmae(true_y, pred_y)

    print("학습용 : {}, 테스트용 : {}".format(10 - i, i))

    # Error metrics
    print("MAE : {:.6f}".format(mae))
    print("MSE : {:.6f}".format(mse))
    print("RMSE : {:.6f}".format(rmse))
    print("NMAE : {:.6f}".format(nmae_score))
    print()

    test_size.append(i)
    MAE.append(mae)
    MSE.append(mse)
    RMSE.append(rmse)
    NMAE.append(nmae_score)

print("실행 시간 : {:.3f}".format(time.time() - start_time))



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 331279, number of used features: 7
[LightGBM] [Info] Start training from score 948.254497
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l2: 51665
[40]	valid_0's l2: 22361.8
[60]	valid_0's l2: 16392.5
[80]	valid_0's l2: 13860.2
[100]	valid_0's l2: 12206
[120]	valid_0's l2: 11228.7
[140]	valid_0's l2: 10467.8
[160]	valid_0's l2: 9886.92
[180]	valid_0's l2: 9429.2
[200]	valid_0's l2: 8996.86
[220]	valid_0's l2: 8603.59
[240]	valid_0's l2: 8267.23
[260]	valid_0's l2: 8017.32
[280]	valid_0's l2: 7791.48
[300]	valid_0's l2: 7574.43
[320]	valid_0's l2: 7385.12
[340]	valid_0's l2: 7219.2
[360]	valid_0's l2: 7091.92
[380]	valid_0's l2: 6934.76
[400]	valid_0's l2: 6804.53
[420]	valid_0's l2: 6683.23
[440]	valid_0's l2: 6559.25
[460]	valid_0's l2: 6445.02
[4



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 294470, number of used features: 7
[LightGBM] [Info] Start training from score 948.760698
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l2: 50340.9
[40]	valid_0's l2: 21340.7
[60]	valid_0's l2: 15430.4
[80]	valid_0's l2: 12916.7
[100]	valid_0's l2: 11424.8
[120]	valid_0's l2: 10475.6
[140]	valid_0's l2: 9699.29
[160]	valid_0's l2: 9115.71
[180]	valid_0's l2: 8609.6
[200]	valid_0's l2: 8245.04
[220]	valid_0's l2: 7901.13
[240]	valid_0's l2: 7536.33
[260]	valid_0's l2: 7270.21
[280]	valid_0's l2: 7059.83
[300]	valid_0's l2: 6839.24
[320]	valid_0's l2: 6613.76
[340]	valid_0's l2: 6478.04
[360]	valid_0's l2: 6302.81
[380]	valid_0's l2: 6159.3
[400]	valid_0's l2: 6030.66
[420]	valid_0's l2: 5893.22
[440]	valid_0's l2: 5774.1
[460]	valid_0's l2: 5676.31



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 257661, number of used features: 7
[LightGBM] [Info] Start training from score 948.088488
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l2: 50842.7
[40]	valid_0's l2: 22052.3
[60]	valid_0's l2: 16218.1
[80]	valid_0's l2: 13703.4
[100]	valid_0's l2: 12231.5
[120]	valid_0's l2: 11195.1
[140]	valid_0's l2: 10419.4
[160]	valid_0's l2: 9812.36
[180]	valid_0's l2: 9308.03
[200]	valid_0's l2: 8885.39
[220]	valid_0's l2: 8563.8
[240]	valid_0's l2: 8254.15
[260]	valid_0's l2: 7994.38
[280]	valid_0's l2: 7748.73
[300]	valid_0's l2: 7489.75
[320]	valid_0's l2: 7317.07
[340]	valid_0's l2: 7112.84
[360]	valid_0's l2: 6952.63
[380]	valid_0's l2: 6787.77
[400]	valid_0's l2: 6653.59
[420]	valid_0's l2: 6536.14
[440]	valid_0's l2: 6413.48
[460]	valid_0's l2: 6290.



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 220852, number of used features: 7
[LightGBM] [Info] Start training from score 948.476174
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l2: 50659.2
[40]	valid_0's l2: 22313.9
[60]	valid_0's l2: 16217.6
[80]	valid_0's l2: 13761.1
[100]	valid_0's l2: 12275.7
[120]	valid_0's l2: 11269.3
[140]	valid_0's l2: 10529.6
[160]	valid_0's l2: 9980.07
[180]	valid_0's l2: 9490.23
[200]	valid_0's l2: 9083.26
[220]	valid_0's l2: 8775.62
[240]	valid_0's l2: 8483.78
[260]	valid_0's l2: 8181.16
[280]	valid_0's l2: 7944.48
[300]	valid_0's l2: 7717.8
[320]	valid_0's l2: 7547.08
[340]	valid_0's l2: 7328.39
[360]	valid_0's l2: 7174.37
[380]	valid_0's l2: 7009.91
[400]	valid_0's l2: 6877.24
[420]	valid_0's l2: 6738.98
[440]	valid_0's l2: 6597.43
[460]	valid_0's l2: 6473.



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 184044, number of used features: 7
[LightGBM] [Info] Start training from score 949.583391
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l2: 50058.4
[40]	valid_0's l2: 21970.7
[60]	valid_0's l2: 15950.7
[80]	valid_0's l2: 13589.9
[100]	valid_0's l2: 12126.4
[120]	valid_0's l2: 11183
[140]	valid_0's l2: 10454.3
[160]	valid_0's l2: 9925.73
[180]	valid_0's l2: 9479.18
[200]	valid_0's l2: 9053.65
[220]	valid_0's l2: 8712.34
[240]	valid_0's l2: 8393.86
[260]	valid_0's l2: 8127.9
[280]	valid_0's l2: 7909.88
[300]	valid_0's l2: 7691.34
[320]	valid_0's l2: 7506.62
[340]	valid_0's l2: 7318.31
[360]	valid_0's l2: 7136.1
[380]	valid_0's l2: 6969.34
[400]	valid_0's l2: 6837.68
[420]	valid_0's l2: 6673.1
[440]	valid_0's l2: 6564.22
[460]	valid_0's l2: 6453.42
[



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 147235, number of used features: 7
[LightGBM] [Info] Start training from score 949.899094
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l2: 50386.3
[40]	valid_0's l2: 21558.5
[60]	valid_0's l2: 15717.3
[80]	valid_0's l2: 13333.1
[100]	valid_0's l2: 11830.9
[120]	valid_0's l2: 10834.1
[140]	valid_0's l2: 10108.7
[160]	valid_0's l2: 9529.36
[180]	valid_0's l2: 9089.77
[200]	valid_0's l2: 8717.43
[220]	valid_0's l2: 8387.24
[240]	valid_0's l2: 8092
[260]	valid_0's l2: 7826.7
[280]	valid_0's l2: 7620.33
[300]	valid_0's l2: 7452.49
[320]	valid_0's l2: 7272.43
[340]	valid_0's l2: 7106.15
[360]	valid_0's l2: 6940.91
[380]	valid_0's l2: 6790.92
[400]	valid_0's l2: 6655.72
[420]	valid_0's l2: 6485.47
[440]	valid_0's l2: 6373.62
[460]	valid_0's l2: 6272.82




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 110426, number of used features: 7
[LightGBM] [Info] Start training from score 950.061737
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l2: 50123.9
[40]	valid_0's l2: 21189.9
[60]	valid_0's l2: 15606.5
[80]	valid_0's l2: 13333.6
[100]	valid_0's l2: 11855.3
[120]	valid_0's l2: 10901.7
[140]	valid_0's l2: 10161.1
[160]	valid_0's l2: 9606.06
[180]	valid_0's l2: 9157.86
[200]	valid_0's l2: 8770.7
[220]	valid_0's l2: 8451.41
[240]	valid_0's l2: 8138.47
[260]	valid_0's l2: 7859.44
[280]	valid_0's l2: 7653.57
[300]	valid_0's l2: 7504.41
[320]	valid_0's l2: 7333.16
[340]	valid_0's l2: 7168.48
[360]	valid_0's l2: 6997.48
[380]	valid_0's l2: 6841.67
[400]	valid_0's l2: 6711.38
[420]	valid_0's l2: 6581.16
[440]	valid_0's l2: 6464.06
[460]	valid_0's l2: 6326.



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 73617, number of used features: 7
[LightGBM] [Info] Start training from score 949.219674
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l2: 50173.6
[40]	valid_0's l2: 21556.8
[60]	valid_0's l2: 15594.9
[80]	valid_0's l2: 13269
[100]	valid_0's l2: 11850
[120]	valid_0's l2: 10865.8
[140]	valid_0's l2: 10197.2
[160]	valid_0's l2: 9601.95
[180]	valid_0's l2: 9154.43
[200]	valid_0's l2: 8773.43
[220]	valid_0's l2: 8472.06
[240]	valid_0's l2: 8215.12
[260]	valid_0's l2: 7990.54
[280]	valid_0's l2: 7757.44
[300]	valid_0's l2: 7551.93
[320]	valid_0's l2: 7378
[340]	valid_0's l2: 7218.53
[360]	valid_0's l2: 7044.89
[380]	valid_0's l2: 6905.82
[400]	valid_0's l2: 6775
[420]	valid_0's l2: 6669.59
[440]	valid_0's l2: 6552.99
[460]	valid_0's l2: 6444.11
[480]	v



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 36808, number of used features: 7
[LightGBM] [Info] Start training from score 948.988519
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l2: 50263.2
[40]	valid_0's l2: 21584.1
[60]	valid_0's l2: 15944.2
[80]	valid_0's l2: 13660.5
[100]	valid_0's l2: 12358.6
[120]	valid_0's l2: 11466.8
[140]	valid_0's l2: 10777.6
[160]	valid_0's l2: 10222.7
[180]	valid_0's l2: 9788.53
[200]	valid_0's l2: 9421.33
[220]	valid_0's l2: 9168.84
[240]	valid_0's l2: 8888.02
[260]	valid_0's l2: 8649.28
[280]	valid_0's l2: 8489.04
[300]	valid_0's l2: 8322.69
[320]	valid_0's l2: 8137.02
[340]	valid_0's l2: 7988.82
[360]	valid_0's l2: 7828.6
[380]	valid_0's l2: 7698.27
[400]	valid_0's l2: 7590.99
[420]	valid_0's l2: 7489.65
[440]	valid_0's l2: 7382.05
[460]	valid_0's l2: 7268.1

In [27]:
# Print the length of each metric list; they must agree before being
# combined into one frame.
for label, values in [
    ("test_size length : ", test_size),
    ("MAE length :", MAE),
    ("MSE length :", MSE),
    ("RMSE length :", RMSE),
    ("NMAE length :", NMAE),
]:
    print(label, len(values))

test_size length :  9
MAE length : 9
MSE length : 9
RMSE length : 9
NMAE length : 9


In [28]:
# Collect the per-split metric lists into one table, one row per split.
data_dict = dict(
    test_size=test_size,
    MAE=MAE,
    MSE=MSE,
    RMSE=RMSE,
    NMAE=NMAE,
)
gbdt_df = pd.DataFrame(data_dict)
gbdt_df

Unnamed: 0,test_size,MAE,MSE,RMSE,NMAE
0,1,45.664662,6216.16675,78.842671,0.468694
1,2,45.637288,5494.134233,74.122427,0.462589
2,3,45.427778,6064.459692,77.874641,0.448246
3,4,45.912256,6275.052347,79.215228,0.460506
4,5,45.877144,6224.68251,78.896657,0.445977
5,6,46.073493,6066.369228,77.8869,0.457113
6,7,46.387438,6105.203351,78.135801,0.452099
7,8,46.537146,6231.259308,78.938326,0.449065
8,9,49.596564,7102.24768,84.274834,0.455944


In [21]:
# Last three rows of true_y; it holds whichever hold-out split was built most recently.
true_y.tail(3)

Unnamed: 0,index,true_y
331277,331277,702.703
331278,331278,578.676
331279,331279,850.685


In [22]:
# Last three rows of pred_y, the predictions paired with true_y by the "index" column.
pred_y.tail(3)

Unnamed: 0,index,pred_y
331277,331277,621.861411
331278,331278,636.752618
331279,331279,830.798025
