In [6]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor   # 앙상블(의사결정트리 확장판)
from sklearn.ensemble import GradientBoostingRegressor

import xgboost as xgb
import lightgbm as lgb

In [7]:
# Read the training table and the table to be predicted, then report both shapes.
train = pd.read_csv("../CSV/train_data.csv")
test = pd.read_csv("../CSV/pred_test.csv")

print(f"train shape : {train.shape}, test shape : {test.shape}")

train shape : (368088, 12), test shape : (15120, 12)


In [8]:
# Show the first three rows of the training frame as a quick check of its columns.
train.head(3)

Unnamed: 0,연월일,시간,일시,year,month,day,hour,weekday,구분,구분_int,공급량,기온(°C)
0,2013-01-01,1,2013-01-01 00:00:00,2013,1,1,0,1,A,0,2497.129,-8.3
1,2013-01-01,2,2013-01-01 01:00:00,2013,1,1,1,1,A,0,2363.265,-8.5
2,2013-01-01,3,2013-01-01 02:00:00,2013,1,1,2,1,A,0,2258.505,-8.4


### LinearRegression

In [5]:
# Baseline: fit a linear regression for every train/test ratio from 9:1 down to 1:9.
X = train[["year", "month", "day", "hour", "weekday", "기온(°C)", "구분_int"]]
y = train["공급량"]

for i in range(1, 10) :
    # i tenths of the rows are held out; the fixed seed keeps each split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = i / 10, random_state = 77
    )

    lr_model = LinearRegression().fit(X_train, y_train)
    pred = lr_model.predict(X_test)

    # R^2 on both partitions, followed by the held-out MSE
    print(f"학습용 : {10 - i}, 테스트용 : {i}")
    print(f"학습용 데이터 결정계수: {lr_model.score(X_train, y_train):.3f}")
    print(f"테스트 데이터 결정계수: {lr_model.score(X_test, y_test):.3f}")
    print("MSE :", mean_squared_error(y_test, pred))
    print()
    
# model.feature_importances_

학습용 : 9, 테스트용 : 1
학습용 데이터 결정계수: 0.322
테스트 데이터 결정계수: 0.321
MSE : 590491.4823538994

학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.322
테스트 데이터 결정계수: 0.321
MSE : 584549.6191155387

학습용 : 7, 테스트용 : 3
학습용 데이터 결정계수: 0.322
테스트 데이터 결정계수: 0.323
MSE : 582575.4475852917

학습용 : 6, 테스트용 : 4
학습용 데이터 결정계수: 0.322
테스트 데이터 결정계수: 0.322
MSE : 582469.7143700205

학습용 : 5, 테스트용 : 5
학습용 데이터 결정계수: 0.323
테스트 데이터 결정계수: 0.322
MSE : 580411.7422423407

학습용 : 4, 테스트용 : 6
학습용 데이터 결정계수: 0.323
테스트 데이터 결정계수: 0.322
MSE : 580421.0644667257

학습용 : 3, 테스트용 : 7
학습용 데이터 결정계수: 0.321
테스트 데이터 결정계수: 0.322
MSE : 580907.4112167998

학습용 : 2, 테스트용 : 8
학습용 데이터 결정계수: 0.321
테스트 데이터 결정계수: 0.322
MSE : 582235.8827764511

학습용 : 1, 테스트용 : 9
학습용 데이터 결정계수: 0.324
테스트 데이터 결정계수: 0.322
MSE : 582621.8024223659



### xgboost

In [5]:
# XGBoost regressor, scored over the same nine train/test ratios (9:1 ... 1:9).
X = train[["year", "month", "day", "hour", "weekday", "기온(°C)", "구분_int"]]
y = train["공급량"]

# The settings do not depend on the split, so they are built once outside the loop.
xgb_params = {
    # Squared-error regression; "reg:linear" is the deprecated alias of this objective.
    "objective" : "reg:squarederror",
    "colsample_bytree" : 0.3,
    "learning_rate" : 0.1,
    "max_depth" : 3,
    # L1 regularisation, spelled with the scikit-learn wrapper's parameter name
    # (it was previously passed through as the native alias `alpha`).
    "reg_alpha" : 0.1,
    "n_estimators" : 1000,
}

for i in range(1, 10, 1) :
    # Hold out i tenths of the rows; the fixed seed keeps every split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                       test_size = i / 10,
                                                       random_state = 77)

    xgb_model = xgb.XGBRegressor(**xgb_params)
    xgb_model.fit(X_train, y_train)
    pred = xgb_model.predict(X_test)

    # R^2 on both partitions, then the held-out MSE
    print("학습용 : {}, 테스트용 : {}".format(10 - i, i))
    print("학습용 데이터 결정계수: {:.3f}".format(xgb_model.score(X_train, y_train)))
    print("테스트 데이터 결정계수: {:.3f}".format(xgb_model.score(X_test, y_test)))
    print("MSE :", mean_squared_error(y_test, pred))
    print()

학습용 : 9, 테스트용 : 1
학습용 데이터 결정계수: 0.978
테스트 데이터 결정계수: 0.978
MSE : 19429.419025144703

학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.978
테스트 데이터 결정계수: 0.978
MSE : 18635.423363707363

학습용 : 7, 테스트용 : 3
학습용 데이터 결정계수: 0.979
테스트 데이터 결정계수: 0.978
MSE : 19232.720424837866

학습용 : 6, 테스트용 : 4
학습용 데이터 결정계수: 0.979
테스트 데이터 결정계수: 0.978
MSE : 19084.835802899775

학습용 : 5, 테스트용 : 5
학습용 데이터 결정계수: 0.978
테스트 데이터 결정계수: 0.977
MSE : 19622.689955968704

학습용 : 4, 테스트용 : 6
학습용 데이터 결정계수: 0.979
테스트 데이터 결정계수: 0.978
MSE : 18996.24741908079

학습용 : 3, 테스트용 : 7
학습용 데이터 결정계수: 0.979
테스트 데이터 결정계수: 0.978
MSE : 18942.06772576221

학습용 : 2, 테스트용 : 8
학습용 데이터 결정계수: 0.979
테스트 데이터 결정계수: 0.978
MSE : 19004.762596107706

학습용 : 1, 테스트용 : 9
학습용 데이터 결정계수: 0.979
테스트 데이터 결정계수: 0.977
MSE : 19918.98344049013



### lightgbm

In [10]:
# Tuned LightGBM regressor, scored over the nine train/test ratios (9:1 ... 1:9).
X = train[["year", "month", "day", "hour", "weekday", "기온(°C)", "구분_int"]]
y = train["공급량"]

# The hyperparameters are the same for every split, so they are defined once.
# `is_unbalance` was removed: it is a classification-only option and has no
# meaning for a regressor.
# NOTE(review): LightGBM documents that `subsample` only takes effect when
# `subsample_freq` is non-zero; with the default it is presumably inert — confirm.
hyperparameters = {'boosting_type': 'gbdt',
                   'colsample_bytree': 0.7250136792694301,
                   'learning_rate': 0.013227664889528229,
                   'min_child_samples': 20,
                   'num_leaves': 56,
                   'reg_alpha': 0.7543896477745794,
                   'reg_lambda': 0.07152751159655985,
                   'subsample_for_bin': 240000,
                   'subsample': 0.5233384321711397,
                   'n_estimators': 2000}

for i in range(1, 10, 1) :
    # Hold out i tenths of the rows; the fixed seed keeps every split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                       test_size = i / 10,
                                                       random_state = 77)

    lgb_model = lgb.LGBMRegressor(**hyperparameters)
    lgb_model.fit(X_train, y_train)
    pred = lgb_model.predict(X_test)

    # Each metric is evaluated once and reused below.
    mse = mean_squared_error(y_test, pred)
    # NMAE = mean(|true - pred| / true). It is computed directly from the
    # aligned arrays because the `nmae` helper is only defined in a later cell,
    # so calling it here fails when the notebook is run top to bottom.
    y_true = y_test.to_numpy()
    nmae_score = np.mean(np.abs(y_true - pred) / y_true)

    # R^2 on both partitions, then the error metrics on the held-out part
    print("학습용 : {}, 테스트용 : {}".format(10 - i, i))
    print("학습용 데이터 결정계수: {:.3f}".format(lgb_model.score(X_train, y_train)))
    print("테스트 데이터 결정계수: {:.3f}".format(lgb_model.score(X_test, y_test)))
    print("MAE : {:.6f}".format(mean_absolute_error(y_test, pred)))
    print("MSE : {:.6f}".format(mse))
    print("RMSE : {:.6f}".format(np.sqrt(mse)))
    print("NMAE : {:.6f}".format(nmae_score))
    print()

학습용 : 9, 테스트용 : 1
학습용 데이터 결정계수: 0.994
테스트 데이터 결정계수: 0.993
MAE : 44.807329
MSE : 6206.189685
RMSE : 78.779373
NMAE : 0.462553

학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.994
테스트 데이터 결정계수: 0.994
MAE : 44.719306
MSE : 5410.368952
RMSE : 73.555210
NMAE : 0.459494

학습용 : 7, 테스트용 : 3
학습용 데이터 결정계수: 0.994
테스트 데이터 결정계수: 0.993
MAE : 45.176230
MSE : 6121.545202
RMSE : 78.240304
NMAE : 0.468146

학습용 : 6, 테스트용 : 4
학습용 데이터 결정계수: 0.994
테스트 데이터 결정계수: 0.993
MAE : 45.204995
MSE : 6284.886534
RMSE : 79.277276
NMAE : 0.462256

학습용 : 5, 테스트용 : 5
학습용 데이터 결정계수: 0.995
테스트 데이터 결정계수: 0.993
MAE : 45.271973
MSE : 6190.907656
RMSE : 78.682321
NMAE : 0.465022

학습용 : 4, 테스트용 : 6
학습용 데이터 결정계수: 0.994
테스트 데이터 결정계수: 0.993
MAE : 45.514722
MSE : 6056.301968
RMSE : 77.822246
NMAE : 0.464575

학습용 : 3, 테스트용 : 7
학습용 데이터 결정계수: 0.994
테스트 데이터 결정계수: 0.993
MAE : 46.246795
MSE : 6157.411530
RMSE : 78.469176
NMAE : 0.465726

학습용 : 2, 테스트용 : 8
학습용 데이터 결정계수: 0.994
테스트 데이터 결정계수: 0.993
MAE : 47.037478
MSE : 6391.914231
RMSE : 79.949448
NMAE : 0.

### 다양한 모델 비교
* LinearRegression
* DecisionTreeRegressor
* RandomForestRegressor
* GradientBoostingRegressor
* xgboost
* lightgbm

In [9]:
def nmae(true_df, pred_df):
    """Return the mean of |true - pred| / true over matching rows.

    Both frames hold a row identifier in their first column and the value in
    their second column. Rows of ``pred_df`` whose identifier does not occur in
    ``true_df`` are dropped, then both frames are ordered by identifier and the
    value columns are compared position by position.
    """
    # Keep only the predictions whose identifier also appears among the true rows.
    matched = pred_df.loc[pred_df.iloc[:, 0].isin(true_df.iloc[:, 0])]

    # Order both frames by identifier so the value arrays line up.
    forecast = matched.sort_values(by=matched.columns[0]).iloc[:, 1].to_numpy()
    actual = true_df.sort_values(by=true_df.columns[0]).iloc[:, 1].to_numpy()

    relative_error = np.abs(actual - forecast) / actual
    return np.mean(relative_error)

In [28]:
# Compare six regressors over nine train/test ratios (9:1 ... 1:9) and collect
# every metric in the module-level lists that the following cells turn into a table.
start_time = time.time()

X = train[["year", "month", "day", "hour", "weekday", "구분_int", "기온(°C)"]]
y = train["공급량"]

# Display names, in the same order as `models`.
model_list = ["LinearRegression",
              "DecisionTreeRegressor",
              "RandomForestRegressor",
              "GradientBoostingRegressor",
              "xgboost",
              "lightgbm"]

# The decision tree is seeded like the other scikit-learn estimators so that
# repeated runs of this cell give the same numbers.
models = [LinearRegression(),
         DecisionTreeRegressor(random_state = 37),
         RandomForestRegressor(n_jobs = -1, random_state = 37),
         GradientBoostingRegressor(random_state = 37),
         xgb.XGBRegressor(),
         lgb.LGBMRegressor()]

test_size = []
train_score = []
test_score = []
MAE = []
MSE = []
RMSE = []
NMAE = []
model_name = []

for idx, model in enumerate(models) :
    name = model_list[idx]
    for i in range(1, 10, 1) :
        # Hold out i tenths of the rows; the fixed seed gives every model the same splits.
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                           test_size = i / 10,
                                                           random_state = 77)
        print("model :", name)
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        # nmae expects two-column frames: a 0..n-1 row id followed by the value.
        true_y = pd.DataFrame({"true_y" : y_test}).reset_index(drop = True).reset_index()
        pred_y = pd.DataFrame({"pred_y" : pred}).reset_index()

        # Evaluate each metric once; scoring (especially on the training set)
        # is expensive for the ensemble models and was previously done twice.
        r2_train = model.score(X_train, y_train)
        r2_test = model.score(X_test, y_test)
        mae = mean_absolute_error(y_test, pred)
        mse = mean_squared_error(y_test, pred)
        rmse = np.sqrt(mse)
        nmae_score = nmae(true_y, pred_y)

        # R^2 on both partitions
        print("학습용 : {}, 테스트용 : {}".format(10 - i, i))
        print("학습용 데이터 결정계수: {:.3f}".format(r2_train))
        print("테스트 데이터 결정계수: {:.3f}".format(r2_test))

        # Error metrics on the held-out part
        print("MAE : {:.6f}".format(mae))
        print("MSE : {:.6f}".format(mse))
        print("RMSE : {:.6f}".format(rmse))
        print("NMAE : {:.6f}".format(nmae_score))
        print()

        test_size.append(i)
        train_score.append(r2_train)
        test_score.append(r2_test)
        MAE.append(mae)
        MSE.append(mse)
        RMSE.append(rmse)
        NMAE.append(nmae_score)
        model_name.append(name)

print("실행 시간 : {:.3f}".format(time.time() - start_time))

model : LinearRegression
학습용 : 9, 테스트용 : 1
학습용 데이터 결정계수: 0.322
테스트 데이터 결정계수: 0.321
MAE : 582.7460159784652
MSE : 590491.4823538994
RMSE :  768.4344359500682
NMAE :  4.306466357254007

model : LinearRegression
학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.322
테스트 데이터 결정계수: 0.321
MAE : 579.5269534039344
MSE : 584549.6191155387
RMSE :  764.5584471546558
NMAE :  4.252175862019568

model : LinearRegression
학습용 : 7, 테스트용 : 3
학습용 데이터 결정계수: 0.322
테스트 데이터 결정계수: 0.323
MAE : 578.8488702444839
MSE : 582575.4475852916
RMSE :  763.2663018798168
NMAE :  4.202850858778187

model : LinearRegression
학습용 : 6, 테스트용 : 4
학습용 데이터 결정계수: 0.322
테스트 데이터 결정계수: 0.322
MAE : 579.2466688796138
MSE : 582469.7143700205
RMSE :  763.1970350899043
NMAE :  4.2107011724089425

model : LinearRegression
학습용 : 5, 테스트용 : 5
학습용 데이터 결정계수: 0.323
테스트 데이터 결정계수: 0.322
MAE : 578.600880127177
MSE : 580411.7422423407
RMSE :  761.8475846534795
NMAE :  4.2241205042263825

model : LinearRegression
학습용 : 4, 테스트용 : 6
학습용 데이터 결정계수: 0.323
테스트 데이터 결정계수: 0.

model : xgboost
학습용 : 1, 테스트용 : 9
학습용 데이터 결정계수: 0.993
테스트 데이터 결정계수: 0.989
MAE : 60.1376596188379
MSE : 9495.712006448934
RMSE :  97.44594402256531
NMAE :  0.4974624650949261

model : lightgbm
학습용 : 9, 테스트용 : 1
학습용 데이터 결정계수: 0.987
테스트 데이터 결정계수: 0.986
MAE : 67.73681829542026
MSE : 12141.193600972847
RMSE :  110.18708454702323
NMAE :  0.5489868400593946

model : lightgbm
학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.987
테스트 데이터 결정계수: 0.987
MAE : 67.46010176734315
MSE : 11436.590272051339
RMSE :  106.94199489466867
NMAE :  0.5415505280333585

model : lightgbm
학습용 : 7, 테스트용 : 3
학습용 데이터 결정계수: 0.987
테스트 데이터 결정계수: 0.986
MAE : 67.40910394997995
MSE : 12001.113129950983
RMSE :  109.54959210307898
NMAE :  0.5515241972877374

model : lightgbm
학습용 : 6, 테스트용 : 4
학습용 데이터 결정계수: 0.987
테스트 데이터 결정계수: 0.986
MAE : 67.71819700188726
MSE : 12320.997661853695
RMSE :  110.99998946780893
NMAE :  0.5518316101195799

model : lightgbm
학습용 : 5, 테스트용 : 5
학습용 데이터 결정계수: 0.987
테스트 데이터 결정계수: 0.986
MAE : 67.8042131965938
MSE : 12126

In [29]:
# Every result list should hold one entry per (model, split) run; print the sizes to confirm.
for label, values in [("test_size length : ", test_size),
                      ("train_score length :", train_score),
                      ("test_score length :", test_score),
                      ("MAE length :", MAE),
                      ("MSE length :", MSE),
                      ("RMSE length :", RMSE),
                      ("NMAE length :", NMAE),
                      ("model_name length :", model_name)] :
    print(label, len(values))

test_size length :  54
train_score length : 54
test_score length : 54
MAE length : 54
MSE length : 54
RMSE length : 54
NMAE length : 54
model_name length : 54


In [30]:
# Gather the per-run result lists into one table, one row per (model, split) run.
metric_columns = ["model", "test_size", "train_score", "test_score", "MAE", "MSE", "RMSE", "NMAE"]
metric_values = [model_name, test_size, train_score, test_score, MAE, MSE, RMSE, NMAE]
data_dict = dict(zip(metric_columns, metric_values))
df = pd.DataFrame(data_dict)

In [31]:
# Express both R^2 columns as percentages.
# The frame is rebuilt from data_dict first, so re-running this cell does not
# multiply values that were already scaled by 100 a second time.
df = pd.DataFrame(data_dict).assign(
    train_score = lambda frame: frame["train_score"] * 100,
    test_score = lambda frame: frame["test_score"] * 100,
)
df

Unnamed: 0,model,test_size,train_score,test_score,MAE,MSE,RMSE,NMAE
0,LinearRegression,1,32.231723,32.077015,582.746016,590491.482354,768.434436,4.306466
1,LinearRegression,2,32.23269,32.148115,579.526953,584549.619116,764.558447,4.252176
2,LinearRegression,3,32.185351,32.286701,578.84887,582575.447585,763.266302,4.202851
3,LinearRegression,4,32.239283,32.179261,579.246669,582469.71437,763.197035,4.210701
4,LinearRegression,5,32.267667,32.162134,578.60088,580411.742242,761.847585,4.224121
5,LinearRegression,6,32.252351,32.189216,578.910066,580421.064467,761.853703,4.192962
6,LinearRegression,7,32.123475,32.249282,579.409683,580907.411217,762.172822,4.189123
7,LinearRegression,8,32.141789,32.226587,580.008349,582235.882776,763.043828,4.17186
8,LinearRegression,9,32.385276,32.184565,580.335669,582621.802422,763.296667,4.149284
9,DecisionTreeRegressor,1,100.0,98.319467,58.284638,14609.784299,120.870941,0.074891


In [9]:
# Summary statistics of the numeric columns of df.
# NOTE(review): the saved output under this cell lists columns such as
# LinearRegression_score that the df assembled in this notebook does not have,
# so it appears to be left over from an earlier version of df — re-run to refresh.
df.describe()

Unnamed: 0,test_size,LinearRegression_score,DecisionTreeRegressor_score,RandomForestRegressor_score,GradientBoostingRegressor_score,LinearRegression_MSE,DecisionTreeRegressor_MSE,RandomForestRegressor_MSE,GradientBoostingRegressor_MSE
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,5.0,0.3179,-0.0137,0.11593,0.376551,586397.063487,871199.0,759936.854768,535972.885018
std,2.738613,0.000718,0.149621,0.037465,0.001061,3229.825906,126802.3,29905.16383,2420.626911
min,1.0,0.316583,-0.250021,0.070965,0.374196,583767.273155,733506.0,706129.411126,533493.526987
25%,3.0,0.317606,-0.122826,0.083518,0.376386,584135.250889,761420.8,740388.189369,534615.396566
50%,5.0,0.317983,0.017262,0.10694,0.376483,585879.844311,840817.5,766993.393836,535276.241963
75%,7.0,0.31853,0.114993,0.140591,0.377345,586188.709652,962731.8,784129.648921,537473.987917
max,9.0,0.318728,0.156264,0.178086,0.377843,594131.699276,1073929.0,795200.338134,541157.497874


### 최종 모델 선택

In [12]:
# Final model: a random forest trained and evaluated on an 8:2 train/test split.
X = train[["year", "month", "day", "hour", "weekday", "구분_int", "기온(°C)"]]
y = train["공급량"]

i = 2   # tenths of the rows held out for testing

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = i / 10, random_state = 77
)

model = RandomForestRegressor(n_jobs = -1, random_state = 37).fit(X_train, y_train)
pred = model.predict(X_test)

# nmae expects two-column frames: a 0..n-1 row id followed by the value.
true_y = pd.DataFrame({"true_y" : y_test}).reset_index(drop = True).reset_index()
pred_y = pd.DataFrame({"pred_y" : pred}).reset_index()

# R^2 on both partitions
print(f"학습용 : {10 - i}, 테스트용 : {i}")
print(f"학습용 데이터 결정계수: {model.score(X_train, y_train):.3f}")
print(f"테스트 데이터 결정계수: {model.score(X_test, y_test):.3f}")

# Error metrics on the held-out part
mse = mean_squared_error(y_test, pred)
print(f"MAE : {mean_absolute_error(y_test, pred):.6f}")
print(f"MSE : {mse:.6f}")
print(f"RMSE : {np.sqrt(mse):.6f}")
print(f"NMAE : {nmae(true_y, pred_y):.6f}")
    
# model.feature_importances_

학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.999
테스트 데이터 결정계수: 0.991
MAE : 44.61232698225977
MSE : 7742.093434334801
RMSE :  87.9891665737027
NMAE :  0.07338350169713619


In [13]:
# Load the sample submission file; its 공급량 column is overwritten with predictions in a later cell.
submission = pd.read_csv('../CSV/가스공급량 수요예측 모델개발 data/sample_submission.csv')

In [14]:
# Preview the first three rows of the submission frame as loaded.
submission.head(3)

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0


In [15]:
# Predict supply for the rows to be submitted, using the same seven feature
# columns (in the same order) that the final model was fitted on.
feature_cols = ["year", "month", "day", "hour", "weekday", "구분_int", "기온(°C)"]
test_x = test[feature_cols]
pred = model.predict(test_x)
submission["공급량"] = pred

In [16]:
# Display the array of predicted values.
pred

array([1860.31467, 1747.67289, 1539.22908, ...,  255.04951,  202.48655,
        196.61685])

In [17]:
# Check the first rows of the submission after the predictions were written into 공급량.
submission.head(3)

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,1860.31467
1,2019-01-01 02 A,1747.67289
2,2019-01-01 03 A,1539.22908


In [18]:
# Check the last rows of the submission as well.
submission.tail(3)

Unnamed: 0,일자|시간|구분,공급량
15117,2019-03-31 22 H,255.04951
15118,2019-03-31 23 H,202.48655
15119,2019-03-31 24 H,196.61685


In [19]:
# Write the submission to disk; index = False leaves the DataFrame index out of the file.
submission.to_csv("../CSV/Submission/05_7_columns_RandomForest_model.csv", index = False)

In [24]:
# LightGBM native API with the dart booster, scored over the nine train/test
# ratios (9:1 ... 1:9). Metrics are collected in the module-level lists below.
start_time = time.time()

X = train[["year", "month", "day", "hour", "weekday", "구분_int", "기온(°C)"]]
y = train["공급량"]

test_size = []
train_score = []   # not filled by this cell (a Booster has no .score method)
test_score = []    # not filled by this cell
MAE = []
MSE = []
RMSE = []
NMAE = []
model_name = []    # not filled by this cell

# Booster settings are identical for every split, so they are built once.
params = {
    "objective" : "regression",
    "boosting_type" : "dart",
    "metric" : "mse",
    "seed" : 42,
}

for i in range(1, 10, 1) :
    # Hold out i tenths of the rows; the fixed seed keeps every split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                       test_size = i / 10,
                                                       random_state = 77)
    d_train = lgb.Dataset(X_train, y_train)
    d_val = lgb.Dataset(X_test, y_test)

    # Progress logging goes through a callback: the `verbose_eval` and
    # `early_stopping_rounds` keyword arguments are no longer accepted by
    # lgb.train in current LightGBM releases.
    # No early-stopping callback is registered because LightGBM does not
    # support early stopping in dart mode; all 500 rounds run, as in the
    # recorded output of this cell.
    model = lgb.train(params,
                      d_train,
                      num_boost_round = 500,
                      valid_sets = [d_val],
                      callbacks = [lgb.log_evaluation(period = 20)])
    pred = model.predict(X_test)

    # nmae expects two-column frames: a 0..n-1 row id followed by the value.
    true_y = pd.DataFrame({"true_y" : y_test}).reset_index(drop = True).reset_index()
    pred_y = pd.DataFrame({"pred_y" : pred}).reset_index()

    # Evaluate each metric once and reuse it for printing and collecting.
    mae = mean_absolute_error(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    rmse = np.sqrt(mse)
    nmae_score = nmae(true_y, pred_y)

    print("학습용 : {}, 테스트용 : {}".format(10 - i, i))

    # Error metrics on the held-out part
    print("MAE : {:.6f}".format(mae))
    print("MSE : {:.6f}".format(mse))
    print("RMSE : {:.6f}".format(rmse))
    print("NMAE : {:.6f}".format(nmae_score))
    print()

    test_size.append(i)
    MAE.append(mae)
    MSE.append(mse)
    RMSE.append(rmse)
    NMAE.append(nmae_score)

print("실행 시간 : {:.3f}".format(time.time() - start_time))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 331279, number of used features: 7
[LightGBM] [Info] Start training from score 948.254497
[20]	valid_0's l2: 111535




[40]	valid_0's l2: 91758.1
[60]	valid_0's l2: 52222
[80]	valid_0's l2: 59121.9
[100]	valid_0's l2: 41333
[120]	valid_0's l2: 52561.7
[140]	valid_0's l2: 50155.5
[160]	valid_0's l2: 44382.5
[180]	valid_0's l2: 30263.4
[200]	valid_0's l2: 34965
[220]	valid_0's l2: 28459.1
[240]	valid_0's l2: 23603.6
[260]	valid_0's l2: 21449.4
[280]	valid_0's l2: 19426.3
[300]	valid_0's l2: 19139.3
[320]	valid_0's l2: 16830.5
[340]	valid_0's l2: 18448.8
[360]	valid_0's l2: 20333.2
[380]	valid_0's l2: 16544.6
[400]	valid_0's l2: 17638.8
[420]	valid_0's l2: 16178.1
[440]	valid_0's l2: 17244.3
[460]	valid_0's l2: 16376.6
[480]	valid_0's l2: 13650.3
[500]	valid_0's l2: 13596.3
학습용 : 9, 테스트용 : 1
MAE : 70.322187
MSE : 13596.295548
RMSE : 116.603154
NMAE : 0.541893

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 294470, number of used features: 7
[



[20]	valid_0's l2: 109752
[40]	valid_0's l2: 90513.2
[60]	valid_0's l2: 51356.3
[80]	valid_0's l2: 58151.8
[100]	valid_0's l2: 40193.1
[120]	valid_0's l2: 51569.8
[140]	valid_0's l2: 49179
[160]	valid_0's l2: 43619
[180]	valid_0's l2: 29569.1
[200]	valid_0's l2: 34004.5
[220]	valid_0's l2: 27728.9
[240]	valid_0's l2: 22906.8
[260]	valid_0's l2: 20652.5
[280]	valid_0's l2: 18739.9
[300]	valid_0's l2: 18400
[320]	valid_0's l2: 16033
[340]	valid_0's l2: 17667.5
[360]	valid_0's l2: 19511.7
[380]	valid_0's l2: 15825.8
[400]	valid_0's l2: 16852.7
[420]	valid_0's l2: 15393.6
[440]	valid_0's l2: 16490.2
[460]	valid_0's l2: 15710.6
[480]	valid_0's l2: 12950.6
[500]	valid_0's l2: 12934.1
학습용 : 8, 테스트용 : 2
MAE : 70.427141
MSE : 12934.106101
RMSE : 113.728212
NMAE : 0.543166

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 257661, numb



[20]	valid_0's l2: 109862
[40]	valid_0's l2: 91056
[60]	valid_0's l2: 51887.5
[80]	valid_0's l2: 58460.8
[100]	valid_0's l2: 40819.8
[120]	valid_0's l2: 52118.3
[140]	valid_0's l2: 49900.5
[160]	valid_0's l2: 44074.7
[180]	valid_0's l2: 30001.3
[200]	valid_0's l2: 34578
[220]	valid_0's l2: 28316.9
[240]	valid_0's l2: 23602.6
[260]	valid_0's l2: 21407.6
[280]	valid_0's l2: 19234
[300]	valid_0's l2: 18931.8
[320]	valid_0's l2: 16645.4
[340]	valid_0's l2: 18253.8
[360]	valid_0's l2: 20213.5
[380]	valid_0's l2: 16388.8
[400]	valid_0's l2: 17385.9
[420]	valid_0's l2: 15932
[440]	valid_0's l2: 17082.8
[460]	valid_0's l2: 16255.8
[480]	valid_0's l2: 13514.1
[500]	valid_0's l2: 13501.2
학습용 : 7, 테스트용 : 3
MAE : 70.598198
MSE : 13501.172650
RMSE : 116.194547
NMAE : 0.555900

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 220852, numb



[20]	valid_0's l2: 109854
[40]	valid_0's l2: 91053.8
[60]	valid_0's l2: 51783.2
[80]	valid_0's l2: 58861.9
[100]	valid_0's l2: 40775.2
[120]	valid_0's l2: 51984.4
[140]	valid_0's l2: 49733.9
[160]	valid_0's l2: 44075.3
[180]	valid_0's l2: 30034.7
[200]	valid_0's l2: 34736.3
[220]	valid_0's l2: 28476.5
[240]	valid_0's l2: 23681.3
[260]	valid_0's l2: 21622.7
[280]	valid_0's l2: 19536.2
[300]	valid_0's l2: 19209.3
[320]	valid_0's l2: 16880.6
[340]	valid_0's l2: 18510.1
[360]	valid_0's l2: 20408.8
[380]	valid_0's l2: 16639.2
[400]	valid_0's l2: 17663.8
[420]	valid_0's l2: 16188.4
[440]	valid_0's l2: 17287.8
[460]	valid_0's l2: 16443.3
[480]	valid_0's l2: 13718.6
[500]	valid_0's l2: 13741.3
학습용 : 6, 테스트용 : 4
MAE : 70.695862
MSE : 13741.292580
RMSE : 117.223260
NMAE : 0.551674

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 1840



[20]	valid_0's l2: 108780
[40]	valid_0's l2: 90148.9
[60]	valid_0's l2: 51277
[80]	valid_0's l2: 58236.2
[100]	valid_0's l2: 40459.7
[120]	valid_0's l2: 51550.2
[140]	valid_0's l2: 49401.2
[160]	valid_0's l2: 43799.3
[180]	valid_0's l2: 29829.1
[200]	valid_0's l2: 34417.7
[220]	valid_0's l2: 28223.5
[240]	valid_0's l2: 23345.7
[260]	valid_0's l2: 21183.7
[280]	valid_0's l2: 19236.4
[300]	valid_0's l2: 18918.6
[320]	valid_0's l2: 16621.1
[340]	valid_0's l2: 18184.4
[360]	valid_0's l2: 20071.9
[380]	valid_0's l2: 16352.1
[400]	valid_0's l2: 17397.8
[420]	valid_0's l2: 16024
[440]	valid_0's l2: 17067.1
[460]	valid_0's l2: 16223.4
[480]	valid_0's l2: 13536.1
[500]	valid_0's l2: 13495.6
학습용 : 5, 테스트용 : 5
MAE : 70.565591
MSE : 13495.633508
RMSE : 116.170708
NMAE : 0.555999

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 147235, 



[20]	valid_0's l2: 109135
[40]	valid_0's l2: 90499.2
[60]	valid_0's l2: 51474.1
[80]	valid_0's l2: 58399.6
[100]	valid_0's l2: 40350.5
[120]	valid_0's l2: 51686.9
[140]	valid_0's l2: 49405.7
[160]	valid_0's l2: 43650.3
[180]	valid_0's l2: 29862.4
[200]	valid_0's l2: 34461.1
[220]	valid_0's l2: 28120.9
[240]	valid_0's l2: 23235.9
[260]	valid_0's l2: 21061.5
[280]	valid_0's l2: 19159.1
[300]	valid_0's l2: 18806.6
[320]	valid_0's l2: 16639.5
[340]	valid_0's l2: 18176.4
[360]	valid_0's l2: 20069.8
[380]	valid_0's l2: 16296.3
[400]	valid_0's l2: 17280.2
[420]	valid_0's l2: 15922.4
[440]	valid_0's l2: 16964.7
[460]	valid_0's l2: 16151.8
[480]	valid_0's l2: 13422.2
[500]	valid_0's l2: 13421.7
학습용 : 4, 테스트용 : 6
MAE : 70.868102
MSE : 13421.670981
RMSE : 115.851936
NMAE : 0.553481

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 1104



[20]	valid_0's l2: 109289
[40]	valid_0's l2: 90765.2
[60]	valid_0's l2: 51376.1
[80]	valid_0's l2: 58296.7
[100]	valid_0's l2: 40444.1
[120]	valid_0's l2: 51834.1
[140]	valid_0's l2: 49580.2
[160]	valid_0's l2: 43782.1
[180]	valid_0's l2: 29910.2
[200]	valid_0's l2: 34509.3
[220]	valid_0's l2: 28009.2
[240]	valid_0's l2: 23097.9
[260]	valid_0's l2: 20926.5
[280]	valid_0's l2: 19011.3
[300]	valid_0's l2: 18752.1
[320]	valid_0's l2: 16400.9
[340]	valid_0's l2: 17980.3
[360]	valid_0's l2: 19861
[380]	valid_0's l2: 16081.7
[400]	valid_0's l2: 17081.7
[420]	valid_0's l2: 15658.1
[440]	valid_0's l2: 16809.3
[460]	valid_0's l2: 15962.9
[480]	valid_0's l2: 13270.3
[500]	valid_0's l2: 13311
학습용 : 3, 테스트용 : 7
MAE : 70.950529
MSE : 13311.033254
RMSE : 115.373451
NMAE : 0.555605

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 73617, n



[20]	valid_0's l2: 109612
[40]	valid_0's l2: 90526.5
[60]	valid_0's l2: 51649
[80]	valid_0's l2: 58221
[100]	valid_0's l2: 40177.9
[120]	valid_0's l2: 51592.1
[140]	valid_0's l2: 49180.2
[160]	valid_0's l2: 43531.9
[180]	valid_0's l2: 29631.4
[200]	valid_0's l2: 34319.4
[220]	valid_0's l2: 27929.6
[240]	valid_0's l2: 23104
[260]	valid_0's l2: 20840.5
[280]	valid_0's l2: 19061.9
[300]	valid_0's l2: 18725.2
[320]	valid_0's l2: 16426.3
[340]	valid_0's l2: 17964.5
[360]	valid_0's l2: 19852.6
[380]	valid_0's l2: 16098
[400]	valid_0's l2: 17102.5
[420]	valid_0's l2: 15686.9
[440]	valid_0's l2: 16830.3
[460]	valid_0's l2: 16125.8
[480]	valid_0's l2: 13422.2
[500]	valid_0's l2: 13388.9
학습용 : 2, 테스트용 : 8
MAE : 71.228141
MSE : 13388.885116
RMSE : 115.710350
NMAE : 0.549708

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 36808, numbe



[20]	valid_0's l2: 109635
[40]	valid_0's l2: 90267.5
[60]	valid_0's l2: 51390.6
[80]	valid_0's l2: 58231.1
[100]	valid_0's l2: 39893.7
[120]	valid_0's l2: 51284.4
[140]	valid_0's l2: 49032.7
[160]	valid_0's l2: 43512
[180]	valid_0's l2: 29625.8
[200]	valid_0's l2: 34048.5
[220]	valid_0's l2: 27693.8
[240]	valid_0's l2: 22950.4
[260]	valid_0's l2: 20968.2
[280]	valid_0's l2: 18936.4
[300]	valid_0's l2: 18667.9
[320]	valid_0's l2: 16428.4
[340]	valid_0's l2: 18021.3
[360]	valid_0's l2: 19886.9
[380]	valid_0's l2: 16166.6
[400]	valid_0's l2: 17179.7
[420]	valid_0's l2: 15806.8
[440]	valid_0's l2: 16916.2
[460]	valid_0's l2: 16113.8
[480]	valid_0's l2: 13432.1
[500]	valid_0's l2: 13444.6
학습용 : 1, 테스트용 : 9
MAE : 71.295117
MSE : 13444.592480
RMSE : 115.950819
NMAE : 0.549721

실행 시간 : 741.032


In [20]:
# Each collected list should hold one entry per split; print the sizes to confirm.
for label, values in [("test_size length : ", test_size),
                      ("MAE length :", MAE),
                      ("MSE length :", MSE),
                      ("RMSE length :", RMSE),
                      ("NMAE length :", NMAE)] :
    print(label, len(values))

test_size length :  9
MAE length : 9
MSE length : 9
RMSE length : 9
NMAE length : 9


In [21]:
# Tabulate the per-split metrics collected by the LightGBM sweep.
# NOTE(review): the frame is named gbdt_df although the training cell in this
# notebook sets boosting_type to "dart" — confirm which run these lists came from.
data_dict = dict(test_size = test_size, MAE = MAE, MSE = MSE, RMSE = RMSE, NMAE = NMAE)
gbdt_df = pd.DataFrame(data_dict)
gbdt_df

Unnamed: 0,test_size,MAE,MSE,RMSE,NMAE
0,1,45.664662,6216.16675,78.842671,0.468694
1,2,45.637288,5494.134233,74.122427,0.462589
2,3,45.69901,6101.067715,78.109332,0.473359
3,4,45.854999,6261.108012,79.127164,0.452993
4,5,45.875577,6224.219917,78.893725,0.445978
5,6,46.073508,6066.362841,77.886859,0.457113
6,7,46.387438,6105.203351,78.135801,0.452099
7,8,46.667449,6238.523932,78.984327,0.456859
8,9,49.596564,7102.24768,84.274834,0.455944


In [14]:
# Compare the six regressors on a single train/test split (train_test_split's
# default ratio) and collect the rounded metrics for the summary table below.
start_time = time.time()

X = train[["year", "month", "day", "hour", "weekday", "구분_int", "기온(°C)"]]
y = train["공급량"]

# Display names, in the same order as `models`.
model_list = ["LinearRegression",
              "DecisionTreeRegressor",
              "RandomForestRegressor",
              "GradientBoostingRegressor",
              "xgboost",
              "lightgbm"]

# The decision tree is seeded like the other scikit-learn estimators so that
# repeated runs of this cell give the same numbers.
models = [LinearRegression(),
         DecisionTreeRegressor(random_state = 37),
         RandomForestRegressor(n_jobs = -1, random_state = 37),
         GradientBoostingRegressor(random_state = 37),
         xgb.XGBRegressor(),
         lgb.LGBMRegressor()]

train_score = []
test_score = []
MAE = []
MSE = []
RMSE = []
NMAE = []
model_name = []

# The split does not depend on the model, so it is computed once for all of them.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 77)

for idx, model in enumerate(models) :
    name = model_list[idx]
    print("model :", name)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    # nmae expects two-column frames: a 0..n-1 row id followed by the value.
    true_y = pd.DataFrame({"true_y" : y_test}).reset_index(drop = True).reset_index()
    pred_y = pd.DataFrame({"pred_y" : pred}).reset_index()

    # Evaluate each metric once; scoring (especially on the training set) is
    # expensive for the ensemble models and was previously done twice.
    r2_train = model.score(X_train, y_train)
    r2_test = model.score(X_test, y_test)
    mae = mean_absolute_error(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    rmse = np.sqrt(mse)
    nmae_score = nmae(true_y, pred_y)

    # R^2 on both partitions
    print("학습용 데이터 결정계수: {:.3f}".format(r2_train))
    print("테스트 데이터 결정계수: {:.3f}".format(r2_test))

    # Error metrics on the held-out part
    print("MAE : {:.6f}".format(mae))
    print("MSE : {:.6f}".format(mse))
    print("RMSE : {:.6f}".format(rmse))
    print("NMAE : {:.6f}".format(nmae_score))
    print()

    # Values are stored rounded to six decimals.
    train_score.append(np.round(r2_train, 6))
    test_score.append(np.round(r2_test, 6))
    MAE.append(np.round(mae, 6))
    MSE.append(np.round(mse, 6))
    RMSE.append(np.round(rmse, 6))
    NMAE.append(np.round(nmae_score, 6))
    model_name.append(name)

print("실행 시간 : {:.3f}".format(time.time() - start_time))

model : LinearRegression
학습용 데이터 결정계수: 0.322
테스트 데이터 결정계수: 0.321
MAE : 579.143569
MSE : 583219.980380
RMSE : 763.688405
NMAE : 4.203546

model : DecisionTreeRegressor
학습용 데이터 결정계수: 1.000
테스트 데이터 결정계수: 0.982
MAE : 62.086361
MSE : 15373.939242
RMSE : 123.991690
NMAE : 0.077439

model : RandomForestRegressor
학습용 데이터 결정계수: 0.999
테스트 데이터 결정계수: 0.991
MAE : 45.611428
MSE : 7915.180903
RMSE : 88.967302
NMAE : 0.075378

model : GradientBoostingRegressor
학습용 데이터 결정계수: 0.953
테스트 데이터 결정계수: 0.952
MAE : 142.654715
MSE : 40835.358653
RMSE : 202.077606
NMAE : 1.008954

model : xgboost
학습용 데이터 결정계수: 0.991
테스트 데이터 결정계수: 0.991
MAE : 54.486956
MSE : 7546.847455
RMSE : 86.872593
NMAE : 0.490392

model : lightgbm
학습용 데이터 결정계수: 0.987
테스트 데이터 결정계수: 0.987
MAE : 68.074064
MSE : 11428.442172
RMSE : 106.903892
NMAE : 0.536466

실행 시간 : 62.285


In [15]:
# Each result list should hold one entry per model; print the sizes to confirm.
for label, values in [("train_score length :", train_score),
                      ("test_score length :", test_score),
                      ("MAE length :", MAE),
                      ("MSE length :", MSE),
                      ("RMSE length :", RMSE),
                      ("NMAE length :", NMAE),
                      ("model_name length :", model_name)] :
    print(label, len(values))

# One row per model; both R^2 columns are shown as percentages.
data_dict = dict(model = model_name, train_score = train_score, test_score = test_score,
                 MAE = MAE, MSE = MSE, RMSE = RMSE, NMAE = NMAE)
df = pd.DataFrame(data_dict).assign(
    train_score = lambda frame: frame["train_score"] * 100,
    test_score = lambda frame: frame["test_score"] * 100,
)
df

train_score length : 6
test_score length : 6
MAE length : 6
MSE length : 6
RMSE length : 6
NMAE length : 6
model_name length : 6


Unnamed: 0,model,train_score,test_score,MAE,MSE,RMSE,NMAE
0,LinearRegression,32.2446,32.1294,579.143569,583219.98038,763.688405,4.203546
1,DecisionTreeRegressor,100.0,98.2109,62.086361,15373.939242,123.99169,0.077439
2,RandomForestRegressor,99.8569,99.0789,45.611428,7915.180903,88.967302,0.075378
3,GradientBoostingRegressor,95.2538,95.2479,142.654715,40835.358653,202.077606,1.008954
4,xgboost,99.1449,99.1218,54.486956,7546.847455,86.872593,0.490392
5,lightgbm,98.6545,98.67,68.074064,11428.442172,106.903892,0.536466


In [16]:
# Save the model comparison table; index = False leaves the DataFrame index out of the file.
df.to_csv("../CSV/model_compare.csv", index = False)