In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor   # 앙상블(의사결정트리 확장판)
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
# Read the training table and the table to be scored from the shared CSV folder.
train = pd.read_csv("../CSV/train_data.csv")
test = pd.read_csv("../CSV/pred_test.csv")

# Print the (rows, columns) of both frames as a quick load check.
shapes = (train.shape, test.shape)
print("train shape : {}, test shape : {}".format(*shapes))

train shape : (368088, 11), test shape : (15120, 11)


In [3]:
# Preview the first three training rows to see which columns are available.
train.head(n = 3)

Unnamed: 0,연월일,시간,일시,year,month,day,hour,weekday,구분,공급량,기온(°C)
0,2013-01-01,1,2013-01-01 00:00:00,2013,1,1,0,1,A,2497.129,-8.3
1,2013-01-01,2,2013-01-01 01:00:00,2013,1,1,1,1,A,2363.265,-8.5
2,2013-01-01,3,2013-01-01 02:00:00,2013,1,1,2,1,A,2258.505,-8.4


### LinearRegression

In [19]:
# Baseline: ordinary linear regression evaluated at every train/test ratio
# from 9:1 down to 1:9, with the split's random_state fixed at 77.
X = train[["year", "day", "hour", "weekday", "기온(°C)"]]
y = train["공급량"]

for i in range(1, 10) :
    # `i` is the test share in tenths; train_test_split takes it as a fraction.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = i / 10, random_state = 77
    )

    lr_model = LinearRegression().fit(X_train, y_train)
    pred = lr_model.predict(X_test)

    # Coefficient of determination (R^2) on each side of the split,
    # plus the mean squared error of the test predictions.
    r2_train = lr_model.score(X_train, y_train)
    r2_test = lr_model.score(X_test, y_test)
    mse = mean_squared_error(y_test, pred)

    print("학습용 : {}, 테스트용 : {}".format(10 - i, i))
    print("학습용 데이터 결정계수: {:.3f}".format(r2_train))
    print("테스트 데이터 결정계수: {:.3f}".format(r2_test))
    print("MSE :", mse)
    print()

학습용 : 9, 테스트용 : 1
학습용 데이터 결정계수: 0.318
테스트 데이터 결정계수: 0.317
MSE : 594131.6992757492

학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.318
테스트 데이터 결정계수: 0.317
MSE : 588297.6921836854

학습용 : 7, 테스트용 : 3
학습용 데이터 결정계수: 0.318
테스트 데이터 결정계수: 0.319
MSE : 586188.7096521474

학습용 : 6, 테스트용 : 4
학습용 데이터 결정계수: 0.318
테스트 데이터 결정계수: 0.318
MSE : 585879.8443111076

학습용 : 5, 테스트용 : 5
학습용 데이터 결정계수: 0.319
테스트 데이터 결정계수: 0.318
MSE : 583847.1132136184

학습용 : 4, 테스트용 : 6
학습용 데이터 결정계수: 0.319
테스트 데이터 결정계수: 0.318
MSE : 583767.2731552204

학습용 : 3, 테스트용 : 7
학습용 데이터 결정계수: 0.317
테스트 데이터 결정계수: 0.319
MSE : 584135.2508890156

학습용 : 2, 테스트용 : 8
학습용 데이터 결정계수: 0.317
테스트 데이터 결정계수: 0.319
MSE : 585445.3466369563

학습용 : 1, 테스트용 : 9
학습용 데이터 결정계수: 0.319
테스트 데이터 결정계수: 0.318
MSE : 585880.6420699483



### 다양한 모델 비교
* LinearRegression
* DecisionTreeRegressor
* RandomForestRegressor
* GradientBoostingRegressor

In [5]:
# Benchmark four regressors on the same five features.
# For every model and every test share 0.1 .. 0.9 the cell fits on the training
# part, prints R^2 for both parts and the test MSE, and collects the test-side
# numbers in `model_score` / `mse_score` (one inner list per model, in the same
# order as `model_list`).
start_time = time.time()

X = train[["year", "day", "hour", "weekday", "기온(°C)"]]
y = train["공급량"]

# Display names; must stay index-aligned with `models` below.
model_list = ["LinearRegression",
              "DecisionTreeRegressor",
              "RandomForestRegressor",
              "GradientBoostingRegressor"]

# The decision tree is seeded like the other stochastic estimators: scikit-learn
# permutes the candidate features at every split even with the default "best"
# splitter, so an unseeded tree may give different scores on each run.
models = [LinearRegression(),
          DecisionTreeRegressor(random_state = 37),
          RandomForestRegressor(n_jobs = -1, random_state = 37),
          GradientBoostingRegressor(random_state = 37)]

mse_score = []     # mse_score[m][k]   -> test MSE of model m at test share (k + 1) / 10
model_score = []   # model_score[m][k] -> test R^2 of model m at test share (k + 1) / 10

for model in models :
    split_mse = []
    split_r2 = []
    for i in range(1, 10, 1) :
        # `i` is the test share in tenths; the split seed is fixed so every
        # model sees exactly the same partitions.
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size = i / 10,
                                                            random_state = 77)
        print("model :", model)
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        # Evaluate each metric once and reuse the value for both printing and
        # bookkeeping (every score() call re-runs prediction on the whole set).
        train_r2 = model.score(X_train, y_train)
        test_r2 = model.score(X_test, y_test)
        test_mse = mean_squared_error(y_test, pred)

        # Coefficient of determination (R^2)
        print("학습용 : {}, 테스트용 : {}".format(10 - i, i))
        print("학습용 데이터 결정계수: {:.3f}".format(train_r2))
        print("테스트 데이터 결정계수: {:.3f}".format(test_r2))

        # MSE (Mean Squared Error)
        print("MSE :", test_mse)
        print()

        split_mse.append(test_mse)
        split_r2.append(test_r2)
    mse_score.append(split_mse)
    model_score.append(split_r2)

print("실행 시간 : {:.3f}".format(time.time() - start_time))

model : LinearRegression()
학습용 : 9, 테스트용 : 1
학습용 데이터 결정계수: 0.318
테스트 데이터 결정계수: 0.317
MSE : 594131.6992757492

model : LinearRegression()
학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.318
테스트 데이터 결정계수: 0.317
MSE : 588297.6921836854

model : LinearRegression()
학습용 : 7, 테스트용 : 3
학습용 데이터 결정계수: 0.318
테스트 데이터 결정계수: 0.319
MSE : 586188.7096521474

model : LinearRegression()
학습용 : 6, 테스트용 : 4
학습용 데이터 결정계수: 0.318
테스트 데이터 결정계수: 0.318
MSE : 585879.8443111076

model : LinearRegression()
학습용 : 5, 테스트용 : 5
학습용 데이터 결정계수: 0.319
테스트 데이터 결정계수: 0.318
MSE : 583847.1132136184

model : LinearRegression()
학습용 : 4, 테스트용 : 6
학습용 데이터 결정계수: 0.319
테스트 데이터 결정계수: 0.318
MSE : 583767.2731552204

model : LinearRegression()
학습용 : 3, 테스트용 : 7
학습용 데이터 결정계수: 0.317
테스트 데이터 결정계수: 0.319
MSE : 584135.2508890156

model : LinearRegression()
학습용 : 2, 테스트용 : 8
학습용 데이터 결정계수: 0.317
테스트 데이터 결정계수: 0.319
MSE : 585445.3466369563

model : LinearRegression()
학습용 : 1, 테스트용 : 9
학습용 데이터 결정계수: 0.319
테스트 데이터 결정계수: 0.318
MSE : 585880.6420699483

model : De

In [6]:
# Sanity check: one inner list per model is expected in each collection.
n_mse_rows, n_score_rows = len(mse_score), len(model_score)
print("mse_score length : {}, model_score length : {}".format(n_mse_rows, n_score_rows))

# Raw test-MSE values, one inner list per model.
print(mse_score)

mse_score length : 4, model_score length : 4
[[594131.6992757492, 588297.6921836854, 586188.7096521474, 585879.8443111076, 583847.1132136184, 583767.2731552204, 584135.2508890156, 585445.3466369563, 585880.6420699483], [733506.0041798471, 744395.7857744986, 761420.7653199912, 795146.9419359723, 840817.4979264897, 901273.6754107361, 962731.800985074, 1027569.0212232289, 1073929.221626674], [733428.8186535283, 740388.1893688901, 750014.2073192137, 766993.3938356462, 784129.6489210617, 795200.3381343131, 792999.8345571847, 770147.8509977901, 706129.4111261407], [541157.4978744331, 537473.9879167024, 535276.2419629912, 534759.3253688124, 533493.5269865148, 533777.5593262803, 534615.3965655157, 535556.0169780166, 537646.4121866528]]


In [7]:
# Assemble one row per test share (in tenths, 1..9) with, for every model,
# its test R^2 ("<name>_score") followed by its test MSE ("<name>_MSE").
df = pd.DataFrame({"test_size" : range(1, 10, 1)})

# Pair each model name with its results instead of indexing with a hard-coded
# model count, so the table follows `model_list` if models are added or removed.
# All score columns are added before all MSE columns to keep the column order.
for name, scores in zip(model_list, model_score) :
    df[name + "_score"] = scores
for name, errors in zip(model_list, mse_score) :
    df[name + "_MSE"] = errors

In [8]:
# Display the per-split comparison table (test R^2 and test MSE for every model).
df

Unnamed: 0,test_size,LinearRegression_score,DecisionTreeRegressor_score,RandomForestRegressor_score,GradientBoostingRegressor_score,LinearRegression_MSE,DecisionTreeRegressor_MSE,RandomForestRegressor_MSE,GradientBoostingRegressor_MSE
0,1,0.316583,0.156264,0.156352,0.377518,594131.699276,733506.0,733428.818654,541157.497874
1,2,0.317131,0.135939,0.140591,0.376124,588297.692184,744395.8,740388.189369,537473.987917
2,3,0.318667,0.114993,0.128251,0.377843,586188.709652,761420.8,750014.207319,535276.241963
3,4,0.317822,0.074159,0.10694,0.377345,585879.844311,795146.9,766993.393836,534759.325369
4,5,0.317606,0.017262,0.083518,0.376459,583847.113214,840817.5,784129.648921,533493.526987
5,6,0.317983,-0.052961,0.070965,0.376386,583767.273155,901273.7,795200.338134,533777.559326
6,7,0.318728,-0.122826,0.075131,0.376483,584135.250889,962731.8,792999.834557,534615.396566
7,8,0.31853,-0.196111,0.103533,0.376602,585445.346637,1027569.0,770147.850998,535556.016978
8,9,0.318052,-0.250021,0.178086,0.374196,585880.64207,1073929.0,706129.411126,537646.412187


In [9]:
# Summary statistics of every score/MSE column across the nine split ratios.
df.describe()

Unnamed: 0,test_size,LinearRegression_score,DecisionTreeRegressor_score,RandomForestRegressor_score,GradientBoostingRegressor_score,LinearRegression_MSE,DecisionTreeRegressor_MSE,RandomForestRegressor_MSE,GradientBoostingRegressor_MSE
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,5.0,0.3179,-0.0137,0.11593,0.376551,586397.063487,871199.0,759936.854768,535972.885018
std,2.738613,0.000718,0.149621,0.037465,0.001061,3229.825906,126802.3,29905.16383,2420.626911
min,1.0,0.316583,-0.250021,0.070965,0.374196,583767.273155,733506.0,706129.411126,533493.526987
25%,3.0,0.317606,-0.122826,0.083518,0.376386,584135.250889,761420.8,740388.189369,534615.396566
50%,5.0,0.317983,0.017262,0.10694,0.376483,585879.844311,840817.5,766993.393836,535276.241963
75%,7.0,0.31853,0.114993,0.140591,0.377345,586188.709652,962731.8,784129.648921,537473.987917
max,9.0,0.318728,0.156264,0.178086,0.377843,594131.699276,1073929.0,795200.338134,541157.497874


### 최종 모델 선택

In [13]:
# Final model: gradient boosting fitted on a 7:3 split with fixed seeds.
# `model` is the fitted estimator that the prediction cell calls.
X = train[["year", "day", "hour", "weekday", "기온(°C)"]]
y = train["공급량"]

# Test share in tenths (3 -> 30 % of the rows held out).
i = 3

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = i / 10, random_state = 77
)

model = GradientBoostingRegressor(random_state = 37).fit(X_train, y_train)
pred = model.predict(X_test)

# Coefficient of determination (R^2) on both parts and test-set MSE.
r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
mse = mean_squared_error(y_test, pred)

print("학습용 : {}, 테스트용 : {}".format(10 - i, i))
print("학습용 데이터 결정계수: {:.3f}".format(r2_train))
print("테스트 데이터 결정계수: {:.3f}".format(r2_test))
print("MSE :", mse)
print()

학습용 : 7, 테스트용 : 3
학습용 데이터 결정계수: 0.380
테스트 데이터 결정계수: 0.378
MSE : 535276.2419629912



In [14]:
# Load the sample submission file; its 공급량 column is filled in after prediction.
submission = pd.read_csv("../CSV/가스공급량 수요예측 모델개발 data/sample_submission.csv")

In [15]:
# Preview the first three rows of the submission template.
submission.head(n = 3)

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0


In [20]:
# Build the prediction inputs from the test table.
# The model was fitted on a frame whose temperature column is named "기온(°C)",
# while the test table calls it "기온". scikit-learn compares feature names
# between fit and predict (a warning in older releases, an error in newer
# ones), so the column is renamed to the fitted name before predicting.
test_x = (
    test[["year", "day", "hour", "weekday", "기온"]]
    .rename(columns = {"기온" : "기온(°C)"})
)
pred = model.predict(test_x)

# NOTE(review): assumes the rows of `test` are in the same order as the rows of
# the sample submission - confirm against the data description.
submission["공급량"] = pred

In [21]:
# Display the array of predicted values for the test rows.
pred

array([1657.49552454, 1441.64069916, 1403.13352574, ...,  854.33075808,
        763.29554166,  696.32823744])

In [22]:
# First three rows of the submission after the predictions were written in.
submission.head(n = 3)

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,1657.495525
1,2019-01-01 02 A,1441.640699
2,2019-01-01 03 A,1403.133526


In [23]:
# Last three rows of the filled-in submission.
submission.tail(n = 3)

Unnamed: 0,일자|시간|구분,공급량
15117,2019-03-31 22 H,854.330758
15118,2019-03-31 23 H,763.295542
15119,2019-03-31 24 H,696.328237


In [24]:
# Write the filled-in submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf = "../CSV/Submission/02_first_model.csv", index = False)