In [17]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor   # 앙상블(의사결정트리 확장판)
from sklearn.ensemble import GradientBoostingRegressor

In [4]:
# Load the training data and the feature rows the final model predicts on.
# Both paths are relative to the kernel's working directory.
train = pd.read_csv("../CSV/train_data.csv")
test = pd.read_csv("../CSV/pred_test.csv")

# Report (rows, columns) of both frames as a quick load check.
print(f"train shape : {train.shape}, test shape : {test.shape}")

train shape : (368088, 11), test shape : (15120, 11)


In [8]:
# Preview the first three training rows to check the parsed columns.
train.head(3)

Unnamed: 0,연월일,시간,일시,year,month,day,hour,weekday,구분,공급량,기온(°C)
0,2013-01-01,1,2013-01-01 00:00:00,2013,1,1,0,1,A,2497.129,-8.3
1,2013-01-01,2,2013-01-01 01:00:00,2013,1,1,1,1,A,2363.265,-8.5
2,2013-01-01,3,2013-01-01 02:00:00,2013,1,1,2,1,A,2258.505,-8.4


### LinearRegression

In [15]:
# Baseline: ordinary least squares on four calendar features, predicting
# the supply column.
X = train[["year", "day", "hour", "weekday"]]
y = train["공급량"]

# Vary the held-out share from 10% to 90% in 10-point steps. The fixed
# random_state makes every split repeatable.
for i in range(1, 10) :
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = i / 10, random_state = 77)

    # fit() returns the estimator itself, so construction and fitting chain.
    lr_model = LinearRegression().fit(X_train, y_train)
    pred = lr_model.predict(X_test)

    # Split ratio (train : test, in tenths), R^2 on both partitions, test MSE.
    print(f"학습용 : {10 - i}, 테스트용 : {i}")
    print(f"학습용 데이터 결정계수: {lr_model.score(X_train, y_train):.3f}")
    print(f"테스트 데이터 결정계수: {lr_model.score(X_test, y_test):.3f}")
    print("MSE :", mean_squared_error(y_test, pred))
    print()

학습용 : 9, 테스트용 : 1
학습용 데이터 결정계수: 0.011
테스트 데이터 결정계수: 0.012
MSE : 858995.3820678939

학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.011
테스트 데이터 결정계수: 0.012
MSE : 851242.2440710817

학습용 : 7, 테스트용 : 3
학습용 데이터 결정계수: 0.012
테스트 데이터 결정계수: 0.012
MSE : 850409.6820350793

학습용 : 6, 테스트용 : 4
학습용 데이터 결정계수: 0.011
테스트 데이터 결정계수: 0.012
MSE : 848899.8309086943

학습용 : 5, 테스트용 : 5
학습용 데이터 결정계수: 0.012
테스트 데이터 결정계수: 0.012
MSE : 845726.3954807592

학습용 : 4, 테스트용 : 6
학습용 데이터 결정계수: 0.011
테스트 데이터 결정계수: 0.012
MSE : 845974.2281729488

학습용 : 3, 테스트용 : 7
학습용 데이터 결정계수: 0.011
테스트 데이터 결정계수: 0.012
MSE : 847413.3274659554

학습용 : 2, 테스트용 : 8
학습용 데이터 결정계수: 0.012
테스트 데이터 결정계수: 0.011
MSE : 849345.726313863

학습용 : 1, 테스트용 : 9
학습용 데이터 결정계수: 0.013
테스트 데이터 결정계수: 0.011
MSE : 849408.0273866793



### 다양한 모델 비교
* LinearRegression
* DecisionTreeRegressor
* RandomForestRegressor
* GradientBoostingRegressor

In [29]:
start_time = time.time()

# Calendar-only feature set and the supply target shared by every candidate.
X = train[["year", "day", "hour", "weekday"]]
y = train["공급량"]

# Candidate regressors. Every estimator with a random component gets a fixed
# seed so the comparison is reproducible across kernel restarts.
# DecisionTreeRegressor needs one too: scikit-learn permutes the features at
# each split, so ties between equally good splits are otherwise broken
# differently from run to run.
models = [LinearRegression(),
          DecisionTreeRegressor(random_state = 37),
          RandomForestRegressor(n_jobs = -1, random_state = 37),
          GradientBoostingRegressor(random_state = 37)]

# Derive the labels from the estimators so names and models cannot drift apart.
model_list = [type(estimator).__name__ for estimator in models]

# mse_score[m][k]   : test-set MSE of model m at test share (k + 1) / 10
# model_score[m][k] : test-set R^2 of model m at the same split
mse_score = []
model_score = []

for model in models :
    split_mse = []
    split_r2 = []
    for i in range(1, 10, 1) :
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                           test_size = i / 10,
                                                           random_state = 77)
        print("model :", model)
        # fit() retrains from scratch on each call, so reusing the same
        # estimator instance across splits does not leak earlier fits.
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        # Evaluate each metric once and reuse it for both printing and
        # recording; scoring the larger ensembles is not free.
        train_r2 = model.score(X_train, y_train)
        test_r2 = model.score(X_test, y_test)
        mse = mean_squared_error(y_test, pred)

        # Split ratio (train : test, in tenths) and coefficients of determination.
        print("학습용 : {}, 테스트용 : {}".format(10 - i, i))
        print("학습용 데이터 결정계수: {:.3f}".format(train_r2))
        print("테스트 데이터 결정계수: {:.3f}".format(test_r2))

        # MSE (mean squared error) on the held-out rows.
        print("MSE :", mse)
        print()
        split_mse.append(mse)
        split_r2.append(test_r2)
    mse_score.append(split_mse)
    model_score.append(split_r2)

# Wall-clock duration of the whole sweep, in seconds.
print("실행 시간 : {:.3f}".format(time.time() - start_time))

model : LinearRegression()
학습용 : 9, 테스트용 : 1
학습용 데이터 결정계수: 0.011
테스트 데이터 결정계수: 0.012
MSE : 858995.3820678939

model : LinearRegression()
학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.011
테스트 데이터 결정계수: 0.012
MSE : 851242.2440710817

model : LinearRegression()
학습용 : 7, 테스트용 : 3
학습용 데이터 결정계수: 0.012
테스트 데이터 결정계수: 0.012
MSE : 850409.6820350793

model : LinearRegression()
학습용 : 6, 테스트용 : 4
학습용 데이터 결정계수: 0.011
테스트 데이터 결정계수: 0.012
MSE : 848899.8309086943

model : LinearRegression()
학습용 : 5, 테스트용 : 5
학습용 데이터 결정계수: 0.012
테스트 데이터 결정계수: 0.012
MSE : 845726.3954807592

model : LinearRegression()
학습용 : 4, 테스트용 : 6
학습용 데이터 결정계수: 0.011
테스트 데이터 결정계수: 0.012
MSE : 845974.2281729488

model : LinearRegression()
학습용 : 3, 테스트용 : 7
학습용 데이터 결정계수: 0.011
테스트 데이터 결정계수: 0.012
MSE : 847413.3274659554

model : LinearRegression()
학습용 : 2, 테스트용 : 8
학습용 데이터 결정계수: 0.012
테스트 데이터 결정계수: 0.011
MSE : 849345.726313863

model : LinearRegression()
학습용 : 1, 테스트용 : 9
학습용 데이터 결정계수: 0.013
테스트 데이터 결정계수: 0.011
MSE : 849408.0273866793

model : Dec

In [30]:
# Expect one inner list per compared model in each container, then dump the
# raw per-split MSE values.
print(f"mse_score length : {len(mse_score)}, model_score length : {len(model_score)}")
print(mse_score)

mse_score length : 4, model_score length : 4
[[858995.3820678939, 851242.2440710817, 850409.6820350793, 848899.8309086943, 845726.3954807592, 845974.2281729488, 847413.3274659554, 849345.726313863, 849408.0273866793], [816575.5151588012, 819157.7464558868, 829773.4777033634, 846436.4179266624, 869475.4270735643, 909288.4856918071, 990037.8717668896, 1121985.5152750846, 1400949.3888838992], [817026.3026619359, 819444.2249508584, 829472.3405244495, 844938.1135160619, 864012.1656653091, 894025.1095768389, 944270.3984435229, 1001481.6633256169, 1059799.7026063986], [831334.1042618387, 825511.2980920561, 825585.6179036476, 823813.817876769, 820471.0264255633, 820913.9695055832, 822415.1421751302, 824308.6286750595, 824988.3264692031]]


In [39]:
# One row per test share (in tenths); one "<model>_score" (test R^2) and one
# "<model>_MSE" column per compared model.
df = pd.DataFrame({"test_size" : range(1, 10, 1)})

# Iterate over the model names instead of a hard-coded count of 4, so the
# table follows the model list when candidates are added or removed.
# All score columns are added before any MSE column to keep the column order.
for model_name, r2_per_split in zip(model_list, model_score) :
    df[model_name + "_score"] = r2_per_split
for model_name, mse_per_split in zip(model_list, mse_score) :
    df[model_name + "_MSE"] = mse_per_split

In [40]:
# Show the full comparison table (one row per test share).
df

Unnamed: 0,test_size,LinearRegression_score,DecisionTreeRegressor_score,RandomForestRegressor_score,GradientBoostingRegressor_score,LinearRegression_MSE,DecisionTreeRegressor_MSE,RandomForestRegressor_MSE,GradientBoostingRegressor_MSE
0,1,0.011916,0.06071,0.060192,0.043734,858995.382068,816575.5,817026.3,831334.104262
1,2,0.011916,0.049159,0.048826,0.041784,851242.244071,819157.7,819444.2,825511.298092
2,3,0.011561,0.035546,0.035896,0.040414,850409.682035,829773.5,829472.3,825585.617904
3,4,0.011571,0.014439,0.016184,0.04078,848899.830909,846436.4,844938.1,823813.817877
4,5,0.011525,-0.016233,-0.009848,0.041043,845726.395481,869475.4,864012.2,820471.026426
5,6,0.011646,-0.062325,-0.044492,0.040924,845974.228173,909288.5,894025.1,820913.969506
6,7,0.011669,-0.154672,-0.101294,0.040824,847413.327466,990037.9,944270.4,822415.142175
7,8,0.011345,-0.306013,-0.165745,0.040488,849345.726314,1121986.0,1001482.0,824308.628675
8,9,0.011314,-0.630663,-0.233575,0.039738,849408.027387,1400949.0,1059800.0,824988.326469


In [41]:
# Summary statistics of each model's R^2 and MSE across the nine splits.
df.describe()

Unnamed: 0,test_size,LinearRegression_score,DecisionTreeRegressor_score,RandomForestRegressor_score,GradientBoostingRegressor_score,LinearRegression_MSE,DecisionTreeRegressor_MSE,RandomForestRegressor_MSE,GradientBoostingRegressor_MSE
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,5.0,0.011607,-0.112228,-0.043762,0.041081,849712.760434,955964.4,897163.3,824371.325709
std,2.738613,0.000212,0.227396,0.102888,0.001135,3953.410489,194441.7,86914.16,3219.900309
min,1.0,0.011314,-0.630663,-0.233575,0.039738,845726.395481,816575.5,817026.3,820471.026426
25%,3.0,0.011525,-0.154672,-0.101294,0.040488,847413.327466,829773.5,829472.3,822415.142175
50%,5.0,0.011571,-0.016233,-0.009848,0.040824,849345.726314,869475.4,864012.2,824308.628675
75%,7.0,0.011669,0.035546,0.035896,0.041043,850409.682035,990037.9,944270.4,825511.298092
max,9.0,0.011916,0.06071,0.060192,0.043734,858995.382068,1400949.0,1059800.0,831334.104262


### 최종 모델 선택

In [50]:
# Final model: a decision tree on the calendar features, validated on a 10%
# hold-out (the same split as the first row of the model comparison).
X = train[["year", "day", "hour", "weekday"]]
y = train["공급량"]

# Test share in tenths; also used in the ratio printed below.
i = 1

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size = i / 10,
                                                   random_state = 77)

# Seed the tree so the fitted model, and the submission built from it, is
# reproducible: scikit-learn breaks ties between equally good splits at
# random. 37 matches the seed used for the ensemble models in the comparison.
model = DecisionTreeRegressor(random_state = 37)
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Split ratio (train : test, in tenths), R^2 on both partitions, test MSE.
print("학습용 : {}, 테스트용 : {}".format(10 - i, i))
print("학습용 데이터 결정계수: {:.3f}".format(model.score(X_train, y_train)))
print("테스트 데이터 결정계수: {:.3f}".format(model.score(X_test, y_test)))
print("MSE :", mean_squared_error(y_test, pred))
print()

학습용 : 9, 테스트용 : 1
학습용 데이터 결정계수: 0.199
테스트 데이터 결정계수: 0.061
MSE : 816575.5151588012



In [46]:
# Load the sample submission file; its 공급량 column is overwritten with predictions later.
submission = pd.read_csv('../CSV/가스공급량 수요예측 모델개발 data/sample_submission.csv')

In [47]:
# Preview the submission template before it is filled in.
submission.head(3)

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0


In [51]:
# Use the same four calendar features the final model was fitted on.
feature_cols = ["year", "day", "hour", "weekday"]
test_x = test[feature_cols]

# Predictions are written positionally into the submission frame.
# NOTE(review): this assumes the rows of `test` are in the same order as the
# rows of `submission` — confirm against how pred_test.csv was built.
pred = model.predict(test_x)
submission["공급량"] = pred

In [52]:
# Display the array of predictions produced for the test rows.
pred

array([ 494.74      ,  378.35128571,  308.59383333, ..., 1068.20209091,
       1403.82061538, 1379.48353846])

In [53]:
# Check that the first rows of 공급량 now hold predictions.
submission.head(3)

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,494.74
1,2019-01-01 02 A,378.351286
2,2019-01-01 03 A,308.593833


In [54]:
# Check the last rows as well, to see that the column was filled to the end.
submission.tail(3)

Unnamed: 0,일자|시간|구분,공급량
15117,2019-03-31 22 H,1068.202091
15118,2019-03-31 23 H,1403.820615
15119,2019-03-31 24 H,1379.483538


In [55]:
# Write the filled-in submission; index = False keeps the row index out of the CSV.
submission.to_csv("../CSV/Submission/01_base_model.csv", index = False)