## 날씨 예측 모델
* 모든 train data 사용
* train shape : (368088, 12)

In [34]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor   # 앙상블(의사결정트리 확장판)
from sklearn.ensemble import GradientBoostingRegressor

import xgboost as xgb
import lightgbm as lgb

In [35]:
# Load the training data; the "일시" column is parsed into datetimes on read.
train = pd.read_csv(
    "../CSV/train_data.csv",
    parse_dates=["일시"],
)

In [36]:
# Preview the first three rows of the training frame.
train.head(n=3)

Unnamed: 0,연월일,시간,일시,year,month,day,hour,weekday,구분,구분_int,공급량,기온(°C)
0,2013-01-01,1,2013-01-01 00:00:00,2013,1,1,0,1,A,0,2497.129,-8.3
1,2013-01-01,2,2013-01-01 01:00:00,2013,1,1,1,1,A,0,2363.265,-8.5
2,2013-01-01,3,2013-01-01 02:00:00,2013,1,1,2,1,A,0,2258.505,-8.4


In [37]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368088 entries, 0 to 368087
Data columns (total 12 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   연월일      368088 non-null  object        
 1   시간       368088 non-null  int64         
 2   일시       368088 non-null  datetime64[ns]
 3   year     368088 non-null  int64         
 4   month    368088 non-null  int64         
 5   day      368088 non-null  int64         
 6   hour     368088 non-null  int64         
 7   weekday  368088 non-null  int64         
 8   구분       368088 non-null  object        
 9   구분_int   368088 non-null  int64         
 10  공급량      368088 non-null  float64       
 11  기온(°C)   368088 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(7), object(2)
memory usage: 33.7+ MB


In [38]:
# (rows, columns) of the training frame.
train.shape

(368088, 12)

In [6]:
start_time = time.time()

# Benchmark six regressors on nine train/test ratios (test share 10% .. 90%)
# and record R^2, MAE, MSE and RMSE for every (model, ratio) pair.
#
# NOTE(review): the split below is a random shuffle, not a chronological one,
# and rows that presumably share a timestamp across "구분" groups can end up on
# both sides of the split. Confirm the near-perfect tree scores are not caused
# by that leakage before trusting them for forecasting 2019.

# 2013 ~ 2018
X = train[["year", "month", "day", "hour", "weekday", "구분_int"]]
y = train["기온(°C)"]

model_list = ["LinearRegression", 
              "DecisionTreeRegressor", 
              "RandomForestRegressor", 
              "GradientBoostingRegressor", 
              "xgboost", 
              "lightgbm"]

# The decision tree is seeded like the other scikit-learn tree models so that
# re-running the notebook reproduces the same scores.
models = [LinearRegression(), 
         DecisionTreeRegressor(random_state = 37), 
         RandomForestRegressor(n_jobs = -1, random_state = 37), 
         GradientBoostingRegressor(random_state = 37), 
         xgb.XGBRegressor(), 
         lgb.LGBMRegressor()]

# Display names and estimators are paired by position, so they must line up.
assert len(model_list) == len(models)

# Parallel result lists; later cells turn them into a DataFrame.
test_size = []
train_score = []
test_score = []
MAE = []
MSE = []
RMSE = []
model_name = []

for name, model in zip(model_list, models) :
    for i in range(1, 10, 1) :
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                           test_size = i / 10,
                                                           random_state = 77)
        print("model :", name)
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        # Compute every metric exactly once and reuse it for both printing and
        # recording; each score() call re-predicts on the full split.
        r2_train = model.score(X_train, y_train)
        r2_test = model.score(X_test, y_test)
        mae = mean_absolute_error(y_test, pred)
        mse = mean_squared_error(y_test, pred)
        rmse = np.sqrt(mse)

        # Coefficient of determination (R^2)
        print("학습용 : {}, 테스트용 : {}".format(10 - i, i))
        print("학습용 데이터 결정계수: {:.3f}".format(r2_train))
        print("테스트 데이터 결정계수: {:.3f}".format(r2_test))

        # Evaluation metrics
        print("MAE :", mae)
        print("MSE :", mse)
        print("RMSE : ", rmse)
        print()

        test_size.append(i)
        train_score.append(r2_train)
        test_score.append(r2_test)
        MAE.append(mae)
        MSE.append(mse)
        RMSE.append(rmse)
        model_name.append(name)

print("실행 시간 : {:.3f}".format(time.time() - start_time))

model : LinearRegression
학습용 : 9, 테스트용 : 1
학습용 데이터 결정계수: 0.069
테스트 데이터 결정계수: 0.071
MAE : 9.305141034096566
MSE : 118.02569759877431
RMSE :  10.863963254667897

model : LinearRegression
학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.069
테스트 데이터 결정계수: 0.067
MAE : 9.298272040674076
MSE : 117.93302128977273
RMSE :  10.859697108564895

model : LinearRegression
학습용 : 7, 테스트용 : 3
학습용 데이터 결정계수: 0.070
테스트 데이터 결정계수: 0.067
MAE : 9.301098803942113
MSE : 118.03339563465927
RMSE :  10.8643175411371

model : LinearRegression
학습용 : 6, 테스트용 : 4
학습용 데이터 결정계수: 0.070
테스트 데이터 결정계수: 0.068
MAE : 9.305788587478053
MSE : 118.17225532082799
RMSE :  10.870706293559218

model : LinearRegression
학습용 : 5, 테스트용 : 5
학습용 데이터 결정계수: 0.070
테스트 데이터 결정계수: 0.068
MAE : 9.305608903693207
MSE : 118.09094976323249
RMSE :  10.866965987028417

model : LinearRegression
학습용 : 4, 테스트용 : 6
학습용 데이터 결정계수: 0.069
테스트 데이터 결정계수: 0.069
MAE : 9.302286892214498
MSE : 118.00651536465494
RMSE :  10.863080381027057

model : LinearRegression
학습용 : 3, 테스트용 : 7

model : lightgbm
학습용 : 3, 테스트용 : 7
학습용 데이터 결정계수: 0.950
테스트 데이터 결정계수: 0.949
MAE : 1.9885516714927514
MSE : 6.443196916383105
RMSE :  2.538345310705993

model : lightgbm
학습용 : 2, 테스트용 : 8
학습용 데이터 결정계수: 0.949
테스트 데이터 결정계수: 0.948
MAE : 2.0042918233446994
MSE : 6.591448558278562
RMSE :  2.5673816541913985

model : lightgbm
학습용 : 1, 테스트용 : 9
학습용 데이터 결정계수: 0.949
테스트 데이터 결정계수: 0.948
MAE : 2.0192983860100675
MSE : 6.664531024988275
RMSE :  2.5815752991125933

실행 시간 : 324.666


In [10]:
# 10-fold shuffled cross-validation of every candidate model on the full data.
# cross_val_score is called without a `scoring` argument, so each estimator's
# default score() is reported (R^2 for these regressors).
kfold = KFold(n_splits=10, shuffle=True, random_state=27)

for model in models:
    fold_scores = cross_val_score(model, X, y, cv=kfold)
    mean_fold_score = fold_scores.mean()

    print("model :", model)
    print("교차 검증별 정확도:", fold_scores)
    print("평균 검증 정확도 :", mean_fold_score)
    print()

model : LinearRegression()
교차 검증별 정확도: [0.06650532 0.07230436 0.06595245 0.07067328 0.07146943 0.07040688
 0.07281801 0.06612531 0.06451959 0.06795426]
평균 검증 정확도 : 0.06887288877536538

model : DecisionTreeRegressor()
교차 검증별 정확도: [0.99999958 0.99999632 0.99999873 0.9999993  0.99999949 0.99999948
 0.99999993 0.99999982 0.99999976 0.99999766]
평균 검증 정확도 : 0.9999990067269916

model : RandomForestRegressor(n_jobs=-1, random_state=37)
교차 검증별 정확도: [0.99999355 0.9999936  0.99999158 0.99999367 0.99999388 0.99999403
 0.99999418 0.99999351 0.99999332 0.99999429]
평균 검증 정확도 : 0.9999935620886344

model : GradientBoostingRegressor(random_state=37)
교차 검증별 정확도: [0.91375695 0.91126189 0.91275957 0.91243619 0.91289176 0.91333275
 0.91269854 0.91268985 0.91288828 0.91195548]
평균 검증 정확도 : 0.9126671242223254

model : XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type

In [11]:
# Sanity check: all result lists must have the same number of entries before
# they are combined into one DataFrame.
length_report = [
    ("test_size length : ", test_size),
    ("train_score length :", train_score),
    ("test_score length :", test_score),
    ("MAE length :", MAE),
    ("MSE length :", MSE),
    ("RMSE length :", RMSE),
    ("model_name length :", model_name),
]

for label, values in length_report:
    print(label, len(values))

test_size length :  54
train_score length : 54
test_score length : 54
MAE length : 54
MSE length : 54
RMSE length : 54
model_name length : 54


In [12]:
# Combine the per-run result lists into one frame and express both R^2
# columns as percentages.
data_dict = {
    "model" : model_name,
    "test_size" : test_size,
    "train_score" : train_score,
    "test_score" : test_score,
    "MAE" : MAE,
    "MSE" : MSE,
    "RMSE" : RMSE,
}

df = pd.DataFrame(data_dict).assign(
    train_score = lambda frame: frame["train_score"] * 100,
    test_score = lambda frame: frame["test_score"] * 100,
)
df

Unnamed: 0,model,test_size,train_score,test_score,MAE,MSE,RMSE
0,LinearRegression,1,6.868515,7.110992,9.305141,118.025698,10.863963
1,LinearRegression,2,6.92765,6.745599,9.298272,117.933021,10.859697
2,LinearRegression,3,6.969441,6.708682,9.301099,118.033396,10.864318
3,LinearRegression,4,6.977111,6.759182,9.305789,118.172255,10.870706
4,LinearRegression,5,6.974297,6.80594,9.305609,118.09095,10.866966
5,LinearRegression,6,6.938073,6.856607,9.302287,118.006515,10.86308
6,LinearRegression,7,6.986716,6.845119,9.30282,118.076861,10.866318
7,LinearRegression,8,6.912042,6.883818,9.31145,118.290622,10.876149
8,LinearRegression,9,6.607928,6.907995,9.32326,118.343192,10.878566
9,DecisionTreeRegressor,1,100.0,99.99981,0.000334,0.000241,0.015523


In [13]:
# Summary statistics of the numeric columns, pooled over every model and split ratio.
df.describe()

Unnamed: 0,test_size,train_score,test_score,MAE,MSE,RMSE
count,54.0,54.0,54.0,54.0,54.0,54.0
mean,5.0,81.774642,81.651454,2.605269,23.264288,3.21942
std,2.606233,33.934106,33.89957,3.156162,42.980952,3.625329
min,1.0,6.607928,6.708682,0.000334,0.000241,0.015523
25%,3.0,91.281592,91.240766,0.275726,0.368077,0.605512
50%,5.0,96.274055,96.009994,1.728073,5.05558,2.228322
75%,7.0,99.992987,99.709809,2.588402,11.096402,3.331125
max,9.0,100.0,99.99981,9.32326,118.343192,10.878566


In [14]:
# Per-model minimum of each column. The minimum is taken column by column, so
# the values in one row need not come from the same run; for the score columns
# the minimum is the worst run, for the error columns it is the best run.
df.groupby("model").min()

Unnamed: 0_level_0,test_size,train_score,test_score,MAE,MSE,RMSE
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DecisionTreeRegressor,1,100.0,98.297395,0.000334,0.000241,0.015523
GradientBoostingRegressor,1,91.124378,91.186215,2.582127,11.005688,3.317482
LinearRegression,1,6.607928,6.708682,9.298272,117.933021,10.859697
RandomForestRegressor,1,99.854095,99.016057,0.011883,0.000772,0.027793
lightgbm,1,94.873116,94.757497,1.978179,6.393761,2.528589
xgboost,1,97.524669,97.075792,1.351378,3.123557,1.767359


### 최종 모델 선정

In [49]:
# 2013 ~ 2018
# Final model: an XGBoost regressor fit on a random 80/20 split, followed by
# 10-fold cross-validation run separately inside each part of the split.

X = train[["year", "month", "day", "hour", "weekday", "구분_int"]]
y = train["기온(°C)"]

i = 2

kfold = KFold(n_splits=10, shuffle=True, random_state=27)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=i / 10, random_state=77
)

model = xgb.XGBRegressor()
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Coefficient of determination (R^2) on both parts of the split
print(f"학습용 : {10 - i}, 테스트용 : {i}")
print(f"학습용 데이터 결정계수: {model.score(X_train, y_train):.3f}")
print(f"테스트 데이터 결정계수: {model.score(X_test, y_test):.3f}")

# Evaluation metrics on the held-out part
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
print("MAE :", mae)
print("MSE :", mse)
print("RMSE : ", np.sqrt(mse))

# NOTE(review): the second call cross-validates *within* the held-out rows, i.e.
# it refits clones of the model on test data; it is not a score of the model
# fitted above.
cross_score_train = cross_val_score(model, X_train, y_train, cv=kfold)
cross_score_test = cross_val_score(model, X_test, y_test, cv=kfold)
print("train 평균 검증 정확도 :", cross_score_train.mean())
print("test 평균 검증 정확도 :", cross_score_test.mean())

학습용 : 8, 테스트용 : 2
학습용 데이터 결정계수: 0.976
테스트 데이터 결정계수: 0.975
MAE : 1.353038062665936
MSE : 3.123557098020092
RMSE :  1.767358791536142
train 평균 검증 정확도 : 0.9752203142491009
test 평균 검증 정확도 : 0.9733143141985583


### 테스트 데이터 불러오기

In [40]:
# Load the frame to predict on. Unlike the training load, no parse_dates is
# passed here, so "일시" is not converted to datetime on read.
test = pd.read_csv("../CSV/new_test.csv")

In [41]:
# Preview the first three rows of the test frame.
test.head(n=3)

Unnamed: 0,일자|시간|구분,일자,시간,구분,구분_int,일시,year,month,day,hour,weekday
0,2019-01-01 01 A,2019-01-01,1,A,0,2019-01-01 00:00:00,2019,1,1,0,1
1,2019-01-01 02 A,2019-01-01,2,A,0,2019-01-01 01:00:00,2019,1,1,1,1
2,2019-01-01 03 A,2019-01-01,3,A,0,2019-01-01 02:00:00,2019,1,1,2,1


In [42]:
# Preview the last three rows of the test frame.
test.tail(n=3)

Unnamed: 0,일자|시간|구분,일자,시간,구분,구분_int,일시,year,month,day,hour,weekday
15117,2019-03-31 22 H,2019-03-31,22,H,6,2019-03-31 21:00:00,2019,3,31,21,6
15118,2019-03-31 23 H,2019-03-31,23,H,6,2019-03-31 22:00:00,2019,3,31,22,6
15119,2019-03-31 24 H,2019-03-31,24,H,6,2019-03-31 23:00:00,2019,3,31,23,6


In [43]:
# Select the same six feature columns, in the same order, that the model was fit on.
test_x = test.loc[:, ["year", "month", "day", "hour", "weekday", "구분_int"]]

### 19년도 기온 예측

In [44]:
# Predict temperature for the test rows and store it, rounded to one decimal
# place, as a new "기온(°C)" column on the test frame.
pred = model.predict(test_x)
test["기온(°C)"] = pred.round(1)

In [46]:
# Inspect the first 50 test rows, including the new predicted temperature column.
test.head(n=50)

Unnamed: 0,일자|시간|구분,일자,시간,구분,구분_int,일시,year,month,day,hour,weekday,기온(°C)
0,2019-01-01 01 A,2019-01-01,1,A,0,2019-01-01 00:00:00,2019,1,1,0,1,-6.6
1,2019-01-01 02 A,2019-01-01,2,A,0,2019-01-01 01:00:00,2019,1,1,1,1,-6.7
2,2019-01-01 03 A,2019-01-01,3,A,0,2019-01-01 02:00:00,2019,1,1,2,1,-7.1
3,2019-01-01 04 A,2019-01-01,4,A,0,2019-01-01 03:00:00,2019,1,1,3,1,-6.7
4,2019-01-01 05 A,2019-01-01,5,A,0,2019-01-01 04:00:00,2019,1,1,4,1,-6.4
5,2019-01-01 06 A,2019-01-01,6,A,0,2019-01-01 05:00:00,2019,1,1,5,1,-6.2
6,2019-01-01 07 A,2019-01-01,7,A,0,2019-01-01 06:00:00,2019,1,1,6,1,-6.0
7,2019-01-01 08 A,2019-01-01,8,A,0,2019-01-01 07:00:00,2019,1,1,7,1,-6.3
8,2019-01-01 09 A,2019-01-01,9,A,0,2019-01-01 08:00:00,2019,1,1,8,1,-6.3
9,2019-01-01 10 A,2019-01-01,10,A,0,2019-01-01 09:00:00,2019,1,1,9,1,-5.9


In [47]:
# Show the first 50 training rows for a side-by-side look at observed temperatures.
train.head(n=50)

Unnamed: 0,연월일,시간,일시,year,month,day,hour,weekday,구분,구분_int,공급량,기온(°C)
0,2013-01-01,1,2013-01-01 00:00:00,2013,1,1,0,1,A,0,2497.129,-8.3
1,2013-01-01,2,2013-01-01 01:00:00,2013,1,1,1,1,A,0,2363.265,-8.5
2,2013-01-01,3,2013-01-01 02:00:00,2013,1,1,2,1,A,0,2258.505,-8.4
3,2013-01-01,4,2013-01-01 03:00:00,2013,1,1,3,1,A,0,2243.969,-8.1
4,2013-01-01,5,2013-01-01 04:00:00,2013,1,1,4,1,A,0,2344.105,-8.2
5,2013-01-01,6,2013-01-01 05:00:00,2013,1,1,5,1,A,0,2390.961,-8.2
6,2013-01-01,7,2013-01-01 06:00:00,2013,1,1,6,1,A,0,2378.457,-8.6
7,2013-01-01,8,2013-01-01 07:00:00,2013,1,1,7,1,A,0,2518.921,-8.3
8,2013-01-01,9,2013-01-01 08:00:00,2013,1,1,8,1,A,0,2706.481,-7.9
9,2013-01-01,10,2013-01-01 09:00:00,2013,1,1,9,1,A,0,2832.057,-7.0


### 예측한 데이터 저장
* test 데이터 프레임 저장

In [48]:
# Write the test frame (with the predicted "기온(°C)" column added above) to CSV,
# omitting the index column.
test.to_csv("../CSV/pred_test.csv", index = False)

In [27]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Plot the predicted temperature per date for the test period.
# The prediction column on `test` is named "기온(°C)"; there is no "기온"
# column, so the full name must be used for y.
plt.figure(figsize = (15, 10))
sns.lineplot(x = "일자", y = "기온(°C)", data = test)

In [None]:
# Training rows from January through March of 2013.
is_2013 = train["year"] == 2013
is_first_quarter = train["month"].isin([1, 2, 3])
a = train[is_2013 & is_first_quarter]

In [None]:
# Observed temperature over time for the 2013 January-March subset.
fig, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=a, x="일시", y="기온(°C)", ax=ax)

In [None]:
# Overlay monthly temperature: predictions on `test` against the observed
# 2013 January-March subset `a`.
# Both frames store temperature under "기온(°C)"; `test` has no "기온" column.
# Legend labels keep the two lines distinguishable on the shared axes.
sns.lineplot(x = "month", y = "기온(°C)", data = test, label = "test (predicted)")
sns.lineplot(x = "month", y = "기온(°C)", data = a, label = "train 2013 (observed)")

In [None]:
# Monthly temperature in the training data, one line per year.
sns.lineplot(data=train, x="month", y="기온(°C)", hue="year")