# 7_ParameterTuning & Ensemble

In [1]:
%matplotlib inline
import os
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
train = pd.read_csv("C:/Users/kimch/Desktop/229255_bus_riders_at_rush_hour_data/preprocessed_train.csv")
train = train.drop(columns=["Unnamed: 0", "Unnamed: 0.1"])
train.set_index("id", inplace=True)
train["date"] = pd.to_datetime(train.date, format= "%Y-%m-%d")

## Train -> Train, Valid, Test

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# drop_first=True해서 in_out_시내, rain_0 제외하고 train_X column에 넣음
X = train[['bus_route_id','latitude','longitude', '6~7_ride','7~8_ride', '8~9_ride','9~10_ride','10~11_ride','11~12_ride','6~7_takeoff',
           '7~8_takeoff','8~9_takeoff','9~10_takeoff','10~11_takeoff','11~12_takeoff','region_e', 'region_n', 'region_o', 'region_s', 
           'region_w', 'in_out_시외', 'rain_1', 'rain_2', 'rain_3']]

y = train[["18~20_ride"]]

# train:valid:test = 6:2:2
train_X, resid_X, train_y, resid_y = train_test_split(X, y, test_size = 0.4, shuffle=True, random_state= 17)
valid_X, test_X, valid_y, test_y = train_test_split(resid_X, resid_y, test_size = 0.5, shuffle=True)

## Ensemble   
BayesianRidge + LGBM + RandomForest

In [6]:
from sklearn.linear_model import BayesianRidge
import lightgbm as lgb

In [8]:
bayesianRidge = BayesianRidge(normalize=True)  
bayesianRidge.fit(train_X, train_y)

In [9]:
lgbm = lgb.LGBMRegressor(bagging_fraction=0.6,
                              early_stopping_rounds=16,
                              feature_fraction=0.73,
                              learning_rate = 0.07,
                              max_depth = 14,
                              min_child_samples = 20,
                              min_split_gain = 0.0064, 
                              n_estimators = 172,
                              num_leaves = 39,
                              boosting = "dart",
                              metric = "rmse",
                              task = "predict",
                              application = "regression",
                              eval_set = (valid_X, valid_y))

lgbm.fit(train_X, train_y)



In [10]:
from sklearn.ensemble import RandomForestRegressor

In [11]:
rf = RandomForestRegressor(max_depth=8, min_samples_leaf=8, min_samples_split=8, n_estimators=200, random_state=0)
rf.fit(train_X, train_y)

In [12]:
# BayesianRidge 이용해서 예측
bayes_predict = bayesianRidge.predict(train_X).tolist()
# lgbm 이용해서 예측
lgbm_predict = lgbm.predict(train_X).tolist()
# randomforest 이용해서 예측
rf_predict = rf.predict(train_X).tolist()

In [13]:
result = pd.DataFrame({"BayesianRidge": bayes_predict, "LGBM": lgbm_predict, "RandomForest": rf_predict, "real": train_y["18~20_ride"].tolist()})

In [14]:
# 세 개 모델 돌려서 predict한 값 dataframe으로
result

Unnamed: 0,BayesianRidge,LGBM,RandomForest,real
0,0.157001,0.089727,0.127274,0.000000
1,0.026693,0.004856,0.051581,0.000000
2,0.271147,0.171364,0.179487,0.526589
3,0.308462,0.321870,0.404028,0.526589
4,0.107755,0.081554,0.107287,0.000000
...,...,...,...,...
249248,0.945959,0.761340,0.897409,0.959135
249249,0.434387,0.346520,0.385741,0.000000
249250,-0.002809,0.082373,0.107747,0.000000
249251,0.764417,0.736456,0.659016,1.223156


In [15]:
total_predict = []

for i in range(len(result)):
    prediction = 0
    for j in range (3):
        prediction += result.iloc[i, j]
    prediction_mean = round(prediction / 3, 4)
    total_predict.append(prediction_mean)

In [16]:
result["predict"] = total_predict

In [17]:
from numpy import expm1, log1p

In [18]:
# 로그변환해줬기 때문에 역변환 
result["notlog_predict"] = round(expm1(result.predict),0)

In [19]:
# 로그변환한거 역변환하면 왜 정수 안나오지 그냥 반올림 함
result["notlog_real"] = round(expm1(result.real), 0)

In [20]:
result

Unnamed: 0,BayesianRidge,LGBM,RandomForest,real,predict,notlog_predict,notlog_real
0,0.157001,0.089727,0.127274,0.000000,0.1247,0.0,0.0
1,0.026693,0.004856,0.051581,0.000000,0.0277,0.0,0.0
2,0.271147,0.171364,0.179487,0.526589,0.2073,0.0,1.0
3,0.308462,0.321870,0.404028,0.526589,0.3448,0.0,1.0
4,0.107755,0.081554,0.107287,0.000000,0.0989,0.0,0.0
...,...,...,...,...,...,...,...
249248,0.945959,0.761340,0.897409,0.959135,0.8682,1.0,2.0
249249,0.434387,0.346520,0.385741,0.000000,0.3889,0.0,0.0
249250,-0.002809,0.082373,0.107747,0.000000,0.0624,0.0,0.0
249251,0.764417,0.736456,0.659016,1.223156,0.7200,1.0,2.0


In [21]:
from dmba import regressionSummary

In [22]:
# train ; 로그 변환
regressionSummary(result.real, result.predict)


Regression statistics

               Mean Error (ME) : 0.0098
Root Mean Squared Error (RMSE) : 0.2910
     Mean Absolute Error (MAE) : 0.2088


In [23]:
# train : 로그 변환 X -> y값 역변환했을 때 정수가 안나와서 실제 y값 맞는지 모르겠지만 암튼 로그변환했을 때 보다 rmse 더 높아짐
regressionSummary(result.notlog_real, result.notlog_predict)


Regression statistics

               Mean Error (ME) : 0.2082
Root Mean Squared Error (RMSE) : 0.6050
     Mean Absolute Error (MAE) : 0.2865


### valid

In [24]:
# BayesianRidge 이용해서 예측
bayes_predict = bayesianRidge.predict(valid_X).tolist()
# lgbm 이용해서 예측
lgbm_predict = lgbm.predict(valid_X).tolist()
# randomforest 이용해서 예측
rf_predict = rf.predict(valid_X).tolist()

In [25]:
result_valid = pd.DataFrame({"BayesianRidge": bayes_predict, "LGBM": lgbm_predict, "RandomForest": rf_predict, "real": valid_y["18~20_ride"].tolist()})

In [26]:
total_predict = []

for i in range(len(result_valid)):
    prediction = 0
    for j in range (3):
        prediction += result_valid.iloc[i, j]
    prediction_mean = round(prediction / 3, 4)
    total_predict.append(prediction_mean)

In [27]:
result_valid["predict"] = total_predict

In [28]:
# 로그변환해줬기 때문에 역변환 
result_valid["notlog_predict"] = round(expm1(result_valid.predict),0)

In [29]:
# 로그변환한거 역변환하면 왜 정수 안나오지 그냥 반올림 함
result_valid["notlog_real"] = round(expm1(result_valid.real), 0)

In [30]:
result_valid

Unnamed: 0,BayesianRidge,LGBM,RandomForest,real,predict,notlog_predict,notlog_real
0,0.198560,0.178040,0.183594,0.000000,0.1867,0.0,0.0
1,0.084091,0.077352,0.065628,0.000000,0.0757,0.0,0.0
2,0.021215,0.093987,0.114467,0.000000,0.0766,0.0,0.0
3,0.058077,0.065376,0.065628,0.000000,0.0630,0.0,0.0
4,0.031183,0.104653,0.114467,0.000000,0.0834,0.0,0.0
...,...,...,...,...,...,...,...
83080,0.116162,0.021849,0.028720,0.000000,0.0556,0.0,0.0
83081,0.149015,0.087024,0.155382,0.000000,0.1305,0.0,0.0
83082,0.207826,0.218567,0.240943,0.000000,0.2224,0.0,0.0
83083,0.536780,0.384340,0.498938,0.526589,0.4734,1.0,1.0


In [31]:
# valid : 로그 변환
regressionSummary(result_valid.real, result_valid.predict)


Regression statistics

               Mean Error (ME) : 0.0099
Root Mean Squared Error (RMSE) : 0.2926
     Mean Absolute Error (MAE) : 0.2094


In [32]:
# valid : 로그 변환 X
regressionSummary(result_valid.notlog_real, result_valid.notlog_predict)


Regression statistics

               Mean Error (ME) : 0.2090
Root Mean Squared Error (RMSE) : 0.6091
     Mean Absolute Error (MAE) : 0.2881


### test

In [33]:
# BayesianRidge 이용해서 예측
bayes_predict = bayesianRidge.predict(test_X).tolist()
# lgbm 이용해서 예측
lgbm_predict = lgbm.predict(test_X).tolist()
# randomforest 이용해서 예측
rf_predict = rf.predict(test_X).tolist()

In [35]:
result_test = pd.DataFrame({"BayesianRidge": bayes_predict, "RandomForest": rf_predict, "LGBM": lgbm_predict, "real": test_y["18~20_ride"].tolist()})

In [36]:
total_predict = []

for i in range(len(result_test)):
    prediction = 0
    for j in range (3):
        prediction += result_test.iloc[i, j]
    prediction_mean = round(prediction / 3, 4)
    total_predict.append(prediction_mean)

In [37]:
result_test["predict"] = total_predict

In [38]:
# 로그변환해줬기 때문에 역변환 
result_test["notlog_predict"] = round(expm1(result_test.predict),0)

In [39]:
# 로그변환한거 역변환하면 왜 정수 안나오지 그냥 반올림 함
result_test["notlog_real"] = round(expm1(result_test.real), 0)

In [40]:
result_test

Unnamed: 0,BayesianRidge,RandomForest,LGBM,real,predict,notlog_predict,notlog_real
0,0.276731,0.376825,0.353493,0.959135,0.3357,0.0,2.0
1,0.125071,0.182927,0.133464,0.000000,0.1472,0.0,0.0
2,0.211983,0.178017,0.127704,0.000000,0.1726,0.0,0.0
3,0.069496,0.065628,0.079413,0.000000,0.0715,0.0,0.0
4,0.095305,0.125738,0.103107,0.000000,0.1081,0.0,0.0
...,...,...,...,...,...,...,...
83080,0.171799,0.125738,0.125514,0.000000,0.1410,0.0,0.0
83081,0.163109,0.079434,0.099646,0.000000,0.1141,0.0,0.0
83082,0.100438,0.125738,0.118522,0.000000,0.1149,0.0,0.0
83083,0.181041,0.283679,0.197747,0.000000,0.2208,0.0,0.0


In [41]:
# test : 로그 변환
regressionSummary(result_test.real, result_test.predict)


Regression statistics

               Mean Error (ME) : 0.0114
Root Mean Squared Error (RMSE) : 0.2927
     Mean Absolute Error (MAE) : 0.2097


In [42]:
# test : 로그 변환 X
regressionSummary(result_test.notlog_real, result_test.notlog_predict)


Regression statistics

               Mean Error (ME) : 0.2108
Root Mean Squared Error (RMSE) : 0.6083
     Mean Absolute Error (MAE) : 0.2890


lgbm이랑 비슷하긴 하지만 lgbm 돌렸을 때보다 ensemble 더 안좋게 나옴 ㅜㅜ(로그변환한 값으로 regression 돌렸을 때도)
우리가 로그변환했기 때문에 다시 역변환 해줘야하는데 더 rmse가 높아짐 + 실제 y값 로그변환했을 때 정수가 안나와서 실제로 맞는지 모르겠음..

## submission

In [44]:
test = pd.read_csv("C:/Users/kimch/Desktop/229255_bus_riders_at_rush_hour_data/preprocessed_test.csv")

In [45]:
test

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,date,bus_route_id,station_code,station_name,latitude,longitude,6~7_ride,...,region_n,region_o,region_s,region_w,in_out_시내,in_out_시외,rain_0,rain_1,rain_2,rain_3
0,0,0,415423,2019-10-01,4270000,344,제주썬호텔,33.48990,126.49373,0.959135,...,1,0,0,0,0,1,0,0,1,0
1,1,1,415424,2019-10-01,4270000,357,한라병원,33.48944,126.48508,0.526589,...,1,0,0,0,0,1,0,0,1,0
2,2,2,415425,2019-10-01,4270000,432,정존마을,33.48181,126.47352,0.741276,...,1,0,0,0,0,1,0,0,1,0
3,3,3,415426,2019-10-01,4270000,1579,제주국제공항(600번),33.50577,126.49252,0.526589,...,1,0,0,0,1,0,0,0,1,0
4,4,4,415427,2019-10-01,4270000,1636,롯데호텔,33.24872,126.41032,0.000000,...,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228165,228165,228165,643588,2019-10-16,32820000,786,고산환승정류장(고산1리),33.30073,126.18044,0.000000,...,0,0,0,1,1,0,1,0,0,0
228166,228166,228166,643589,2019-10-16,32820000,1080,애월고등학교,33.46262,126.33447,0.000000,...,0,0,0,1,1,0,1,0,0,0
228167,228167,228167,643590,2019-10-16,32820000,1129,한림환승정류장(한림리),33.41437,126.26336,0.869742,...,0,0,0,1,1,0,1,0,0,0
228168,228168,228168,643591,2019-10-16,32820000,1564,제주시외버스터미널,33.49946,126.51479,0.869742,...,1,0,0,0,1,0,1,0,0,0


In [46]:
test = test.drop(columns=["Unnamed: 0", "Unnamed: 0.1"])
test.set_index("id", inplace=True)
test["date"] = pd.to_datetime(test.date, format= "%Y-%m-%d")

In [49]:
X = test[['bus_route_id','latitude','longitude', '6~7_ride','7~8_ride', '8~9_ride','9~10_ride','10~11_ride','11~12_ride','6~7_takeoff',
           '7~8_takeoff','8~9_takeoff','9~10_takeoff','10~11_takeoff','11~12_takeoff','region_e', 'region_n', 'region_o', 'region_s', 
           'region_w', 'in_out_시외', 'rain_1', 'rain_2', 'rain_3']]

In [50]:
submission = lgbm.predict(X)

In [52]:
submission_df = pd.read_csv("C:/Users/kimch/Desktop/229255_bus_riders_at_rush_hour_data/submission_sample.csv")

In [58]:
submission_df["18~20_ride"] = submission.tolist()

In [59]:
# 로그변환한 값이 18~20_ride로 들어가있음
submission_df

Unnamed: 0,id,18~20_ride
0,415423,0.906031
1,415424,1.050635
2,415425,0.721517
3,415426,1.344932
4,415427,0.084512
...,...,...
228165,643588,0.012978
228166,643589,0.008891
228167,643590,0.008708
228168,643591,0.046969


In [60]:
# 로그변환 역변환
submission_df["18~20_ride"] = round(submission_df["18~20_ride"], 0)

In [61]:
submission_df

Unnamed: 0,id,18~20_ride
0,415423,1.0
1,415424,1.0
2,415425,1.0
3,415426,1.0
4,415427,0.0
...,...,...
228165,643588,0.0
228166,643589,0.0
228167,643590,0.0
228168,643591,0.0


### PPT  
1. 프로젝트 소개
- 주제
- 변수
- 평가기준 : rmse
  
2. EDA
- 위도, 경도별 지역 시각화
- bus_route_id 시각화
- 날짜별 데이터
- 날씨
  
3. 전처리
- 지역 변수 생성
- 누적 강수량 : 결측치 처리 -> 0 으로 대체
- 요일, 휴일 변수 생성
- bus_route_id
- 범주형 : 원핫 인코딩 + 라벨 인코딩 
- 수치형 : 로그 스케일링
- 변수 선택  

4. 모델적합
- linear regression, lasso, ridge, elasticnet, decision tree regressor
- random forest regressor
- lgbm regressor
- xgboost regressor
- (앙상블)
- 최종 모델