In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [2]:
df2 = pd.read_csv('model_final_data.csv', encoding='ANSI')
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286706 entries, 0 to 286705
Data columns (total 46 columns):
Station_code           286706 non-null int64
index                  286706 non-null int64
Day                    286706 non-null int64
Holiday                286706 non-null int64
Typhoon                286706 non-null int64
Route_id               286706 non-null int64
Route_score            286706 non-null float64
In_Out                 286706 non-null object
Station_name           286706 non-null object
Latitude               286706 non-null float64
Longitude              286706 non-null float64
Transfer               286706 non-null int64
Airport                286706 non-null int64
Terminal               286706 non-null int64
School                 286706 non-null int64
Bus_interval           286706 non-null float64
Geton_6.9              286706 non-null int64
Geton_9.12             286706 non-null int64
Getoff_6.9             286706 non-null int64
Getoff_9.12            2

## 학습용 / 테스트용 데이터 준비

In [3]:
df2_fts = df2.drop(['index','Day','Route_id','Station_name','Station_code','Latitude','Longitude','RouteStationCode','In_Out','Week','address','Sat','station_g1','Geton_18.20'], axis = 1)
df2_trg = df2['Geton_18.20'] 

trvl_x2, ts_x2, trvl_y2, ts_y2 = train_test_split(df2_fts, df2_trg, 
                                          test_size = 0.2, shuffle = True, random_state = 1234)

## 교차검증을 사용한 그리드서치 => 최적의 parameter를 찾기 위해

**랜덤포레스트 모델 사용**

In [4]:
# 9:31
rf = RandomForestRegressor()

# Grid Search CV 를 이용해서 파라미터 튜닝과  CV 동시에 실행

random_grid = {
   
    'max_features': [32], 
    'n_estimators': [400],
    'min_samples_split': [5],
    'n_jobs' : [-1],
    'random_state':[1]
}

rfgrid = GridSearchCV(rf, random_grid, cv=3)

rfgrid.fit(trvl_x2, trvl_y2)

print("Test Set Score: {:.2f}".format(rfgrid.score(ts_x2, ts_y2)))
print("Best Cross Validation Score: {:.2f}".format(rfgrid.best_score_))
print("Best Parameter:", rfgrid.best_params_)
print("Best Model:\n", rfgrid.best_estimator_)

Test Set Score: 0.72
Best Cross Validation Score: 0.67
Best Parameter: {'max_features': 32, 'min_samples_split': 5, 'n_estimators': 400, 'n_jobs': -1, 'random_state': 1}
Best Model:
 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=32, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=5, min_weight_fraction_leaf=0.0,
           n_estimators=400, n_jobs=-1, oob_score=False, random_state=1,
           verbose=0, warm_start=False)


## 변수 중요도 평가

In [5]:
importance2 = pd.DataFrame({'feature' : trvl_x2.columns, 'importance' : rfgrid.best_estimator_.feature_importances_}).sort_values(by='importance', ascending=False)
importance2

Unnamed: 0,feature,importance
9,Geton_9.12,0.332561
12,Geton_total,0.08357
7,Bus_interval,0.080239
15,Type_general_geton,0.058786
14,Commuter_count,0.054285
2,Route_score,0.05391
20,station_g3,0.043881
16,Type_others_geton,0.038528
17,Type_general_getoff,0.031739
28,price_up,0.029282


## metrics

In [6]:
y_hat2 = rfgrid.best_estimator_.predict(ts_x2)

mae2 = metrics.mean_absolute_error(ts_y2, y_hat2)
mse2 = metrics.mean_squared_error(ts_y2, y_hat2)
rmse2 = np.sqrt(mse2)

print('mean absolute error : ', mae2)
print('mean squared error : ', mse2)
print('root mean squared error : ', rmse2)

mean absolute error :  1.2491808564306681
mean squared error :  8.795146706376606
root mean squared error :  2.965661259546782
