In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Switch matplotlib figures drawn later in the notebook to seaborn's default theme.
sns.set()

import warnings
# NOTE(review): this suppresses every warning category for the whole session,
# including NumPy RuntimeWarnings (e.g. log of a non-positive number) and
# scikit-learn deprecation notices. Consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from my_transformer import change_to_datetime, divide_datetime, add_dayofweek, feature_selection, one_hot_encoding, standard_scaler, concat, drop_feature

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

In [4]:
# Baseline regressors with fixed hyper-parameters. Every estimator that accepts
# a seed is given random_state=30.
knn = KNeighborsRegressor(n_neighbors=5)
linear = LinearRegression()
# NOTE(review): max_iter=None relies on how the installed scikit-learn release
# resolves a missing iteration budget; confirm it is still accepted after an upgrade.
sgd = SGDRegressor(max_iter=None, eta0=0.01, penalty='l2', random_state=30)
ridge = Ridge(alpha=1, random_state=30)
lasso = Lasso(alpha=1, random_state=30)
elastic = ElasticNet(alpha=1, l1_ratio=0.5, random_state=30)
dt = DecisionTreeRegressor(max_depth=None, random_state=30)
rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=30, n_jobs=-1)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=30)
# 'auto_deprecated' is scikit-learn's internal placeholder for "gamma was not
# set"; it is not a public option and was removed in later releases. 'auto'
# selects the same value (1 / n_features) and is a documented choice.
svm = SVR(C=1, kernel='rbf', gamma='auto')

In [5]:
from sklearn.metrics import make_scorer

def rmsle(predicted_values, actual_values):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log(1 + predicted) - log(1 + actual)) ** 2))``.

    The metric is symmetric in its two arguments, so the result is the same
    whichever of the two is passed first.

    Parameters
    ----------
    predicted_values : array-like of numbers
        Predicted values.
    actual_values : array-like of numbers
        Observed values, same length as ``predicted_values``.

    Returns
    -------
    numpy.float64
        The RMSLE. Any value <= -1 in either input makes the logarithm
        undefined and the result is NaN or inf.
    """
    # Convert to NumPy arrays so the arithmetic below is element-wise.
    predicted = np.asarray(predicted_values, dtype=float)
    actual = np.asarray(actual_values, dtype=float)

    # log1p(x) equals log(1 + x) but stays accurate when x is very close to
    # zero, where forming 1 + x first would round the small value away.
    log_gap = np.log1p(predicted) - np.log1p(actual)

    # Square the log differences, average them, then take the square root.
    return np.sqrt(np.mean(np.square(log_gap)))

# Wrap rmsle as a scikit-learn scorer object.
# NOTE(review): make_scorer is called without greater_is_better=False, so this
# scorer reports the raw (positive) error and a search that maximises it would
# prefer the worst model. Use it for reporting only; confirm against later cells.
rmsle_scorer = make_scorer(rmsle)

In [6]:
from sklearn.metrics import make_scorer

def neg_rmsle(predicted_values, actual_values):
    """Return the negated root mean squared logarithmic error.

    Computes ``-sqrt(mean((log(1 + predicted) - log(1 + actual)) ** 2))``.
    Negating the error turns it into a "higher is better" quantity.

    The metric is symmetric in its two arguments, so the result is the same
    whichever of the two is passed first.

    Parameters
    ----------
    predicted_values : array-like of numbers
        Predicted values.
    actual_values : array-like of numbers
        Observed values, same length as ``predicted_values``.

    Returns
    -------
    numpy.float64
        The negated RMSLE (always <= 0 for valid input). Any value <= -1 in
        either input makes the logarithm undefined and the result is NaN or inf.
    """
    # Convert to NumPy arrays so the arithmetic below is element-wise.
    predicted = np.asarray(predicted_values, dtype=float)
    actual = np.asarray(actual_values, dtype=float)

    # log1p(x) equals log(1 + x) but stays accurate when x is very close to
    # zero, where forming 1 + x first would round the small value away.
    log_gap = np.log1p(predicted) - np.log1p(actual)

    # Square the log differences, average, take the root, then flip the sign.
    return -np.sqrt(np.mean(np.square(log_gap)))

# Scorer built from neg_rmsle. Because the function already returns the negated
# error, a search that maximises this score selects the lowest RMSLE.
neg_rmsle_scorer = make_scorer(neg_rmsle)

In [7]:
# Restore the three persisted preprocessing objects from the working directory.
# joblib.load unpickles arbitrary Python objects, so only load files you trust.
# NOTE(review): the pickles presumably need the my_transformer classes imported
# above to be importable when they are loaded -- confirm.
preparation, pipeline_cat, pipeline_num = (
    joblib.load(pickle_name)
    for pickle_name in ("preparation.pkl", "pipeline_cat.pkl", "pipeline_num.pkl")
)

In [8]:
# Read the training and test tables from CSV files in the working directory.
# Tip: loading with pd.read_csv("data/train.csv", parse_dates=["datetime"])
# parses the datetime column up front, so no later dtype conversion is needed.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [9]:
# Separate the model inputs from the target: 'count' is what we predict, and
# the three count-type columns are removed from the feature table.
non_feature_columns = ['casual', 'registered', 'count']
y = train['count']
data = train.drop(non_feature_columns, axis=1)

In [10]:
# Run the loaded preparation object over the feature table and preview it; the
# displayed frame carries extra year/month/day/hour/dayofweek columns.
# NOTE(review): `data` is rebound to its own transformed version, so running
# this cell twice feeds already-transformed data back in. Re-run the cell that
# builds `data` first, or restart the kernel.
data = preparation.transform(data)
data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,hour,dayofweek
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,2011,1,1,0,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,2011,1,1,1,5
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,2011,1,1,2,5
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,2011,1,1,3,5
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,2011,1,1,4,5


In [11]:
# Column names grouped by how they are treated: categorical first, numeric second.
feature_cat = [
    'holiday',
    'workingday',
    'weather',
    'year',
    'month',
    'hour',
    'dayofweek',
]
feature_num = [
    'temp',
    'humidity',
    'windspeed',
]

In [12]:
# Build the model input: push the prepared frame through the categorical and
# the numeric pipeline separately, then join the two results with the project
# helper `concat`.
# NOTE(review): both pipelines are only transformed here, never fitted; this
# assumes they were fitted before being pickled -- confirm.
X_cat = pipeline_cat.transform(data)
X_num = pipeline_num.transform(data)
X = concat(X_cat, X_num)

In [13]:
# Randomised hyper-parameter search for a random forest.
# scipy's randint draws integers from [low, high), so n_estimators ranges over
# 1..999 and max_depth over 1..29.
param = dict(
    n_estimators=stats.randint(low=1, high=1000),
    max_depth=stats.randint(low=1, high=30),
)

# Rebinds `rf` to a fresh forest; the sampled parameters are applied on top of it.
rf = RandomForestRegressor(random_state=30)

# 10 sampled candidates, each scored by 5-fold cross-validation with the
# negated RMSLE scorer, using all available cores.
search = RandomizedSearchCV(
    rf,
    param_distributions=param,
    n_iter=10,
    cv=5,
    scoring=neg_rmsle_scorer,
    random_state=30,
    n_jobs=-1,
)
search.fit(X, y)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=30, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019AEBE0EF28>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019AEBE240B8>},
          pre_dispatch='2*n_jobs', random_state=30, refit=True,
          return_train_score='warn', scoring=make_scorer(neg_rmsle),
          verbose=0)

In [14]:
# Show the sampled parameter combination that achieved the best mean CV score.
search.best_params_

{'max_depth': 24, 'n_estimators': 899}

In [16]:
# List every sampled candidate with its mean cross-validated score. The scorer
# returns negated RMSLE, so abs() turns it back into the positive error.
result = search.cv_results_
row_template = '{} : {:.4f}'
for mean_score, params in zip(result["mean_test_score"], result["params"]):
    print(row_template.format(params, abs(mean_score)))

{'max_depth': 6, 'n_estimators': 422} : 1.3487
{'max_depth': 14, 'n_estimators': 814} : 0.8828
{'max_depth': 21, 'n_estimators': 141} : 0.7082
{'max_depth': 24, 'n_estimators': 899} : 0.6840
{'max_depth': 22, 'n_estimators': 764} : 0.6979
{'max_depth': 18, 'n_estimators': 431} : 0.7569
{'max_depth': 4, 'n_estimators': 426} : 1.4424
{'max_depth': 8, 'n_estimators': 888} : 1.2464
{'max_depth': 2, 'n_estimators': 690} : 1.5226
{'max_depth': 14, 'n_estimators': 446} : 0.8836


In [17]:
# Pair each importance of the refitted best forest with its column name and
# display the pairs from most to least important.
feature_importances = search.best_estimator_.feature_importances_
importance_name_pairs = zip(feature_importances, X.columns)
sorted(importance_name_pairs, reverse=True)

[(0.15742381033571048, 'temp'),
 (0.12485890508210071, 'humidity'),
 (0.0955936147621362, 'workingday'),
 (0.0943242406451773, 'hour_17'),
 (0.08074016806683793, 'year'),
 (0.07868086491838505, 'hour_18'),
 (0.06335967570837714, 'hour_8'),
 (0.031156830495310744, 'hour_19'),
 (0.01748325941933721, 'hour_7'),
 (0.015709825015631947, 'hour_1'),
 (0.015305625446356944, 'hour_2'),
 (0.014442305900229898, 'hour_16'),
 (0.014191986134753306, 'hour_3'),
 (0.014077295371444762, 'hour_0'),
 (0.013748440226626243, 'hour_4'),
 (0.012510992734035302, 'hour_5'),
 (0.012319034502139056, 'windspeed'),
 (0.011303837170929455, 'hour_20'),
 (0.011032993959880747, 'hour_9'),
 (0.010297024163384474, 'hour_23'),
 (0.009696930824612005, 'weather_3'),
 (0.007286785111055465, 'hour_6'),
 (0.00634976642278545, 'hour_22'),
 (0.005816175420065543, 'hour_21'),
 (0.005325244258616551, 'month_11'),
 (0.005029750873118724, 'month_12'),
 (0.004768415575268778, 'month_10'),
 (0.004684022580792121, 'hour_10'),
 (0.0043