In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default plot styling to subsequent matplotlib figures.
sns.set()

# NOTE(review): this silences every warning for the rest of the session,
# including NumPy invalid-value warnings and library deprecation notices that
# could point at real problems. Consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from my_transformer import change_to_datetime, divide_datetime, add_dayofweek, feature_selection, one_hot_encoding, standard_scaler, concat, drop_feature

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

In [4]:
# Candidate regressors with fixed hyperparameters. random_state=30 is passed
# to every constructor below that is given a seed.
knn = KNeighborsRegressor(n_neighbors=5)
linear = LinearRegression()
# NOTE(review): max_iter=None looks specific to the older scikit-learn release
# this notebook targets (it imports sklearn.externals.joblib) — confirm it is
# still accepted before upgrading.
sgd = SGDRegressor(max_iter=None, eta0=0.01, penalty='l2', random_state=30)
ridge = Ridge(alpha=1, random_state=30)
lasso = Lasso(alpha=1, random_state=30)
elastic = ElasticNet(alpha=1, l1_ratio=0.5, random_state=30)
dt = DecisionTreeRegressor(max_depth=None, random_state=30)
# `rf` is reassigned with different hyperparameters in the cross-validation
# cell further down, so this instance is not the one that gets scored there.
rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=30, n_jobs=-1)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=30)
# NOTE(review): 'auto_deprecated' appears to be a version-specific placeholder
# value for gamma — verify against the installed scikit-learn version.
svm = SVR(C=1, kernel='rbf', gamma='auto_deprecated')

In [5]:
from sklearn.metrics import make_scorer

def rmsle(predicted_values, actual_values):
    """Return the root mean squared logarithmic error (RMSLE) of two value sets.

    The score is ``sqrt(mean((log(1 + predicted) - log(1 + actual)) ** 2))``.
    Because the difference is squared, swapping the two arguments gives the
    same result.

    Parameters
    ----------
    predicted_values : array-like
        Predicted values; converted to a NumPy array.
    actual_values : array-like
        Observed values; converted to a NumPy array and combined element-wise
        with ``predicted_values``.

    Returns
    -------
    NumPy floating scalar
        The RMSLE; ``0.0`` when both inputs are identical. Values at or below
        -1 are outside the domain of ``log(1 + x)`` and yield non-finite
        results.
    """
    predicted_values = np.asarray(predicted_values)
    actual_values = np.asarray(actual_values)

    # log1p(x) evaluates log(1 + x) without first rounding 1 + x, so values
    # close to zero keep their precision (np.log(x + 1) collapses them to 0).
    log_difference = np.log1p(predicted_values) - np.log1p(actual_values)

    # Square, average, then take the root.
    return np.sqrt(np.mean(np.square(log_difference)))

# Wrap rmsle so it can be passed as `scoring=` (used by cross_val_score below).
# NOTE(review): make_scorer is documented to call score_func(y_true, y_pred)
# and to default to greater_is_better=True. rmsle is symmetric, so argument
# order does not matter, but a higher score here means a worse model —
# presumably intended for reporting only, not for model selection; confirm.
rmsle_scorer = make_scorer(rmsle)

In [6]:
from sklearn.metrics import make_scorer

def neg_rmsle(predicted_values, actual_values):
    """Return the negated root mean squared logarithmic error of two value sets.

    The score is ``-sqrt(mean((log(1 + predicted) - log(1 + actual)) ** 2))``,
    so values closer to zero (i.e. larger) indicate a better fit. Because the
    difference is squared, swapping the two arguments gives the same result.

    Parameters
    ----------
    predicted_values : array-like
        Predicted values; converted to a NumPy array.
    actual_values : array-like
        Observed values; converted to a NumPy array and combined element-wise
        with ``predicted_values``.

    Returns
    -------
    NumPy floating scalar
        The negated RMSLE, never greater than zero for valid inputs. Values at
        or below -1 are outside the domain of ``log(1 + x)`` and yield
        non-finite results.
    """
    predicted_values = np.asarray(predicted_values)
    actual_values = np.asarray(actual_values)

    # log1p(x) evaluates log(1 + x) without first rounding 1 + x, so values
    # close to zero keep their precision (np.log(x + 1) collapses them to 0).
    log_difference = np.log1p(predicted_values) - np.log1p(actual_values)

    # Square, average, take the root, then flip the sign.
    return -np.sqrt(np.mean(np.square(log_difference)))

# Scorer built from the negated RMSLE, so a larger score means a smaller error.
# NOTE(review): presumably meant for GridSearchCV / RandomizedSearchCV, which
# are imported above but not used in the visible cells — confirm.
neg_rmsle_scorer = make_scorer(neg_rmsle)

In [7]:
# Load previously saved preprocessing objects from the working directory.
# joblib.load unpickles the files, so only load files from a trusted source.
# NOTE(review): presumably the my_transformer classes imported above must be
# importable for these pickles to load — confirm.
preparation = joblib.load("preparation.pkl")
# This loaded pipeline_cat is replaced further down, where pipeline_cat is
# rebuilt with make_pipeline and dumped to the same file name.
pipeline_cat = joblib.load("pipeline_cat.pkl")
pipeline_num = joblib.load("pipeline_num.pkl")

In [8]:
# Read the training and test sets from CSV files in the working directory.
train = pd.read_csv('train.csv')
# Alternative: train = pd.read_csv("data/train.csv", parse_dates=["datetime"])
# loads the datetime column already parsed, so its dtype does not need to be
# converted afterwards.
test = pd.read_csv('test.csv')

In [9]:
# Predictors: every training column except the three count columns.
data = train.drop(columns=['casual', 'registered', 'count'])
# Target: the 'count' column.
y = train['count']

In [10]:
# Run the predictors through the loaded `preparation` object. The displayed
# head shows year, month, day, hour and dayofweek columns next to the
# original ones.
# NOTE(review): `data` is overwritten with its own transformed version, so
# re-running this cell alone feeds already-prepared data back into
# preparation.transform — confirm that is harmless or use a new name.
data = preparation.transform(data)
data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,hour,dayofweek
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,2011,1,1,0,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,2011,1,1,1,5
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,2011,1,1,2,5
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,2011,1,1,3,5
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,2011,1,1,4,5


In [13]:
# Columns handed to feature_selection in the categorical pipeline.
feature_cat = [
    'holiday',
    'workingday',
    'weather',
    'year',
    'month',
    'hour',
    'dayofweek',
]
# Numeric columns — presumably the ones handled by pipeline_num; TODO confirm.
feature_num = [
    'temp',
    'humidity',
    'windspeed',
]

In [14]:
# Rebuild the categorical pipeline from the project transformers: select the
# feature_cat columns, then apply one_hot_encoding for 'weather', 'month' and
# 'hour'. This replaces the pipeline_cat object loaded from disk earlier.
# NOTE(review): holiday, workingday, year and dayofweek get no
# one_hot_encoding step here — presumably passed through unchanged; confirm.
pipeline_cat = make_pipeline(
    feature_selection(feature_cat),
    one_hot_encoding('weather', 'weather'),
    one_hot_encoding('month', 'month'),
    one_hot_encoding('hour', 'hour'),
)
# Persist the pipeline, overwriting the "pipeline_cat.pkl" file that was
# loaded at the top of the notebook.
joblib.dump(pipeline_cat, "pipeline_cat.pkl")

['pipeline_cat.pkl']

In [15]:
# Transform the prepared data with the categorical pipeline built above and
# the numeric pipeline loaded from "pipeline_num.pkl", then merge both results
# with the project's concat helper.
# NOTE(review): transform is called without a visible fit on pipeline_cat —
# presumably the custom transformers are stateless; confirm.
X_cat = pipeline_cat.transform(data)
X_num = pipeline_num.transform(data)
X = concat(X_cat, X_num)

In [16]:
# Remove two columns from the feature matrix via the project's drop_feature
# transformer. 'weather_4' is presumably a column created by the weather
# one-hot step — confirm.
remove = ['dayofweek', 'weather_4']
drop = drop_feature(remove)
# NOTE(review): X is overwritten in place of a new name, so re-running this
# cell alone applies the drop to an already-reduced X — confirm drop_feature
# tolerates missing columns.
X = drop.transform(X)

In [17]:
# Replace the earlier `rf` with a forest using max_depth=24 and 899 trees
# (presumably values found by a prior hyperparameter search — confirm).
# Unlike the earlier instance, n_jobs is not set here.
rf= RandomForestRegressor(max_depth=24, n_estimators=899, random_state=30)
# Mean over 5 cross-validation folds of the rmsle_scorer value. This is the
# positive RMSLE, so lower is better.
score = cross_val_score(rf, X, y, cv=5, scoring=rmsle_scorer).mean()

In [18]:
# Display the mean cross-validated RMSLE computed in the previous cell.
score

0.6798837541241446

In [19]:
# Fit the forest on the full feature matrix (cross_val_score does not fit the
# estimator instance it is given), then read its per-feature importances.
rf.fit(X, y)
feature_importances = rf.feature_importances_
# Pair each importance with its column name and list them from most to least
# important; tuples sort by importance first, then by name.
sorted(zip(feature_importances, X.columns), reverse=True)

[(0.1595489385395321, 'temp'),
 (0.12710873079745932, 'humidity'),
 (0.09907618742130062, 'workingday'),
 (0.09441566970197017, 'hour_17'),
 (0.08121552204198251, 'year'),
 (0.07876469144589714, 'hour_18'),
 (0.06365159665640625, 'hour_8'),
 (0.03130218081613901, 'hour_19'),
 (0.018097070875624677, 'hour_7'),
 (0.015807091613066458, 'hour_1'),
 (0.015404101173008285, 'hour_2'),
 (0.014662714563664397, 'windspeed'),
 (0.014521295624794272, 'hour_16'),
 (0.01428692074992563, 'hour_3'),
 (0.014179280380778807, 'hour_0'),
 (0.01384389766514432, 'hour_4'),
 (0.012620725674955617, 'hour_5'),
 (0.011675070211336774, 'hour_20'),
 (0.011260930371133028, 'hour_9'),
 (0.01034980174280633, 'hour_23'),
 (0.01006396597139611, 'weather_3'),
 (0.007444439934953635, 'hour_6'),
 (0.006531878691290068, 'hour_22'),
 (0.005913571297244248, 'hour_21'),
 (0.0055051645471715155, 'month_11'),
 (0.0052722099507864885, 'month_10'),
 (0.005218824807903771, 'month_12'),
 (0.00476411619841305, 'hour_10'),
 (0.00405