In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

In [4]:
knn = KNeighborsRegressor(n_neighbors=5)
linear = LinearRegression()
sgd = SGDRegressor(max_iter=None, eta0=0.01, penalty='l2', random_state=30)
ridge = Ridge(alpha=1, random_state=30)
lasso = Lasso(alpha=1, random_state=30)
elastic = ElasticNet(alpha=1, l1_ratio=0.5, random_state=30)
dt = DecisionTreeRegressor(max_depth=None, random_state=30)
rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=30, n_jobs=-1)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=30)
svm = SVR(C=1, kernel='rbf', gamma='auto_deprecated')

In [5]:
class change_to_datetime(BaseEstimator, TransformerMixin):
    def __init__(self, column_name=None):
        self.column_name = column_name
        
    def fit(self, df_X, y=None):
        return self
    
    def transform(self, df_X):
        df_X[self.column_name] = pd.to_datetime(df_X[self.column_name])
        return df_X

In [6]:
class divide_datetime(BaseEstimator, TransformerMixin):       
    def fit(self, df_X, y=None):
        return self
    
    def transform(self, df_X):
        df_X["year"] = df_X["datetime"].dt.year
        df_X["month"] = df_X["datetime"].dt.month
        df_X["day"] = df_X["datetime"].dt.day
        df_X["hour"] = df_X["datetime"].dt.hour
        return df_X

In [7]:
class add_dayofweek(BaseEstimator, TransformerMixin):       
    def fit(self, df_X, y=None):
        return self
    
    def transform(self, df_X):
        df_X["dayofweek"] = df_X["datetime"].dt.dayofweek
        return df_X

In [8]:
class one_hot_encoding(BaseEstimator, TransformerMixin):
    def __init__(self, column_name=None, prefix=None):
        self.column_name = column_name
        self.prefix = prefix
        
    def fit(self, df_X, y=None):
        return self
    
    def transform(self, df_X):
        onehotencoding = pd.get_dummies(df_X[self.column_name], prefix = self.prefix)
        df_X.drop(self.column_name, axis=1, inplace=True)
        return pd.concat([df_X, onehotencoding], axis=1)

In [9]:
class feature_selection(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        
    def fit(self, df_X, y=None):
        return self
    
    def transform(self, df_X):
        return df_X[self.columns]

In [10]:
class standard_scaler(BaseEstimator, TransformerMixin):      
    def __init__(self, column_name=None, prefix=None):
        self.column_name = column_name
        self.prefix = prefix
        
    def fit(self, df_X, y=None):
        return self
    
    def transform(self, df_X):
        scaler = StandardScaler()
        scaler.fit(df_X)
        X = scaler.transform(df_X)
        df_X = pd.DataFrame(X, columns=df_X.columns, index=df_X.index)
        return df_X

In [11]:
def concat(df_A, df_B) : 
    return pd.concat([df_A, df_B], axis=1)

In [12]:
from sklearn.metrics import make_scorer

def rmsle(predicted_values, actual_values):
    # 넘파이로 배열 형태로 바꿔준다.
    predicted_values = np.array(predicted_values)
    actual_values = np.array(actual_values)
    
    # 예측값과 실제 값에 1을 더하고 로그를 씌워준다.
    log_predict = np.log(predicted_values + 1)
    log_actual = np.log(actual_values + 1)
    
    # 위에서 계산한 예측값에서 실제값을 빼주고 제곱을 해준다.
    difference = log_predict - log_actual
    # difference = (log_predict - log_actual) ** 2
    difference = np.square(difference)
    
    # 평균을 낸다.
    mean_difference = difference.mean()
    
    # 다시 루트를 씌운다.
    score = np.sqrt(mean_difference)
    
    return score

rmsle_scorer = make_scorer(rmsle)
rmsle_scorer

make_scorer(rmsle)

In [13]:
train = pd.read_csv('train.csv')
# train = pd.read_csv("data/train.csv", parse_dates=["datetime"]) 와 같이 불러오면 data type을 변경할 필요 없음
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [14]:
test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [15]:
data = train.drop(['casual', 'registered', 'count'], axis=1)
y = train['count']
data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0


In [16]:
y.head()

0    16
1    40
2    32
3    13
4     1
Name: count, dtype: int64

In [17]:
preparation = make_pipeline(
    change_to_datetime('datetime'),
    divide_datetime(),
    add_dayofweek()
)

In [18]:
joblib.dump(preparation, "preparation.pkl")

['preparation.pkl']

In [19]:
data = preparation.fit_transform(data)
data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,hour,dayofweek
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,2011,1,1,0,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,2011,1,1,1,5
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,2011,1,1,2,5
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,2011,1,1,3,5
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,2011,1,1,4,5


- season과 month는 같이 사용 X
- temp와 atemp는 같이 사용 X

In [20]:
feature_cat = ['holiday', 'workingday', 'weather', 'year', 'month', 'hour', 'dayofweek']
feature_num = ['temp', 'humidity', 'windspeed']

In [21]:
pipeline_cat = make_pipeline(
    feature_selection(feature_cat),
    one_hot_encoding('weather', 'weather'),
    one_hot_encoding('month', 'month'),
    one_hot_encoding('hour', 'hour'),
    one_hot_encoding('dayofweek', 'dayofweek')
)

In [22]:
pipeline_num = make_pipeline(
    feature_selection(feature_num),
    standard_scaler()
)

In [23]:
joblib.dump(pipeline_cat, "pipeline_cat.pkl")
joblib.dump(pipeline_num, "pipeline_num.pkl")

['pipeline_num.pkl']

In [24]:
X_cat = pipeline_cat.transform(data)
X_num = pipeline_num.transform(data)

In [25]:
X_cat.head()

Unnamed: 0,holiday,workingday,year,weather_1,weather_2,weather_3,weather_4,month_1,month_2,month_3,...,hour_21,hour_22,hour_23,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6
0,0,0,2011,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,2011,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,2011,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,2011,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,2011,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [26]:
X_num.head()

Unnamed: 0,temp,humidity,windspeed
0,-1.333661,0.993213,-1.567754
1,-1.438907,0.941249,-1.567754
2,-1.438907,0.941249,-1.567754
3,-1.333661,0.68143,-1.567754
4,-1.333661,0.68143,-1.567754


In [27]:
X = concat(X_cat, X_num)
X.head()

Unnamed: 0,holiday,workingday,year,weather_1,weather_2,weather_3,weather_4,month_1,month_2,month_3,...,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,temp,humidity,windspeed
0,0,0,2011,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,-1.333661,0.993213,-1.567754
1,0,0,2011,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,-1.438907,0.941249,-1.567754
2,0,0,2011,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,-1.438907,0.941249,-1.567754
3,0,0,2011,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,-1.333661,0.68143,-1.567754
4,0,0,2011,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,-1.333661,0.68143,-1.567754


full_pipeline = make_column_transformer(
    (one_hot_encoding('weather', 'weather'), ['weather']),
    (one_hot_encoding('month', 'month'), ['month']),
    (one_hot_encoding('hour', 'hour'), ['hour']),
    (one_hot_encoding('dayofweek', 'dayofweek'), ['dayofweek']),
    (standard_scaler(), feature_num)
)

X = full_pipeline.fit_transform(X)

In [28]:
knn = KNeighborsRegressor(n_neighbors=5)
linear = LinearRegression()
sgd = SGDRegressor(max_iter=None, eta0=0.01, penalty='l2', random_state=30)
ridge = Ridge(alpha=1, random_state=30)
lasso = Lasso(alpha=1, random_state=30)
elastic = ElasticNet(alpha=1, l1_ratio=0.5, random_state=30)
dt = DecisionTreeRegressor(max_depth=None, random_state=30)
rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=30, n_jobs=-1)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=30)
svm = SVR(C=1, kernel='rbf', gamma='auto_deprecated')

In [29]:
models = {
    'knn' : knn,
    'linear' : linear,
    'sgd' : sgd,
    'ridge' : ridge,
    'lasso' : lasso,
    'elastic' : elastic,
    'dt' : dt,
    'rf' : rf,
    'gb' : gb,
    'svm' : svm
}

In [30]:
scores = dict()

for key, model in models.items() : 
    score = cross_val_score(model, X, y, cv=5, scoring=rmsle_scorer).mean()
    
    scores[key] = score

In [31]:
scores

{'knn': 0.9424700305636879,
 'linear': nan,
 'sgd': nan,
 'ridge': nan,
 'lasso': nan,
 'elastic': 1.4115015619514282,
 'dt': 0.7360289795958088,
 'rf': 0.669604824723618,
 'gb': nan,
 'svm': 1.2925824600739628}