# Kaggle competition: Bike Sharing Demand
Jens Hahn

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import (
    FunctionTransformer,
    KBinsDiscretizer,
    OneHotEncoder,
    PolynomialFeatures,
    StandardScaler,
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.linear_model import PoissonRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the training data; the first column (position 0) is parsed as datetime.
df = pd.read_csv(
    'data/train.csv',
    parse_dates=[0],
)

In [None]:
# Target is the 'count' column. The features are every remaining column except
# 'casual' and 'registered' (presumably partial counts of the target -- confirm).
y = df['count']
X = df.drop(columns=['count', 'casual', 'registered'])

In [None]:
# Hold out a test set with the default split proportions. The seed is fixed so
# that the split, and every score computed from it, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
def extract_dates(frame):
    """Return a copy of ``frame`` with 'month', 'year' and 'day' columns added.

    The calendar fields are read from the first column (position 0) through
    the ``.dt`` accessor, so that column must hold datetime values.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data whose first column contains the timestamps.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three extra columns; the input frame is left
        unmodified so that callers' data (e.g. a train/test slice) is not
        altered as a side effect.
    """
    stamps = frame.iloc[:, 0]
    # assign() builds a new frame instead of writing into the caller's object.
    return frame.assign(
        month=stamps.dt.month,
        year=stamps.dt.year,
        day=stamps.dt.day,
    )

In [None]:
def reduce_weather(frame):
    """Merge weather category 4 into category 3.

    When ``frame`` is a DataFrame with a 'weather' column, only that column is
    changed, so a 4 in any other column (e.g. 'season') is preserved. Any
    other input (a Series, or a frame without a 'weather' column) has every
    4 replaced by 3.

    Parameters
    ----------
    frame : pandas.DataFrame or pandas.Series
        Data containing the weather codes.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        A new object of the same kind; the input is not modified.
    """
    mapping = {4: 3}
    if isinstance(frame, pd.DataFrame) and 'weather' in frame.columns:
        # Restrict the replacement to the weather codes only.
        return frame.assign(weather=frame['weather'].replace(mapping))
    return frame.replace(mapping)

In [None]:
# Adds calendar columns via extract_dates, then one-hot encodes the result.
# A plain function has to be wrapped in FunctionTransformer to serve as a
# pipeline step; ColumnTransformer expects (name, transformer, columns) tuples.
# NOTE(review): extract_dates returns the original columns as well, so they are
# passed to the encoder too -- confirm this is intended.
date_pipe = Pipeline([
    ('extract', FunctionTransformer(extract_dates)),
    ('ohe', OneHotEncoder())
])

# Applies reduce_weather, then one-hot encodes the result.
# A plain function has to be wrapped in FunctionTransformer to serve as a
# pipeline step; ColumnTransformer expects (name, transformer, columns) tuples.
weather_pipe = Pipeline([
    ('reduce', FunctionTransformer(reduce_weather)),
    ('ohe', OneHotEncoder())
])

In [None]:
# Column-wise preprocessing:
#   - 'season' and 'weather' are one-hot encoded,
#   - 'atemp', 'humidity' and 'windspeed' are discretised into bins,
#   - 'workingday' and 'holiday' are passed through unchanged,
#   - every column not listed is dropped.
ct = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(), ['season', 'weather']),
        ('bin', KBinsDiscretizer(), ['atemp', 'humidity', 'windspeed']),
        ('pass', 'passthrough', ['workingday', 'holiday']),
    ],
    remainder='drop',
)

In [None]:
# Full model: column preprocessing -> interaction-only polynomial expansion
# (degree 4) -> Poisson regression.
complete_pipe = Pipeline(
    steps=[
        ('ct', ct),
        ('expansion', PolynomialFeatures(degree=4, interaction_only=True)),
        ('model', PoissonRegressor(alpha=10.0)),
        # Alternative estimator: ('model', RandomForestRegressor(max_depth=5))
    ],
)

In [None]:
# Fit the whole pipeline (preprocessing and model) on the training split.
complete_pipe.fit(X=X_train, y=y_train)

In [None]:
rmsle_scorer(complete_pipe, X_train, y_train)

In [None]:
def my_scorer(y_true, y_pred):
    return mean_squared_log_error(y_true, y_pred, squared=False)

In [None]:
rmsle_scorer = make_scorer(my_scorer, greater_is_better=False)

In [None]:
# Hyper-parameter grid: regularisation strength of the model plus degree and
# interaction mode of the polynomial expansion step.
param_grid = {
    'model__alpha': [0.01, 0.1, 1.0, 10.0],
    'expansion__degree': [2, 3, 4, 5],
    'expansion__interaction_only': [True, False],
}

In [None]:
# Single-candidate grid; running this cell replaces any previously defined
# param_grid, so the search only evaluates alpha = 0.01.
param_grid = {
    'model__alpha': [0.01],
}

In [None]:
# Cross-validated grid search over param_grid, scored with the RMSLE scorer.
gscv = GridSearchCV(
    estimator=complete_pipe,
    param_grid=param_grid,
    scoring=rmsle_scorer,
)

In [None]:
# Run the grid search on the training split.
gscv.fit(X=X_train, y=y_train)

In [None]:
# Mean cross-validated score of the best parameter combination. The search is
# scored with a scorer built using greater_is_better=False, which negates the
# metric, so the value shown here is the negative RMSLE.
gscv.best_score_