# Kaggle competition: Bike Sharing Demand
Jens Hahn

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, KBinsDiscretizer, FunctionTransformer, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.linear_model import PoissonRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the training data; the first column (index 0) is parsed as datetimes.
df = pd.read_csv('data/train.csv', parse_dates=[0])

In [None]:
# The target is the 'count' column. The features are every other column
# except 'casual' and 'registered', which are dropped together with the target.
y = df['count']
X = df.drop(columns=['count', 'casual', 'registered'])

In [None]:
# Hold out part of the data for evaluation (train_test_split's default size).
# A fixed random_state makes the split, and every score derived from it,
# reproducible across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
def extract_dates(frame):
    """Replace the leading datetime column with month, year and hour columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose first column has a datetime dtype (it is read through
        the ``.dt`` accessor).

    Returns
    -------
    pandas.DataFrame
        A new frame containing every column of ``frame`` except the first,
        with ``month``, ``year`` and ``hour`` columns derived from that
        first column.
    """
    timestamps = frame.iloc[:, 0]
    # Work on a copy so the caller's frame is left untouched. Writing new
    # columns into the input would modify data the caller still owns and can
    # raise SettingWithCopyWarning when the input is a slice of another frame.
    features = frame.iloc[:, 1:].copy()
    features['month'] = timestamps.dt.month
    features['year'] = timestamps.dt.year
    features['hour'] = timestamps.dt.hour
    return features

In [None]:
def reduce_weather(frame):
    """Return a copy of ``frame`` in which every value 4 is replaced by 3.

    Used to merge weather category 4 into category 3.
    """
    merged_codes = {4: 3}
    return frame.replace(merged_codes)

In [None]:
# Datetime column -> month/year/hour columns -> one-hot encoding
# (the first category of each feature is dropped).
date_pipe = Pipeline(
    steps=[
        ('extract', FunctionTransformer(extract_dates)),
        ('ohe', OneHotEncoder(drop='first')),
    ]
)

# Weather code -> merge category 4 into 3 -> one-hot encoding
# (first category dropped).
weather_pipe = Pipeline(
    steps=[
        ('reduce', FunctionTransformer(reduce_weather)),
        ('ohe', OneHotEncoder(drop='first')),
    ]
)

# Wind speed -> Yeo-Johnson power transform -> discretisation into 10 bins.
wind_pipe = Pipeline(
    steps=[
        ('transform', PowerTransformer(method='yeo-johnson')),
        ('bins', KBinsDiscretizer(n_bins=10)),
    ]
)

In [None]:
# Route each group of input columns through its own preprocessing step.
# Columns that are not listed here are dropped (remainder='drop').
ct = ColumnTransformer(
    transformers=[
        # ('ohe', OneHotEncoder(drop='first'), ['season']),
        ('wind', wind_pipe, ['windspeed']),
        ('bin', KBinsDiscretizer(n_bins=10), ['atemp', 'humidity']),
        ('pass', 'passthrough', ['workingday', 'holiday']),
        ('date', date_pipe, ['datetime']),
        ('weather', weather_pipe, ['weather']),
    ],
    remainder='drop',
)

In [None]:
# Full model: column preprocessing, then interaction-only polynomial
# expansion up to degree 4, then a Poisson regressor.
complete_pipe = Pipeline(
    steps=[
        ('ct', ct),
        ('expansion', PolynomialFeatures(degree=4, interaction_only=True)),
        ('model', PoissonRegressor(alpha=0.2)),
        # Alternative model kept for experimentation:
        # ('model', RandomForestRegressor(max_depth=5))
    ]
)

In [None]:
# Fit the preprocessing steps and the model on the training split.
complete_pipe.fit(X_train, y_train)

In [None]:
def my_scorer(y_true, y_pred, **kwargs):
    """Return the root mean squared logarithmic error (RMSLE).

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.
    **kwargs
        Forwarded unchanged to ``mean_squared_log_error``
        (for example ``sample_weight``).

    Returns
    -------
    float
        Square root of the mean squared logarithmic error.

    Notes
    -----
    The root is taken explicitly instead of passing ``squared=False``. That
    keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so using
    it raises ``TypeError`` on current releases. For a single target column
    the result is identical. With several targets and averaged output, the
    root of the averaged error is returned.
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred, **kwargs))

In [None]:
# Wrap the RMSLE metric as a scorer object. greater_is_better=False marks it
# as an error measure, where lower values are better.
rmsle_scorer = make_scorer(my_scorer, greater_is_better=False)

In [None]:
# Score the fitted pipeline on the training split (the same data it was
# fitted on).
# NOTE(review): the scorer was built with greater_is_better=False, so the
# value shown is presumably the negated RMSLE — confirm.
rmsle_scorer(complete_pipe, X_train, y_train)

In [None]:
# Hyper-parameter grid for the search. Keys use the '<step>__<parameter>'
# form to address the 'model' and 'expansion' steps of complete_pipe.
param_grid = {
    'model__alpha': [0.01, 0.1, 1.0, 10.0],
    'expansion__degree': [2, 3, 4],
    'expansion__interaction_only': [True, False],
}

In [None]:
# Grid search over param_grid, scored with the RMSLE scorer. Training scores
# are recorded as well, and the search runs with 8 parallel jobs.
gscv = GridSearchCV(
    estimator=complete_pipe,
    param_grid=param_grid,
    scoring=rmsle_scorer,
    return_train_score=True,
    n_jobs=8,
)

In [None]:
# Run the grid search on the training split.
gscv.fit(X_train, y_train)

In [None]:
# Score of the best parameter combination found by the grid search.
# NOTE(review): the scorer uses greater_is_better=False, so this is
# presumably the negated RMSLE — confirm.
gscv.best_score_

In [None]:
# Parameter combination that achieved the best score.
gscv.best_params_

## Kaggle prediction

In [None]:
# Load the Kaggle test data; the first column (index 0) is parsed as datetimes.
df_kaggle = pd.read_csv('data/test.csv', parse_dates=[0])

In [None]:
# Work on a copy so df_kaggle itself stays unmodified.
X_kaggle = df_kaggle.copy()

In [None]:
# Predict with the directly fitted pipeline (fixed hyper-parameters).
# NOTE: the following cell reassigns y_kaggle from the grid-search estimator.
y_kaggle = complete_pipe.predict(X_kaggle)

In [None]:
# Predict with the best estimator found by the grid search. This replaces
# any earlier value of y_kaggle.
y_kaggle = gscv.best_estimator_.predict(X_kaggle)

In [None]:
# Build the submission table: one 'count' column of predictions, indexed by
# the datetime column of the test data.
sol = pd.DataFrame(
    data=y_kaggle,
    index=df_kaggle['datetime'],
    columns=['count'],
)

In [None]:
# Preview the first rows of the submission table.
sol.head()

In [None]:
# Write the submission to 'solution.csv' (the index is written by default).
sol.to_csv('solution.csv')