# Kaggle competition: Bike Sharing Demand
Jens Hahn

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, KBinsDiscretizer, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.linear_model import PoissonRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('data/train.csv', parse_dates=[0])

In [3]:
X = df.drop(['count', 'casual', 'registered'], axis=1)
y = df['count']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [11]:
def extract_dates(frame):
    """extract datetime information"""
    frame['month'] = frame.iloc[:,0].dt.month
    frame['year'] = frame.iloc[:,0].dt.year
    frame['day'] = frame.iloc[:,0].dt.day
    return frame.iloc[:,1:]

In [12]:
def reduce_weather(frame):
    """set weather 4 to 3"""
    return frame.replace({4:3})

In [13]:
date_pipe = Pipeline([
    ('extract', FunctionTransformer(extract_dates)),
    ('ohe', OneHotEncoder())
])

weather_pipe = Pipeline([
    ('reduce', FunctionTransformer(reduce_weather)),
    ('ohe', OneHotEncoder())
])

In [14]:
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(), ['season']),
    ('bin', KBinsDiscretizer(), ['atemp', 'humidity', 'windspeed']),
    ('pass', 'passthrough', ['workingday', 'holiday']),
    ('date', date_pipe, ['datetime']),
    ('weather', weather_pipe, ['weather'])
], remainder='drop')

In [17]:
complete_pipe = Pipeline([
    ('ct', ct),
    ('expansion', PolynomialFeatures(degree=4, interaction_only=True)),
    ('model', PoissonRegressor(alpha=1.0))
    #('model', RandomForestRegressor(max_depth=5))
])

In [18]:
complete_pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [22]:
rmsle_scorer(complete_pipe, X_train, y_train)

-1.0099469742823384

In [20]:
def my_scorer(y_true, y_pred):
    return mean_squared_log_error(y_true, y_pred, squared=False)

In [21]:
rmsle_scorer = make_scorer(my_scorer, greater_is_better=False)

In [None]:
param_grid = {'model__alpha': [0.01, 0.1, 1.0, 10.0], 
              'expansion__degree': [2,3,4,5], 
              'expansion__interaction_only': [True, False]}

In [None]:
param_grid = {'model__alpha': [0.01]}

In [None]:
gscv = GridSearchCV(complete_pipe, 
                    param_grid=param_grid, 
                    scoring=rmsle_scorer,
                   return_train_score=True)

In [None]:
gscv.fit(X_train, y_train)

In [None]:
gscv.best_score_