# Kaggle competition: Bike Sharing Demand
Jens Hahn

In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, KBinsDiscretizer, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.linear_model import PoissonRegressor
from sklearn.ensemble import RandomForestRegressor

In [9]:
import sklearn
sklearn.__version__

'1.2.2'

In [10]:
df = pd.read_csv('data/train.csv', parse_dates=[0])

In [11]:
X = df.drop(['count', 'casual', 'registered'], axis=1)
y = df['count']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [13]:
def extract_dates(frame):
    """extract datetime information"""
    frame['month'] = frame.iloc[:,0].dt.month
    frame['year'] = frame.iloc[:,0].dt.year
    frame['day'] = frame.iloc[:,0].dt.day
    return frame.iloc[:,1:]

In [14]:
def reduce_weather(frame):
    """set weather 4 to 3"""
    return frame.replace({4:3})

In [15]:
date_pipe = Pipeline([
    ('extract', FunctionTransformer(extract_dates)),
    ('ohe', OneHotEncoder(drop='first'))
])

weather_pipe = Pipeline([
    ('reduce', FunctionTransformer(reduce_weather)),
    ('ohe', OneHotEncoder(drop='first'))
])

In [16]:
ct = ColumnTransformer([
    # ('ohe', OneHotEncoder(drop='first'), ['season']),
    ('bin', KBinsDiscretizer(), ['atemp', 'humidity', 'windspeed']),
    ('pass', 'passthrough', ['workingday', 'holiday']),
    ('date', date_pipe, ['datetime']),
    ('weather', weather_pipe, ['weather'])
], remainder='drop')

In [17]:
complete_pipe = Pipeline([
    ('ct', ct),
    ('expansion', PolynomialFeatures(degree=4, interaction_only=True)),
    ('model', PoissonRegressor(alpha=1.0))
    #('model', RandomForestRegressor(max_depth=5))
])

In [18]:
complete_pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [19]:
def my_scorer(y_true, y_pred):
    return mean_squared_log_error(y_true, y_pred, squared=False)

In [20]:
rmsle_scorer = make_scorer(my_scorer, greater_is_better=False, )

In [33]:
rmsle_scorer(gscv.best_estimator_, X_train, y_train)

-0.9141177925717402

In [22]:
param_grid = {'model__alpha': [0.01, 0.1, 1.0, 10.0], 
              'expansion__degree': [2,3,4], }

#'expansion__interaction_only': [True, False]}

In [23]:
gscv = GridSearchCV(complete_pipe, 
                    param_grid=param_grid, 
                    scoring=rmsle_scorer,
                   return_train_score=True, n_jobs=8)

In [28]:
meep = gscv.best_estimator_.predict(X_train)

In [32]:
mean_squared_log_error(y_train, meep, squared=False)

0.9141177925717402

In [24]:
gscv.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATION

In [25]:
gscv.best_score_

-1.2109995846396473

In [26]:
gscv.best_params_

{'expansion__degree': 4, 'model__alpha': 0.1}