# Importation datas

In [209]:
import calendar
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from datetime import datetime
from scipy import stats

from vivadata.datasets.common import get_path_for_dataset

%matplotlib inline
sns.set()

In [210]:
base_path = get_path_for_dataset('bike-sharing/train.csv')
base_path

'/home/fred/code/fred/vivadata-curriculum/cache/datasets/bike-sharing/train.csv'

In [211]:
df = pd.read_csv(os.path.join(base_path))

In [212]:
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


## Suppression des colonnes 'casual' et 'registered'

In [213]:
drop_lst = ['casual', 'registered']
df = df.drop(drop_lst, axis=1)

## Datetime

In [214]:
df['datetime'] = pd.to_datetime(df['datetime'])

In [215]:
df['dow'] = df['datetime'].dt.dayofweek

In [216]:
df['month'] = df['datetime'].dt.month

In [217]:
df['week'] = df['datetime'].dt.week

In [218]:
df['hour'] = df['datetime'].dt.hour

In [219]:
df['year'] = df['datetime'].dt.year

In [220]:
df['day'] = df['datetime'].dt.day

In [221]:
df = df.set_index(df['datetime'])

In [222]:
df = df.drop(labels='datetime', axis=1)

## Suppression de la colonne 'Atemp'

In [223]:
df = df.drop(labels='atemp', axis=1)

## Séparation de la colonne weather en 4 parties

In [224]:
df = pd.get_dummies(df, columns=['weather'])

In [225]:
df = df.drop(labels='weather_4', axis=1)

## Création de features par multiplication de la température et de chaque colonne weather

In [226]:
df['temp_weath_1'] = df['temp'] * df['weather_1']
df['temp_weath_2'] = df['temp'] * df['weather_2']
df['temp_weath_3'] = df['temp'] * df['weather_3']

## #######TRIES

In [227]:
df.head()

Unnamed: 0_level_0,season,holiday,workingday,temp,humidity,windspeed,count,dow,month,week,hour,year,day,weather_1,weather_2,weather_3,temp_weath_1,temp_weath_2,temp_weath_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2011-01-01 00:00:00,1,0,0,9.84,81,0.0,16,5,1,52,0,2011,1,1,0,0,9.84,0.0,0.0
2011-01-01 01:00:00,1,0,0,9.02,80,0.0,40,5,1,52,1,2011,1,1,0,0,9.02,0.0,0.0
2011-01-01 02:00:00,1,0,0,9.02,80,0.0,32,5,1,52,2,2011,1,1,0,0,9.02,0.0,0.0
2011-01-01 03:00:00,1,0,0,9.84,75,0.0,13,5,1,52,3,2011,1,1,0,0,9.84,0.0,0.0
2011-01-01 04:00:00,1,0,0,9.84,75,0.0,1,5,1,52,4,2011,1,1,0,0,9.84,0.0,0.0


# ---------Machine Learning

In [163]:
X = df.loc[:, df.columns != 'count']
y = np.log(df['count'])

In [164]:
X.shape, y.shape

((10886, 25), (10886,))

In [165]:
from sklearn.model_selection import train_test_split

In [166]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [167]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((8708, 25), (8708,), (2178, 25), (2178,))

In [168]:
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, Normalizer, minmax_scale, QuantileTransformer, RobustScaler, PolynomialFeatures
from sklearn.model_selection import KFold, cross_val_score

from xgboost import XGBRegressor

In [169]:
pipelines = []

pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])))
pipelines.append(('ScaledLASSO', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('LASSO', Lasso(random_state=42))])))
pipelines.append(('ScaledRID', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('RID', Ridge(random_state=42))])))
pipelines.append(('ScaledKNN', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor(n_neighbors=2))])))
pipelines.append(('ScaledCART', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor(random_state=42))])))
pipelines.append(('ScaledGBM', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('GBM', GradientBoostingRegressor(random_state=42))])))
pipelines.append(('ScaledRFR', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('RFR', RandomForestRegressor(random_state=42))])))
pipelines.append(('ScaledSVR', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('SVR', SVR(kernel='linear'))])))
pipelines.append(('ScaledXGBR', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('XGBR', XGBRegressor(random_state=42))])))

results = []
names = []
for name, model in pipelines:
    kfold = KFold(random_state=42)
    cv_results = -cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_log_error')
    results.append(np.sqrt(cv_results))
    names.append(name)
    msg = "{}: {} ({})".format(name, cv_results.mean(), cv_results.std())
    print(msg)

ScaledLR: 0.14493978897656898 (0.10575543428240144)
ScaledLASSO: 0.12823473176777392 (0.00552164797667312)
ScaledRID: 0.0625576503948737 (0.0031399107734091125)
ScaledKNN: 0.0731972502662987 (0.0005841063190971454)
ScaledCART: 0.027729320969044974 (0.0018015026067975126)
ScaledGBM: 0.01490763301214106 (0.0010956905865527344)
ScaledRFR: 0.01577991009677268 (0.0010029539418614781)
ScaledSVR: 0.06387134238252624 (0.0029259855687778584)
ScaledXGBR: 0.014787204652752342 (0.0006454313402913859)


In [27]:
# from sklearn.model_selection import GridSearchCV

# poly = PolynomialFeatures().fit_transform(X_train)
# rescaledX = StandardScaler().fit_transform(poly)
# parameters = {
#     'learning_rate': [0.1, 0.3, 0.5, 0.7], #[0, 1] 0.3
#     'max_depth': [6, 7, 8], #[0, inf] 6
#     'min_child_weight': [3, 4, 5], #[0, inf] 1
#     'subsample': [0.6, 0.7, 0.8] #[0, 1] 1
# }
# #colsample_bytree=0.7, learning_rate=0.05, max_depth=7, min_child_weight=4, subsample=0.7
# model = XGBRegressor(random_state=42)
# kfold = KFold(random_state=42)
# grid = GridSearchCV(estimator=model, param_grid=parameters, scoring='neg_mean_squared_log_error', cv=kfold)
# grid_result = grid.fit(rescaledX, y_train)

# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(-means, stds, params):
#     print("{} ({}) with: {}".format(mean, stdev, param))

# print("Best: {} using {}".format(grid_result.best_score_, grid_result.best_params_))

# -----------------------------------------------------------------------------------------------------

# -----------------------------------------------TEST-----------------------------------------------

# -----------------------------------------------------------------------------------------------------

In [186]:
base_path = get_path_for_dataset('bike-sharing/test.csv')
base_path

'/home/fred/code/fred/vivadata-curriculum/cache/datasets/bike-sharing/test.csv'

In [187]:
df_test = pd.read_csv(os.path.join(base_path))

In [188]:
df_test['datetime'] = pd.to_datetime(df_test['datetime'])

In [189]:
df_test['dow'] = df_test['datetime'].dt.dayofweek

In [190]:
df_test['month'] = df_test['datetime'].dt.month

In [191]:
df_test['week'] = df_test['datetime'].dt.week

In [192]:
df_test['hour'] = df_test['datetime'].dt.hour

In [193]:
df_test['year'] = df_test['datetime'].dt.year

In [194]:
df_test['day'] = df_test['datetime'].dt.day

In [195]:
df_test = df_test.set_index(df_test['datetime'])

In [196]:
df_test = df_test.drop(labels='datetime', axis=1)

In [197]:
df_test = df_test.drop(labels='atemp', axis=1)

In [198]:
df_test = pd.get_dummies(df_test, columns=['weather'])

In [199]:
df_test = df_test.drop(labels='weather_4', axis=1)

In [200]:
df_test['temp_weath_1'] = df_test['temp'] * df_test['weather_1']
df_test['temp_weath_2'] = df_test['temp'] * df_test['weather_2']
df_test['temp_weath_3'] = df_test['temp'] * df_test['weather_3']

In [203]:
standardscaler = StandardScaler()
#model = GradientBoostingRegressor()
model = XGBRegressor(colsample_bytree=0.7, learning_rate=0.05, max_depth=7, min_child_weight=4, subsample=0.7, random_state=42)

In [204]:
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=7, min_child_weight=4, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7)

In [205]:
model.predict(df_test)

array([2.6056697, 1.6949611, 1.4231635, ..., 4.674358 , 4.40491  ,
       3.878468 ], dtype=float32)

In [206]:
pipe = Pipeline([('poly', PolynomialFeatures()), ('StandardScaler', standardscaler), ('XGBR', model)])
pipe.fit(X_train, y_train)
y_pred = np.exp(pipe.predict(df_test))
y_pred

array([ 12.859673 ,   4.528692 ,   3.5026782, ..., 133.42244  ,
        98.83615  ,  62.710434 ], dtype=float32)

In [207]:
df_sample_submission = pd.DataFrame({'datetime': df_test.index, 'count': y_pred})

In [208]:
sample_submission = pd.DataFrame.to_csv(df_sample_submission, index=False)
with open ('../submissions/sample_submission.csv', 'w') as f:
    f.write(sample_submission)

**Score: 0.40163**

**244/3251**