**Predict the number of bikes available.**

# IMPORTS

## Standards

In [1]:
from pathlib import Path

## Externals

In [2]:
import pandas as pd
import sklearn as sk
import plotly.io as pio
import plotly.express as px
from sklearn import compose, ensemble, metrics, model_selection, pipeline, preprocessing

# CONFIGS

## Seeds

In [3]:
RANDOM = 42

## Folders

In [4]:
ROOT = Path("../")
DATA = str(ROOT / "data")
CACHE  = str(ROOT / ".cache")

## Columns

In [5]:
INDEX = "instant"
TARGET = "cnt"

## Datasets

In [6]:
HOUR = f"{DATA}/hour.csv"

## Splittings

In [7]:
SPLITS = 4
SHUFFLE = False  # required (time sensitive)
TEST_SIZE = 24 * 30 * 2 # use 2 months for backtesting

## Searchings

In [8]:
SCORING = "neg_mean_squared_error"
PARAM_GRID = {
    "regressor__max_depth": [15, 20, 25],
    "regressor__n_estimators": [150, 200, 250],
}

# OPTIONS

## Pandas

In [9]:
# display all rows/columns
pd.options.display.max_rows = None
pd.options.display.max_columns = None

## Plotly

In [10]:
# change the default theme
pio.templates.default = "plotly_dark"

## Sklearn

In [11]:
# force sklearn return pd.DataFrame
sk.set_config(transform_output="pandas")

# DATASETS

## Hour

In [12]:
hour = pd.read_csv(HOUR, index_col=INDEX)
print("Hour:", hour.shape)
hour.head()

Hour: (17379, 16)


Unnamed: 0_level_0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [13]:
hour.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17379 entries, 1 to 17379
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   dteday      17379 non-null  object 
 1   season      17379 non-null  int64  
 2   yr          17379 non-null  int64  
 3   mnth        17379 non-null  int64  
 4   hr          17379 non-null  int64  
 5   holiday     17379 non-null  int64  
 6   weekday     17379 non-null  int64  
 7   workingday  17379 non-null  int64  
 8   weathersit  17379 non-null  int64  
 9   temp        17379 non-null  float64
 10  atemp       17379 non-null  float64
 11  hum         17379 non-null  float64
 12  windspeed   17379 non-null  float64
 13  casual      17379 non-null  int64  
 14  registered  17379 non-null  int64  
 15  cnt         17379 non-null  int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 2.3+ MB


# ANALYSIS

## Tables

In [14]:
hour.describe(include="all")

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,17379,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
unique,731,,,,,,,,,,,,,,,
top,2011-01-01,,,,,,,,,,,,,,,
freq,24,,,,,,,,,,,,,,,
mean,,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0


## Figures

In [15]:
px.scatter_matrix(
    hour, dimensions=["registered", "casual", "cnt", "mnth", "hr"], color=TARGET,
    height=800, title="Analysis of top features",
)

# SUBSETS

## Columns

In [16]:
inputs, targets = hour.drop(columns=TARGET), hour[TARGET]
print('Inputs:', inputs.shape, '; Targets:', targets.shape)

Inputs: (17379, 15) ; Targets: (17379,)


## Rows

In [17]:
inputs_train, inputs_test, targets_train, targets_test = model_selection.train_test_split(
    inputs, targets, shuffle=SHUFFLE, test_size=TEST_SIZE, random_state=RANDOM
)
print("[TRAIN] Inputs:", inputs_train.shape, "; Targets:", targets_train.shape)
print("[TEST] Inputs:", inputs_test.shape, "; Targets:", targets_test.shape)

[TRAIN] Inputs: (15939, 15) ; Targets: (15939,)
[TEST] Inputs: (1440, 15) ; Targets: (1440,)


In [18]:
assert inputs_train.index.max() < inputs_test.index.min(), "Inputs train should be before inputs test"
assert targets_train.index.max() < targets_test.index.min(), "Targets train should be before targets test"

# MODELS

## Features

In [19]:
categoricals = [
    "season",
    "weathersit",
]
assert all(col in inputs.columns for col in categoricals), "All categorical columns should be in inputs."

In [20]:
numericals = [
    "yr",
    "mnth",
    "hr",
    "holiday",
    "weekday",
    "workingday",
    "temp",
    "atemp",
    "hum",
    "windspeed",
    "casual",
    # "registered", # too correlated with target
]
assert all(col in inputs.columns for col in numericals), "All numerical columns should be in inputs."

In [21]:
assert not (set(categoricals) & set(numericals)), "Feature columns should not overlap."

## Pipelines

In [22]:
draft = pipeline.Pipeline(
    steps=[
        ("transformer", compose.ColumnTransformer([
            ("categoricals", preprocessing.OneHotEncoder(
                sparse_output=False, handle_unknown="ignore"
            ), categoricals),
            ("numericals", "passthrough", numericals),
        ], remainder="drop")),
        ("regressor", ensemble.RandomForestRegressor(random_state=RANDOM)),
    ],
    memory=CACHE,
)
draft

# TUNING

## Splits

In [23]:
splitter = model_selection.TimeSeriesSplit(n_splits=SPLITS, test_size=TEST_SIZE)
for train_index, test_index in splitter.split(inputs_train): # test splitter generation
    print(f"Train: {train_index.min()} - {train_index.max()}; Test: {test_index.min()} - {test_index.max()}")

Train: 0 - 10178; Test: 10179 - 11618
Train: 0 - 11618; Test: 11619 - 13058
Train: 0 - 13058; Test: 13059 - 14498
Train: 0 - 14498; Test: 14499 - 15938


## Search

In [24]:
search = model_selection.GridSearchCV(
    estimator=draft, cv=splitter, param_grid=PARAM_GRID, scoring=SCORING, verbose=1
)
search.fit(inputs_train, targets_train)

Fitting 4 folds for each of 9 candidates, totalling 36 fits



Persisting input arguments took 1.02s to run.If this happens often in your code, it can cause performance problems (results will be correct in all cases). The reason for this is probably some large input arguments for a wrapped function.



## Results

In [25]:
results = pd.DataFrame(search.cv_results_)
results = results.sort_values(by="rank_test_score")
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__max_depth,param_regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
4,9.851526,2.065956,0.07121,0.029418,20,200,"{'regressor__max_depth': 20, 'regressor__n_est...",-8284.760118,-2926.384331,-1723.714026,-3418.334584,-4088.298265,2500.022302,1
1,6.918022,0.88634,0.043904,0.00506,15,200,"{'regressor__max_depth': 15, 'regressor__n_est...",-8269.045119,-2919.790679,-1716.194466,-3461.907597,-4091.734465,2493.153059,2
5,11.964708,1.272631,0.07448,0.013998,20,250,"{'regressor__max_depth': 20, 'regressor__n_est...",-8315.603394,-2950.517447,-1711.342231,-3415.194453,-4098.164381,2513.327138,3
7,8.290697,1.105596,0.049447,0.004403,25,200,"{'regressor__max_depth': 25, 'regressor__n_est...",-8338.472693,-2934.279157,-1709.411233,-3440.693101,-4105.714046,2523.540444,4
2,8.716824,1.402426,0.05511,0.008181,15,250,"{'regressor__max_depth': 15, 'regressor__n_est...",-8302.60278,-2948.133556,-1712.543116,-3464.4694,-4106.937213,2504.611918,5


# TRAINING

## Final

In [26]:
final = search.best_estimator_
final

## Params

In [27]:
print(final.get_params())

{'memory': '../.cache', 'steps': [('transformer', ColumnTransformer(transformers=[('categoricals',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse_output=False),
                                 ['season', 'weathersit']),
                                ('numericals', 'passthrough',
                                 ['yr', 'mnth', 'hr', 'holiday', 'weekday',
                                  'workingday', 'temp', 'atemp', 'hum',
                                  'windspeed', 'casual'])])), ('regressor', RandomForestRegressor(max_depth=20, n_estimators=200, random_state=42))], 'verbose': False, 'transformer': ColumnTransformer(transformers=[('categoricals',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse_output=False),
                                 ['season', 'weathersit']),
                                ('numericals', 'passthrou

# INFERENCE

## Predictions

In [28]:
predictions = pd.Series(final.predict(inputs_test), index=inputs_test.index)
print(predictions.shape)
predictions.head()

(1440,)


instant
15940    236.873333
15941    281.488333
15942    208.511500
15943    110.450000
15944     63.895000
dtype: float64

## Statistics

In [29]:
predictions.describe()

count    1440.000000
mean      189.024203
std       167.663745
min         3.172096
25%        47.128750
50%       153.216954
75%       275.609534
max       801.500000
dtype: float64

# EVALUATION

## Rank

In [30]:
px.line(results, x="rank_test_score", y="mean_test_score", title="Rank by test score")

## Params

In [31]:
dimensions = [col for col in results.columns if col.startswith("param_")]
px.parallel_categories(results, dimensions=dimensions, color="mean_test_score", title="Params by test score")

## Predictions

In [32]:
score = metrics.mean_squared_error(targets_test, predictions)
score

4706.147416021958

## Prediction errors

In [33]:
errors = pd.concat([targets_test, predictions], axis="columns", keys=["targets", "predictions"])
errors["error"] = errors["targets"] - errors["predictions"]
errors.head()

Unnamed: 0_level_0,targets,predictions,error
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15940,296,236.873333,59.126667
15941,267,281.488333,-14.488333
15942,202,208.5115,-6.5115
15943,120,110.45,9.55
15944,50,63.895,-13.895


In [34]:
px.histogram(errors, x="error", title="Distribution of errors")

## Features importances

In [35]:
importances = pd.Series(
    final.named_steps["regressor"].feature_importances_,
    index=final[:-1].get_feature_names_out(),
).sort_values(ascending=False)
print(importances.shape)
importances.head()

(19,)


numericals__casual        0.593856
numericals__hr            0.239171
numericals__workingday    0.070006
numericals__yr            0.050373
numericals__mnth          0.013183
dtype: float64

In [36]:
px.bar(importances, title="Feature importances")

# DIAGNOSTIC

## Learning Curve

In [37]:
train_size, train_scores, test_scores = model_selection.learning_curve(
    final, inputs, targets, cv=splitter, scoring=SCORING, random_state=RANDOM,
)
learning = pd.DataFrame(
    {
        "train_size": train_size,
        "mean_test_score": test_scores.mean(axis=1),
        "mean_train_score": train_scores.mean(axis=1),
    }
)
px.line(learning, x="train_size", y=["mean_test_score", "mean_train_score"], title="Learning Curve")


Persisting input arguments took 0.54s to run.If this happens often in your code, it can cause performance problems (results will be correct in all cases). The reason for this is probably some large input arguments for a wrapped function.



## Validation Curve

In [38]:
for param_name, param_range in PARAM_GRID.items():
    print(f"Validation Curve for: {param_name} -> {param_range}")
    train_scores, test_scores = model_selection.validation_curve(
        final, inputs, targets, cv=splitter, scoring=SCORING,
        param_name=param_name, param_range=param_range,
    )
    validation = pd.DataFrame(
        {
            "param_value": param_range,
            "mean_test_score": test_scores.mean(axis=1),
            "mean_train_score": train_scores.mean(axis=1),
        }
    )
    curve = px.line(
        validation, x="param_value", y=["mean_test_score", "mean_train_score"], title=f"Validation Curve: {param_name}"
    )
    curve.show()

Validation Curve for: regressor__max_depth -> [15, 20, 25]


Validation Curve for: regressor__n_estimators -> [150, 200, 250]
