# Naive Baseline for Bike Count Prediction

This notebook implements a simple baseline model for predicting bike counts in Paris. For each counter, the baseline predicts the (log) bike count observed at the same weekday and hour during the most recent week available in the training data, leveraging weekly patterns in cycling behavior.

This baseline serves as a reference point to evaluate the performance of more sophisticated machine learning models.

This gave us an idea of the score we can reach with a simple solution, and it sets the baseline that more advanced models should improve on (score of 0.674).

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [2]:
# Load the training set from the local "data" directory and preview its first rows.
train = pd.read_parquet(Path("data") / "train.parquet")
train.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.609438
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585


In [4]:
# Load the final test set from the local "data" directory and preview its first rows.
test = pd.read_parquet(Path("data") / "final_test.parquet")
test.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 01:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 13:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 17:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 19:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 22:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429


In [5]:
# Latest timestamp present in the training data.
train["date"].max()

Timestamp('2021-09-09 23:00:00')

In [8]:
# Latest timestamp present in the test data.
test["date"].max()

Timestamp('2021-10-18 21:00:00')

In [None]:
def encode_dates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add calendar features derived from the 'date' column.

    The input frame is left untouched: a new DataFrame is returned that holds
    every original column (including 'date') plus the integer columns
    'year', 'month', 'day', 'weekday' and 'hour'.

    Args:
        df (pd.DataFrame): DataFrame with a datetime-like 'date' column

    Returns:
        pd.DataFrame: Copy of ``df`` with the five calendar columns appended
    """
    # Datetime accessor of the timestamp column, reused for every feature.
    stamps = df["date"].dt

    # `assign` works on a copy of `df`, so the caller's frame is not modified.
    return df.assign(
        year=stamps.year,
        month=stamps.month,
        day=stamps.day,
        weekday=stamps.weekday,
        hour=stamps.hour,
    )

In [None]:
def naive_baseline(df_train: pd.DataFrame) -> pd.DataFrame:
    """
    Create baseline predictions using the last available bike count for each (counter, weekday, hour) combination.

    This baseline strategy leverages weekly patterns in cycling behavior by using
    the bike count from the same weekday and hour from the most recent week available.
    The approach assumes that bike usage patterns are consistent week-to-week.

    Args:
        df_train (pd.DataFrame): Training data with columns
            ['counter_id', 'date', 'weekday', 'hour', 'log_bike_count'].
            'date' is required because rows are ordered chronologically
            before the most recent value is picked.

    Returns:
        pd.DataFrame: One row per (counter_id, weekday, hour) combination that
        actually occurs in ``df_train``, with the most recent 'log_bike_count'
        for that combination.
    """
    # Chronological order makes "last" mean "most recent". The stable sort keeps
    # the input order of rows sharing a timestamp, so the result is deterministic.
    df_sorted = df_train.sort_values(by="date", ascending=True, kind="stable")

    # Group by counter, weekday, and hour to capture weekly patterns and keep the
    # most recent value of each group ("last" skips missing values by default).
    # observed=True restricts the output to key combinations present in the data:
    # with a categorical 'counter_id' the legacy default (observed=False) also
    # emits NaN rows for unobserved categories and raises a FutureWarning.
    df_preds = df_sorted.groupby(
        by=["counter_id", "weekday", "hour"],
        as_index=False,
        observed=True,
    ).agg({
        "log_bike_count": "last",
    })
    return df_preds

In [32]:
# Add the calendar columns (year, month, day, weekday, hour) to the training data.
df_train = encode_dates(df=train)
df_train.head(2)

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count,year,month,day,weekday,hour
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0,2020,9,1,1,2
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147,2020,9,1,1,3


In [33]:
# Build the lookup table of baseline predictions per (counter_id, weekday, hour).
df_preds = naive_baseline(df_train=df_train)
df_preds.head(3)

  df_preds = df_train.groupby(


Unnamed: 0,counter_id,weekday,hour,log_bike_count
0,100007049-101007049,0,0,2.302585
1,100007049-101007049,0,1,1.609438
2,100007049-101007049,0,2,1.098612


In [None]:
def predict_test(df_test: pd.DataFrame, df_preds: pd.DataFrame) -> pd.DataFrame:
    """
    Generate predictions for test data using baseline model.

    Each test row is matched with the baseline value that shares its
    counter_id, weekday, and hour. A left join is used so that the output has
    exactly one row per test row, in the same order as ``df_test``; test rows
    without a matching baseline value get NaN in 'log_bike_count' instead of
    being silently dropped.

    Args:
        df_test (pd.DataFrame): Test data with 'counter_id' and 'date' columns
        df_preds (pd.DataFrame): Baseline predictions from naive_baseline function,
            with at most one row per (counter_id, weekday, hour)

    Returns:
        pd.DataFrame: Columns ['counter_id', 'weekday', 'hour', 'log_bike_count'],
        one row per row of ``df_test``, with a fresh default index

    Raises:
        pandas.errors.MergeError: If ``df_preds`` holds several rows for the same
            (counter_id, weekday, hour), which would duplicate test rows.
    """
    keys = ["counter_id", "weekday", "hour"]

    # Derive weekday/hour from the timestamps and keep only the join keys.
    df_test = encode_dates(df=df_test)
    df_test = df_test[keys]
    df_preds = df_preds[keys + ["log_bike_count"]]

    df_test_preds = pd.merge(
        left=df_test,
        right=df_preds,
        on=keys,
        how="left",  # keep every test row, preserving the test order
        validate="many_to_one",  # fail loudly rather than duplicate test rows
    )
    return df_test_preds

In [35]:
# Join the test rows with the baseline lookup table and preview the result.
df_test_preds = predict_test(df_test=test, df_preds=df_preds)
df_test_preds.head(5)

Unnamed: 0,counter_id,weekday,hour,log_bike_count
0,100007049-102007049,4,1,0.0
1,100007049-102007049,4,13,1.94591
2,100007049-102007049,4,17,1.791759
3,100007049-102007049,4,19,1.098612
4,100007049-102007049,4,22,0.693147


In [36]:
# Number of missing (NaN) predictions.
df_test_preds["log_bike_count"].isna().sum()

np.int64(0)

In [22]:
# Number of non-null predictions.
df_test_preds["log_bike_count"].count()

np.int64(51440)