## Preprocessing and feature engineering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import datetime as dt

In this notebook, we prepare the data for modelling.

### Verifying steps from EDA

In [2]:
# Load the training set from the local parquet file
data = pd.read_parquet("data/train.parquet")  

In [3]:
# Count the null entries in each column to confirm nothing needs imputing
missing_values = data.isna().sum()
print("Missing values:\n", missing_values)

Missing values:
 counter_id                   0
counter_name                 0
site_id                      0
site_name                    0
bike_count                   0
date                         0
counter_installation_date    0
coordinates                  0
counter_technical_id         0
latitude                     0
longitude                    0
log_bike_count               0
dtype: int64


In [4]:
# Summary statistics for the numeric and datetime columns
print(data.describe())

            site_id     bike_count                        date  \
count  4.968270e+05  496827.000000                      496827   
mean   1.053450e+08      60.191475  2021-03-08 07:25:59.668858   
min    1.000070e+08       0.000000         2020-09-01 01:00:00   
25%    1.000475e+08       5.000000         2020-12-05 22:00:00   
50%    1.000562e+08      29.000000         2021-03-08 11:00:00   
75%    1.000563e+08      79.000000         2021-06-09 14:00:00   
max    3.000147e+08    1302.000000         2021-09-09 23:00:00   
std    3.210346e+07      87.590566                         NaN   

        counter_installation_date       latitude      longitude  \
count                      496827  496827.000000  496827.000000   
mean   2019-04-04 07:24:35.245911      48.854343       2.345479   
min           2013-01-18 00:00:00      48.826360       2.265420   
25%           2018-11-29 00:00:00      48.840801       2.314440   
50%           2019-11-06 00:00:00      48.852090       2.353870   
75%

### Encoding Categorical Variables

In [5]:
# Recreate temporal columns from the 'date' column.
# 'date' is already datetime-typed (the .dt accessor is used on it directly),
# so it is read once instead of being passed through pd.to_datetime for
# every derived column.
timestamps = data["date"]
data["year"] = timestamps.dt.year
data["month"] = timestamps.dt.month
data["day_of_week"] = timestamps.dt.dayofweek  # 0 for Monday
data["hour"] = timestamps.dt.hour

print(data[['date', 'year', 'month', 'day_of_week', 'hour']].head())

                     date  year  month  day_of_week  hour
48321 2020-09-01 02:00:00  2020      9            1     2
48324 2020-09-01 03:00:00  2020      9            1     3
48327 2020-09-01 04:00:00  2020      9            1     4
48330 2020-09-01 15:00:00  2020      9            1    15
48333 2020-09-01 18:00:00  2020      9            1    18


In [6]:
# List the columns stored as strings/categories; these are the candidates
# for encoding before modelling.
categorical_dtypes = ['object', 'category']
categorical_columns = data.select_dtypes(include=categorical_dtypes).columns
print("Categorical columns:", categorical_columns)

Categorical columns: Index(['counter_id', 'counter_name', 'site_name', 'coordinates',
       'counter_technical_id'],
      dtype='object')


### Handle numerical features

**TODO:** This section is a placeholder and might be deleted later.

### Feature Interaction 

In [7]:
# TODO: add feature-interaction terms here

### Feature Engineering

In [8]:
# Aggregate statistics by site_id: per-site mean and standard deviation of
# bike_count, attached to every row of that site.
# NOTE(review): these statistics are derived from bike_count itself; if that
# is the prediction target, reuse the values computed here for unseen data
# rather than recomputing them there - confirm with the modelling notebook.
site_stat_columns = ['site_mean_count', 'site_std_count']

site_stats = (
    data.groupby('site_id')['bike_count']
    .agg(site_mean_count='mean', site_std_count='std')
    .reset_index()
)

# Drop any copy of the statistic columns left by a previous run of this cell;
# otherwise the merge would emit suffixed duplicates (site_mean_count_x/_y).
data = (
    data.drop(columns=site_stat_columns, errors='ignore')
    .merge(site_stats, on='site_id', how='left')
)

Why: Sites with similar conditions (e.g., busy vs. quiet areas) might exhibit consistent traffic patterns. Aggregate features help capture site-specific behavior, such as average bike traffic.

In [9]:
# Weekend flag: 1 when day_of_week is 5 or above, 0 otherwise.
# day_of_week counts from Monday = 0, so 5 and 6 are Saturday and Sunday.
weekend_mask = data["day_of_week"].ge(5)
data["is_weekend"] = weekend_mask.astype(int)

Why: We have seen in the EDA that bike counts are higher on weekdays than on weekends (likely due to commuting). This feature captures that difference.

In [10]:
# Rush Hour Indicator
# Inclusive (first_hour, last_hour) windows that count as rush hour.
MORNING_RUSH_HOURS = (6, 9)
EVENING_RUSH_HOURS = (15, 18)

# Vectorized range checks replace a per-row Python lambda; Series.between is
# inclusive on both ends, matching the original `6 <= x <= 9 or 15 <= x <= 18`.
hour_of_day = data['hour']
in_morning_rush = hour_of_day.between(*MORNING_RUSH_HOURS)
in_evening_rush = hour_of_day.between(*EVENING_RUSH_HOURS)
data['is_rush_hour'] = (in_morning_rush | in_evening_rush).astype('int64')

print(data[['hour', 'is_rush_hour']].head())

   hour  is_rush_hour
0     2             0
1     3             0
2     4             0
3    15             1
4    18             1


Why: The rush hour indicator is created to capture periods of high bike traffic, typically aligned with commuting times. It is based on the observation from the graph that bike counts peak around 6-9 AM and 3-6 PM, representing morning and evening rush hours. This feature helps the model identify patterns specific to these high-traffic periods, improving its ability to predict bike counts.

In [11]:
pip install holidays

Note: you may need to restart the kernel to use updated packages.


In [12]:
# Imported here rather than at the top because the package is installed by
# the preceding cell.
import holidays

# Holiday calendar for France from the `holidays` package
france_holidays = holidays.FR()

# Many rows share the same calendar day, so test each distinct day once with
# the same `in` lookup as before and map the 0/1 result back onto every row,
# instead of repeating the lookup for each row.
calendar_days = data['date'].dt.date
holiday_flag_by_day = {
    day: int(day in france_holidays) for day in calendar_days.unique()
}
data['is_holiday'] = calendar_days.map(holiday_flag_by_day).astype('int64')

print(data[['date', 'is_holiday']].head())

                 date  is_holiday
0 2020-09-01 02:00:00           0
1 2020-09-01 03:00:00           0
2 2020-09-01 04:00:00           0
3 2020-09-01 15:00:00           0
4 2020-09-01 18:00:00           0


In [13]:
# Display the DataFrame, including the engineered feature columns added above
data

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,...,log_bike_count,year,month,day_of_week,hour,site_mean_count,site_std_count,is_weekend,is_rush_hour,is_holiday
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.000000,2020,9,1,2,21.785157,35.345153,0,0,0
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.693147,2020,9,1,3,21.785157,35.345153,0,0,0
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.000000,2020,9,1,4,21.785157,35.345153,0,0,0
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,1.609438,2020,9,1,15,21.785157,35.345153,0,1,0
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,2.302585,2020,9,1,18,21.785157,35.345153,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496822,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,445.0,2021-09-09 06:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,6.100319,2021,9,3,6,58.326593,58.608311,0,1,0
496823,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,145.0,2021-09-09 10:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,4.983607,2021,9,3,10,58.326593,58.608311,0,0,0
496824,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,218.0,2021-09-09 15:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,5.389072,2021,9,3,15,58.326593,58.608311,0,1,0
496825,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,21.0,2021-09-09 22:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,3.091042,2021,9,3,22,58.326593,58.608311,0,0,0


### Save processed file 

In [14]:
# Destination for the feature-engineered dataset
save_path = 'data/processed_data.parquet'

# index=False leaves the DataFrame index out of the written file
data.to_parquet(save_path, index=False)

print(f"Dataset saved to {save_path}")

Dataset saved to data/processed_data.parquet
