## Preprocessing and feature engineering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In this notebook, we prepare the data for modelling.

### Verifying steps from EDA

In [2]:
# Load the training set; the path is relative to the notebook's working directory.
train_path = "data/train.parquet"
data = pd.read_parquet(train_path)

In [3]:
# Count null entries per column (sanity check carried over from the EDA).
missing_values = data.isna().sum()
print("Missing values:\n", missing_values)

Missing values:
 counter_id                   0
counter_name                 0
site_id                      0
site_name                    0
bike_count                   0
date                         0
counter_installation_date    0
coordinates                  0
counter_technical_id         0
latitude                     0
longitude                    0
log_bike_count               0
dtype: int64


In [4]:
# Summary statistics (count, mean, quantiles, ...) as a quick sanity check.
summary_stats = data.describe()
print(summary_stats)

            site_id     bike_count                        date  \
count  4.968270e+05  496827.000000                      496827   
mean   1.053450e+08      60.191475  2021-03-08 07:25:59.668858   
min    1.000070e+08       0.000000         2020-09-01 01:00:00   
25%    1.000475e+08       5.000000         2020-12-05 22:00:00   
50%    1.000562e+08      29.000000         2021-03-08 11:00:00   
75%    1.000563e+08      79.000000         2021-06-09 14:00:00   
max    3.000147e+08    1302.000000         2021-09-09 23:00:00   
std    3.210346e+07      87.590566                         NaN   

        counter_installation_date       latitude      longitude  \
count                      496827  496827.000000  496827.000000   
mean   2019-04-04 07:24:35.245911      48.854343       2.345479   
min           2013-01-18 00:00:00      48.826360       2.265420   
25%           2018-11-29 00:00:00      48.840801       2.314440   
50%           2019-11-06 00:00:00      48.852090       2.353870   
75%

### Encoding Categorical Variables

In [5]:
# Recreate temporal columns from the 'date' column.
# Convert the column once and reuse the result instead of repeating the
# conversion for every derived feature.
date_col = pd.to_datetime(data['date'])

data['year'] = date_col.dt.year
data['month'] = date_col.dt.month
data['day_of_week'] = date_col.dt.day_name()  # weekday as a name, e.g. 'Tuesday'
data['hour'] = date_col.dt.hour

print(data[['date', 'year', 'month', 'day_of_week', 'hour']].head())

                     date  year  month day_of_week  hour
48321 2020-09-01 02:00:00  2020      9     Tuesday     2
48324 2020-09-01 03:00:00  2020      9     Tuesday     3
48327 2020-09-01 04:00:00  2020      9     Tuesday     4
48330 2020-09-01 15:00:00  2020      9     Tuesday    15
48333 2020-09-01 18:00:00  2020      9     Tuesday    18


In [6]:
# Collect the names of all columns stored with an object or category dtype.
categorical_dtypes = ['object', 'category']
categorical_columns = data.select_dtypes(include=categorical_dtypes).columns
print("Categorical columns:", categorical_columns)

Categorical columns: Index(['counter_id', 'counter_name', 'site_name', 'coordinates',
       'counter_technical_id', 'day_of_week'],
      dtype='object')


In [7]:
# One-hot encode the weekday name into one indicator column per day.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
day_of_week_encoded = encoder.fit_transform(data[['day_of_week']])

day_of_week_columns = encoder.get_feature_names_out(['day_of_week'])
day_of_week_df = pd.DataFrame(
    day_of_week_encoded,
    columns=day_of_week_columns,
    index=data.index  # reuse the frame's index so the concat aligns row-for-row
)

# Drop indicator columns left over from a previous run of this cell first;
# otherwise re-running would append duplicate column labels and later
# selections such as data['day_of_week_Saturday'] would return a DataFrame
# instead of a Series.
data = pd.concat(
    [data.drop(columns=list(day_of_week_columns), errors='ignore'), day_of_week_df],
    axis=1
)
print(day_of_week_df.head())

       day_of_week_Friday  day_of_week_Monday  day_of_week_Saturday  \
48321                 0.0                 0.0                   0.0   
48324                 0.0                 0.0                   0.0   
48327                 0.0                 0.0                   0.0   
48330                 0.0                 0.0                   0.0   
48333                 0.0                 0.0                   0.0   

       day_of_week_Sunday  day_of_week_Thursday  day_of_week_Tuesday  \
48321                 0.0                   0.0                  1.0   
48324                 0.0                   0.0                  1.0   
48327                 0.0                   0.0                  1.0   
48330                 0.0                   0.0                  1.0   
48333                 0.0                   0.0                  1.0   

       day_of_week_Wednesday  
48321                    0.0  
48324                    0.0  
48327                    0.0  
48330           

In [16]:
# Cyclical Encoding
# Map hour and month onto the unit circle so that the end of a cycle sits next
# to its start (e.g. 23:00 next to 00:00).
hour_angle = 2 * np.pi * data['hour'] / 24
month_angle = 2 * np.pi * data['month'] / 12

data['hour_sin'], data['hour_cos'] = np.sin(hour_angle), np.cos(hour_angle)
data['month_sin'], data['month_cos'] = np.sin(month_angle), np.cos(month_angle)

print(data[['hour', 'hour_sin', 'hour_cos']].head())

   hour  hour_sin      hour_cos
0     2  0.500000  8.660254e-01
1     3  0.707107  7.071068e-01
2     4  0.866025  5.000000e-01
3    15 -0.707107 -7.071068e-01
4    18 -1.000000 -1.836970e-16


### Handle numerical features

In [9]:
# Standardise the raw and log-transformed counts.
# Each column gets its own scaler: refitting one shared instance overwrites the
# statistics learned for 'bike_count', so that column could no longer be mapped
# back with inverse_transform.
bike_count_scaler = StandardScaler()
log_bike_count_scaler = StandardScaler()

data['bike_count_scaled'] = bike_count_scaler.fit_transform(data[['bike_count']])
data['log_bike_count_scaled'] = log_bike_count_scaler.fit_transform(data[['log_bike_count']])

# Keep the original name bound to the scaler fitted last, as before.
scaler = log_bike_count_scaler

### Feature Interaction 

In [10]:
# Interaction between the hour and each one-hot weekday indicator.
# The trailing underscore in the prefix matters: a bare 'day_of_week' prefix also
# matches the raw string column 'day_of_week', and multiplying the hour by weekday
# names would either fail or yield a column of repeated strings rather than a
# numeric feature.
day_columns = [col for col in data.columns if col.startswith('day_of_week_')]
for col in day_columns:
    data[f'hour_{col}_interaction'] = data['hour'] * data[col]

### Feature Engineering

In [11]:
# Aggregate Statistics by site_id
# Named aggregation yields the final column names directly, so no in-place
# rename is needed.
site_stats = (
    data.groupby('site_id')['bike_count']
    .agg(site_mean_count='mean', site_std_count='std')
    .reset_index()
)

# Remove aggregates left by an earlier run of this cell before merging; otherwise
# the merge would produce suffixed duplicates (site_mean_count_x / _y).
stat_columns = ['site_mean_count', 'site_std_count']
data = (
    data.drop(columns=stat_columns, errors='ignore')
    .merge(site_stats, on='site_id', how='left')
)

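Why: Sites differ in their baseline traffic (e.g., busy vs. quiet areas) and tend to exhibit consistent traffic patterns. Aggregate features help capture this site-specific behavior, such as the average bike traffic and its variability. Note that these statistics are computed from the target (`bike_count`), so when validating a model they should be derived from the training split only to avoid leakage.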

In [12]:
# Weekend Indicator
# The Saturday and Sunday indicators are mutually exclusive, so their sum is a
# 0/1 flag (kept as float, like the one-hot columns it is built from).
saturday_flag = data['day_of_week_Saturday']
sunday_flag = data['day_of_week_Sunday']
data['is_weekend'] = saturday_flag.add(sunday_flag)

print(data[['day_of_week_Saturday', 'day_of_week_Sunday', 'is_weekend']].head())

   day_of_week_Saturday  day_of_week_Sunday  is_weekend
0                   0.0                 0.0         0.0
1                   0.0                 0.0         0.0
2                   0.0                 0.0         0.0
3                   0.0                 0.0         0.0
4                   0.0                 0.0         0.0


Why: We saw in the EDA that bike counts are higher on weekdays than on weekends (likely due to commuting). The weekend indicator lets the model capture this difference.

In [18]:
# Rush Hour Indicator
# Hours 6-9 and 15-18 (both ends inclusive) count as rush hour. The comparison is
# vectorised instead of calling a Python lambda once per row.
hour_of_day = data['hour']
in_morning_rush = hour_of_day.between(6, 9)
in_evening_rush = hour_of_day.between(15, 18)
data['is_rush_hour'] = (in_morning_rush | in_evening_rush).astype('int64')

# Verify the new column
print(data[['hour', 'is_rush_hour']].head())

   hour  is_rush_hour
0     2             0
1     3             0
2     4             0
3    15             1
4    18             1


Why: The rush hour indicator is created to capture periods of high bike traffic, typically aligned with commuting times. It is based on the observation from the graph that bike counts peak around 6-9 AM and 3-6 PM, representing morning and evening rush hours. This feature helps the model identify patterns specific to these high-traffic periods, improving its ability to predict bike counts.

In [19]:
# Install into the running kernel's environment via the explicit %pip magic
# (does not rely on automagic); pinned to the version this notebook was run with.
%pip install holidays==0.62

Collecting holidays
  Downloading holidays-0.62-py3-none-any.whl.metadata (26 kB)
Downloading holidays-0.62-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: holidays
Successfully installed holidays-0.62
Note: you may need to restart the kernel to use updated packages.


In [21]:
import holidays

# French holiday calendar; membership is tested per calendar day.
france_holidays = holidays.FR()


def _holiday_flag(day):
    """Return 1 when `day` is in the French holiday calendar, else 0."""
    return 1 if day in france_holidays else 0


data['is_holiday'] = data['date'].dt.date.map(_holiday_flag)

print(data[['date', 'is_holiday']].head())

                 date  is_holiday
0 2020-09-01 02:00:00           0
1 2020-09-01 03:00:00           0
2 2020-09-01 04:00:00           0
3 2020-09-01 15:00:00           0
4 2020-09-01 18:00:00           0


### Save processed file 

In [22]:
# Persist the engineered dataset; the DataFrame index is not written.
save_path = 'data/processed_data.parquet'
data.to_parquet(path=save_path, index=False)
print(f"Dataset saved to {save_path}")

Dataset saved to data/processed_data.parquet
