# MAP 536 - Python for Data Science - Predicting Cyclist Traffic in Paris

## Prediction

### adding french holidays

Import all necessary packages

In [52]:

import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit


Load datasets & set target

In [53]:
# Load training and testing datasets
train_data = pd.read_parquet(Path("data") / "train.parquet")
test_data = pd.read_parquet(Path("data") / "test.parquet")


In [54]:
train_data.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,counter_technical_id,latitude,longitude,log_bike_count
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,0.0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,0.693147
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,0.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,1.609438
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,2.302585


In [55]:
train_data.drop(columns=['counter_id', 'counter_installation_date', 'counter_technical_id', 'site_id'], inplace=True)
train_data.head()

Unnamed: 0,counter_name,site_name,bike_count,date,latitude,longitude,log_bike_count
48321,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-09-01 02:00:00,48.846028,2.375429,0.0
48324,28 boulevard Diderot E-O,28 boulevard Diderot,1.0,2020-09-01 03:00:00,48.846028,2.375429,0.693147
48327,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-09-01 04:00:00,48.846028,2.375429,0.0
48330,28 boulevard Diderot E-O,28 boulevard Diderot,4.0,2020-09-01 15:00:00,48.846028,2.375429,1.609438
48333,28 boulevard Diderot E-O,28 boulevard Diderot,9.0,2020-09-01 18:00:00,48.846028,2.375429,2.302585


In [56]:
# Ensure the 'date' column is in datetime format
train_data['date'] = pd.to_datetime(train_data['date'])

# Extract the hour first, then extract the date
train_data['hour'] = train_data['date'].dt.hour
train_data['date'] = train_data['date'].dt.date

# Now, train_data has separate columns for date and hour
train_data.head()

Unnamed: 0,counter_name,site_name,bike_count,date,latitude,longitude,log_bike_count,hour
48321,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-09-01,48.846028,2.375429,0.0,2
48324,28 boulevard Diderot E-O,28 boulevard Diderot,1.0,2020-09-01,48.846028,2.375429,0.693147,3
48327,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-09-01,48.846028,2.375429,0.0,4
48330,28 boulevard Diderot E-O,28 boulevard Diderot,4.0,2020-09-01,48.846028,2.375429,1.609438,15
48333,28 boulevard Diderot E-O,28 boulevard Diderot,9.0,2020-09-01,48.846028,2.375429,2.302585,18


In [57]:
holiday_data = pd.read_csv(Path("data") / "jours_feries_metropole.csv")
holiday_data.rename(columns={'date': 'h_date', 'nom_jour_ferie': 'is_holiday'}, inplace=True)
holiday_data['h_date'] = pd.to_datetime(holiday_data['h_date']).dt.date
holiday_data.drop(columns=['zone','annee'], inplace=True) # we drop the redundant or irrelevant columns 
holiday_data.rename(columns={"A": "a", "B": "c"})


holiday_data.head()

Unnamed: 0,h_date,is_holiday
0,2003-01-01,1er janvier
1,2003-04-21,Lundi de Pâques
2,2003-05-01,1er mai
3,2003-05-08,8 mai
4,2003-05-29,Ascension


In [58]:
merged_train_data = pd.merge(train_data, holiday_data, left_on='date', right_on='h_date', how='left')
merged_train_data['date'] = pd.to_datetime(merged_train_data['date'])

merged_train_data['is_holiday'] = merged_train_data['is_holiday'].fillna(0)
merged_train_data['is_holiday'] = merged_train_data['is_holiday'].apply(lambda x: 1 if x != 0 else 0)



merged_train_data.head()

Unnamed: 0,counter_name,site_name,bike_count,date,latitude,longitude,log_bike_count,hour,h_date,is_holiday
0,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-09-01,48.846028,2.375429,0.0,2,,0
1,28 boulevard Diderot E-O,28 boulevard Diderot,1.0,2020-09-01,48.846028,2.375429,0.693147,3,,0
2,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-09-01,48.846028,2.375429,0.0,4,,0
3,28 boulevard Diderot E-O,28 boulevard Diderot,4.0,2020-09-01,48.846028,2.375429,1.609438,15,,0
4,28 boulevard Diderot E-O,28 boulevard Diderot,9.0,2020-09-01,48.846028,2.375429,2.302585,18,,0


In [59]:
# check for xmas dates
merged_train_data[(merged_train_data['date'] == '2020-12-25')]


Unnamed: 0,counter_name,site_name,bike_count,date,latitude,longitude,log_bike_count,hour,h_date,is_holiday
2377,28 boulevard Diderot E-O,28 boulevard Diderot,1.0,2020-12-25,48.846028,2.375429,0.693147,4,2020-12-25,1
2380,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-12-25,48.846028,2.375429,0.000000,6,2020-12-25,1
2383,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-12-25,48.846028,2.375429,0.000000,8,2020-12-25,1
2386,28 boulevard Diderot E-O,28 boulevard Diderot,4.0,2020-12-25,48.846028,2.375429,1.609438,10,2020-12-25,1
2389,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-12-25,48.846028,2.375429,0.000000,13,2020-12-25,1
...,...,...,...,...,...,...,...,...,...,...
450581,254 rue de Vaugirard SO-NE,254 rue de Vaugirard,8.0,2020-12-25,48.839770,2.301980,2.197225,7,2020-12-25,1
450584,254 rue de Vaugirard SO-NE,254 rue de Vaugirard,19.0,2020-12-25,48.839770,2.301980,2.995732,9,2020-12-25,1
450587,254 rue de Vaugirard SO-NE,254 rue de Vaugirard,29.0,2020-12-25,48.839770,2.301980,3.401197,13,2020-12-25,1
450590,254 rue de Vaugirard SO-NE,254 rue de Vaugirard,1.0,2020-12-25,48.839770,2.301980,0.693147,20,2020-12-25,1


In [60]:
# drop redundant date column
merged_train_data.drop(columns=['h_date'], inplace=True)
merged_train_data.head()


Unnamed: 0,counter_name,site_name,bike_count,date,latitude,longitude,log_bike_count,hour,is_holiday
0,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-09-01,48.846028,2.375429,0.0,2,0
1,28 boulevard Diderot E-O,28 boulevard Diderot,1.0,2020-09-01,48.846028,2.375429,0.693147,3,0
2,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-09-01,48.846028,2.375429,0.0,4,0
3,28 boulevard Diderot E-O,28 boulevard Diderot,4.0,2020-09-01,48.846028,2.375429,1.609438,15,0
4,28 boulevard Diderot E-O,28 boulevard Diderot,9.0,2020-09-01,48.846028,2.375429,2.302585,18,0


In [61]:
# do the same for test data

# Ensure the 'date' column is in datetime format
test_data['date'] = pd.to_datetime(test_data['date'])

# Extract the hour first, then extract the date
test_data['hour'] = test_data['date'].dt.hour
test_data['date'] = test_data['date'].dt.date

test_data.drop(columns=['counter_id', 'counter_installation_date', 'counter_technical_id', 'site_id'], inplace=True)

test_data.head()


Unnamed: 0,counter_name,site_name,bike_count,date,latitude,longitude,log_bike_count,hour
56474,28 boulevard Diderot E-O,28 boulevard Diderot,1.0,2021-08-10,48.846028,2.375429,0.693147,5
56477,28 boulevard Diderot E-O,28 boulevard Diderot,2.0,2021-08-10,48.846028,2.375429,1.098612,6
56480,28 boulevard Diderot E-O,28 boulevard Diderot,1.0,2021-08-10,48.846028,2.375429,0.693147,7
56483,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2021-08-10,48.846028,2.375429,0.0,9
56486,28 boulevard Diderot E-O,28 boulevard Diderot,1.0,2021-08-10,48.846028,2.375429,0.693147,10


In [62]:
merged_test_data = pd.merge(test_data, holiday_data, left_on='date', right_on='h_date', how='left')
merged_test_data['date'] = pd.to_datetime(merged_test_data['date'])

merged_test_data['is_holiday'] = merged_test_data['is_holiday'].fillna(0)
merged_test_data['is_holiday'] = merged_test_data['is_holiday'].apply(lambda x: 1 if x != 0 else 0)

merged_test_data.drop(columns=['h_date'], inplace=True)
merged_test_data.head()

merged_test_data.head()

Unnamed: 0,counter_name,site_name,bike_count,date,latitude,longitude,log_bike_count,hour,is_holiday
0,28 boulevard Diderot E-O,28 boulevard Diderot,1.0,2021-08-10,48.846028,2.375429,0.693147,5,0
1,28 boulevard Diderot E-O,28 boulevard Diderot,2.0,2021-08-10,48.846028,2.375429,1.098612,6,0
2,28 boulevard Diderot E-O,28 boulevard Diderot,1.0,2021-08-10,48.846028,2.375429,0.693147,7,0
3,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2021-08-10,48.846028,2.375429,0.0,9,0
4,28 boulevard Diderot E-O,28 boulevard Diderot,1.0,2021-08-10,48.846028,2.375429,0.693147,10,0


In [63]:
# select some potentially relevant features for prediction
selected_features = ['hour', 'day', 'month', 'latitude', 'longitude'] 

# Preprocessing pipeline for standardization
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), selected_features)
    ])

# Combine preprocessing and model training in a pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# define number of splits
tscv = TimeSeriesSplit(n_splits=5)

In [64]:
# Function to encode dates
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    X['date'] = pd.to_datetime(X['date'])
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour
    return X.drop(columns=["date"])

# Apply the function to both datasets
X_train = _encode_dates(merged_train_data)
X_test = _encode_dates(merged_test_data)

# Remove non-numeric columns
numeric_columns = X_train.select_dtypes(include=[np.number]).columns
X_train = X_train[numeric_columns]
X_test = X_test[numeric_columns]

In [65]:
X_train_selected = X_train[selected_features]
y_train_log_count = y_train  

# Cross-validation scores
cross_val_scores = cross_val_score(model, X_train_selected, y_train_log_count, cv=tscv, scoring='neg_root_mean_squared_error')

# Average RMSE
avg_rmse = -np.mean(cross_val_scores)
print(f"Average RMSE: {avg_rmse}")

Average RMSE: 1.6746374102706898


higher RMSE, maybe due to some holidays (e.g. xmas) being rather "stay at home" and others being occasions to go out