# MAP 536 - Python for Data Science - Predicting Cyclist Traffic in Paris

## Exploratory Data Analysis

## Prediction

### 1 - without weather dataset

Import all necessary packages

In [44]:

import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
from pathlib import Path


Load datasets & set target

In [45]:
# Load training and testing datasets
train_data = pd.read_parquet(Path("data") / "train.parquet")
test_data = pd.read_parquet(Path("data") / "test.parquet")

y_train = train_data['log_bike_count']
y_test = test_data['log_bike_count']

Data preparation

In [46]:
# select some potentially relevant features for prediction
selected_features = ['hour', 'day', 'month', 'latitude', 'longitude'] 

# Preprocessing pipeline for standardization
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), selected_features)
    ])

# Combine preprocessing and model training in a pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# define number of splits
tscv = TimeSeriesSplit(n_splits=5)

In [47]:
# Function to encode dates
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    X['date'] = pd.to_datetime(X['date'])
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour
    return X.drop(columns=["date"])

# Apply the function to both datasets
X_train = _encode_dates(train_data)
X_test = _encode_dates(test_data)

# Remove non-numeric columns
numeric_columns = X_train.select_dtypes(include=[np.number]).columns
X_train = X_train[numeric_columns]
X_test = X_test[numeric_columns]

In [48]:
X_train

Unnamed: 0,site_id,bike_count,latitude,longitude,log_bike_count,year,month,day,weekday,hour
48321,100007049,0.0,48.846028,2.375429,0.000000,2020,9,1,1,2
48324,100007049,1.0,48.846028,2.375429,0.693147,2020,9,1,1,3
48327,100007049,0.0,48.846028,2.375429,0.000000,2020,9,1,1,4
48330,100007049,4.0,48.846028,2.375429,1.609438,2020,9,1,1,15
48333,100007049,9.0,48.846028,2.375429,2.302585,2020,9,1,1,18
...,...,...,...,...,...,...,...,...,...,...
928450,300014702,51.0,48.839770,2.301980,3.951244,2021,8,8,6,18
928453,300014702,1.0,48.839770,2.301980,0.693147,2021,8,9,0,2
928456,300014702,61.0,48.839770,2.301980,4.127134,2021,8,9,0,8
928459,300014702,44.0,48.839770,2.301980,3.806662,2021,8,9,0,10


Prediction & RMSE

In [36]:
X_train_selected = X_train[selected_features]
y_train_log_count = y_train  

# Cross-validation scores
cross_val_scores = cross_val_score(model, X_train_selected, y_train_log_count, cv=tscv, scoring='neg_root_mean_squared_error')

# Average RMSE
avg_rmse = -np.mean(cross_val_scores)
print(f"Average RMSE: {avg_rmse}")

Average RMSE: 1.6132712130472


### 2 - with holidays

Load the dataset

In [62]:
holiday_data = pd.read_csv(Path("data") / "jours_feries_metropole.csv")
holiday_data.rename(columns={'date': 'h_date'}, inplace=True)
holiday_data['h_date'] = pd.to_datetime(holiday_data['h_date']).dt.date



In [63]:
holiday_data.head()

Unnamed: 0,h_date,annee,zone,nom_jour_ferie
0,2003-01-01,2003,Métropole,1er janvier
1,2003-04-21,2003,Métropole,Lundi de Pâques
2,2003-05-01,2003,Métropole,1er mai
3,2003-05-08,2003,Métropole,8 mai
4,2003-05-29,2003,Métropole,Ascension


In [81]:
# Load training and testing datasets
train_data = pd.read_parquet(Path("data") / "train.parquet")
test_data = pd.read_parquet(Path("data") / "test.parquet")

y_train = train_data['log_bike_count']
y_test = test_data['log_bike_count']

In [82]:
train_data.drop(columns=['counter_id', 'counter_installation_date', 'counter_technical_id', 'site_id'], inplace=True)

# Ensure the 'date' column is in datetime format
train_data['date'] = pd.to_datetime(train_data['date'])

# Extract the hour first, then extract the date
train_data['hour'] = train_data['date'].dt.hour
train_data['date'] = train_data['date'].dt.date

# Now, train_data has separate columns for date and hour
train_data.head()


Unnamed: 0,counter_name,site_name,bike_count,date,latitude,longitude,log_bike_count,hour
48321,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-09-01,48.846028,2.375429,0.0,2
48324,28 boulevard Diderot E-O,28 boulevard Diderot,1.0,2020-09-01,48.846028,2.375429,0.693147,3
48327,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-09-01,48.846028,2.375429,0.0,4
48330,28 boulevard Diderot E-O,28 boulevard Diderot,4.0,2020-09-01,48.846028,2.375429,1.609438,15
48333,28 boulevard Diderot E-O,28 boulevard Diderot,9.0,2020-09-01,48.846028,2.375429,2.302585,18


In [97]:
# Convert the 'date' column in train_data to datetime
train_data['date'] = pd.to_datetime(train_data['date'])

# Extract the date part for merging
train_data['merge_date'] = train_data['date'].dt.date

# Merge the datasets on the extracted date column
merged_data = pd.merge(train_data, holiday_data, left_on='merge_date', right_on='h_date', how='left')


merged_data

Unnamed: 0,counter_name,site_name,bike_count,date,latitude,longitude,log_bike_count,hour,merge_date,h_date,annee,zone,nom_jour_ferie
0,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-09-01,48.846028,2.375429,0.000000,2,2020-09-01,,,,
1,28 boulevard Diderot E-O,28 boulevard Diderot,1.0,2020-09-01,48.846028,2.375429,0.693147,3,2020-09-01,,,,
2,28 boulevard Diderot E-O,28 boulevard Diderot,0.0,2020-09-01,48.846028,2.375429,0.000000,4,2020-09-01,,,,
3,28 boulevard Diderot E-O,28 boulevard Diderot,4.0,2020-09-01,48.846028,2.375429,1.609438,15,2020-09-01,,,,
4,28 boulevard Diderot E-O,28 boulevard Diderot,9.0,2020-09-01,48.846028,2.375429,2.302585,18,2020-09-01,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
455158,254 rue de Vaugirard SO-NE,254 rue de Vaugirard,51.0,2021-08-08,48.839770,2.301980,3.951244,18,2021-08-08,,,,
455159,254 rue de Vaugirard SO-NE,254 rue de Vaugirard,1.0,2021-08-09,48.839770,2.301980,0.693147,2,2021-08-09,,,,
455160,254 rue de Vaugirard SO-NE,254 rue de Vaugirard,61.0,2021-08-09,48.839770,2.301980,4.127134,8,2021-08-09,,,,
455161,254 rue de Vaugirard SO-NE,254 rue de Vaugirard,44.0,2021-08-09,48.839770,2.301980,3.806662,10,2021-08-09,,,,


In [101]:
merged_data[merged_data['date']] == '2021-08-09'


KeyError: "None of [DatetimeIndex(['2020-09-01', '2020-09-01', '2020-09-01', '2020-09-01',\n               '2020-09-01', '2020-09-01', '2020-09-01', '2020-09-02',\n               '2020-09-02', '2020-09-02',\n               ...\n               '2021-08-07', '2021-08-08', '2021-08-08', '2021-08-08',\n               '2021-08-08', '2021-08-08', '2021-08-09', '2021-08-09',\n               '2021-08-09', '2021-08-09'],\n              dtype='datetime64[ns]', length=455163, freq=None)] are in the [columns]"