# MAP 536 - Python for Data Science - Predicting Cyclist Traffic in Paris

In [207]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit

In [212]:
## + Weather data
# Load training and testing datasets & remove unnecessary cols
train_data = pd.read_parquet(Path("data") / "train.parquet")
test_data = pd.read_parquet(Path("data") / "test.parquet")
train_data.drop(columns=['counter_name', 'site_name','counter_id', 'counter_installation_date', 'counter_technical_id', 'site_id'], inplace=True)
test_data.drop(columns=['counter_name', 'site_name','counter_id', 'counter_installation_date', 'counter_technical_id', 'site_id'], inplace=True)

#Load weather dataset and remove irrelevant columns
weather_data = pd.read_csv(Path("data") / "hourly-weather-data.csv")
weather_data = weather_data.drop(columns=['name', 'dew', 'precipprob', 'preciptype','uvindex','icon','stations', 'sealevelpressure', 'winddir', 'conditions', 'sealevelpressure', 'severerisk', 'solarradiation', 'solarenergy'])

# convert to datetime to merge them properly
train_data['date'] = pd.to_datetime(train_data['date'])
test_data['date'] = pd.to_datetime(test_data['date'])
weather_data['datetime'] = pd.to_datetime(weather_data['datetime'])

#merge with weather data
merged_train_data = pd.merge(train_data, weather_data, left_on='date', right_on='datetime', how='inner')
merged_test_data = pd.merge(test_data, weather_data, left_on='date', right_on='datetime', how='inner')
merged_train_data = merged_train_data.drop(columns=['datetime'])
merged_test_data = merged_test_data.drop(columns=['datetime'])


## School Holidays + Weather data
# import the holiday dataset
schoolholiday_data = pd.read_csv("data/fr-calendrier.csv", delimiter=';')
schoolholiday_data = schoolholiday_data[schoolholiday_data['zones'] == 'Zone C'] # Zone C is Paris
schoolholiday_data = schoolholiday_data.drop(columns=['description','location','annee_scolaire', 'zones'])

# Convert the date strings to datetime objects
schoolholiday_data['start_date'] = pd.to_datetime(schoolholiday_data['start_date'], utc=True).dt.date
schoolholiday_data['end_date'] = pd.to_datetime(schoolholiday_data['end_date'],utc=True).dt.date

# Generate a set of unique dates for each range in a row
unique_dates = set()
for _, row in schoolholiday_data.iterrows():
    unique_dates.update(pd.date_range(start=row['start_date'], end=row['end_date']))

# Convert the set back to a list and create a DataFrame
unique_dates_list = sorted(list(unique_dates)) 
schoolholiday_data = pd.DataFrame({'Date': unique_dates_list})

# merge with rest of the data
merged_train = pd.merge(merged_train_data, schoolholiday_data, left_on='date', right_on='Date', how='left')
merged_train['Date'] = merged_train['Date'].apply(lambda x: 0 if pd.isna(x) else 1)
merged_train.rename(columns={'Date': 'is_school_holiday'}, inplace=True)

merged_test = pd.merge(merged_test_data, schoolholiday_data, left_on='date', right_on='Date', how='left')
merged_test['Date'] = merged_test['Date'].apply(lambda x: 0 if pd.isna(x) else 1)
merged_test.rename(columns={'Date': 'is_school_holiday'}, inplace=True)


## School Holidays + Weather data + strike data
# import the strike dataset
from datetime import datetime  # Import the datetime class from the datetime module

# strike dates for public transport in Paris, retrieved from: https://www.cestlagreve.fr/calendrier/?lieu=74&secteur=14&mois=1&annee=2022
strike_dates = {'Date': [datetime(2020, 9, 17), datetime(2020, 12, 14), datetime(2020, 12, 16),
                        datetime(2021, 1, 21), datetime(2021, 2, 4), datetime(2021, 2, 15),
                        datetime(2021, 5, 21), datetime(2021, 6, 1), datetime(2021, 10, 5),
                        datetime(2021, 11, 17)]}

strike_data = pd.DataFrame(strike_dates)
strike_data

# Convert the date strings to datetime objects
strike_data['Date'] = pd.to_datetime(strike_data['Date'])


# merge with rest of the data
merged_train = pd.merge(merged_train, strike_data, left_on='date', right_on='Date', how='left')
merged_train['Date'] = merged_train['Date'].apply(lambda x: 0 if pd.isna(x) else 1)
merged_train.rename(columns={'Date': 'is_strike'}, inplace=True)

merged_test = pd.merge(merged_test, strike_data, left_on='date', right_on='Date', how='left')
merged_test['Date'] = merged_test['Date'].apply(lambda x: 0 if pd.isna(x) else 1)
merged_test.rename(columns={'Date': 'is_strike'}, inplace=True)
merged_test.head()


## School Holidays + Weather data + Strike data + Lockdown data
lockdown_data = pd.read_csv("data/lockdown-data.csv")
lockdown_data['datetime'] = pd.to_datetime(lockdown_data['datetime'])
merged_train['date'] = pd.to_datetime(merged_train['date'])
merged_test['date'] = pd.to_datetime(merged_test['date'])

merged_train = pd.merge(merged_train, lockdown_data, left_on='date', right_on='datetime', how='left')
merged_train = merged_train.drop(columns=['datetime'])
merged_test = pd.merge(merged_test, lockdown_data, left_on='date', right_on='datetime', how='left')
merged_test = merged_test.drop(columns=['datetime'])


## Encode the dates
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    X['date'] = pd.to_datetime(X['date'])
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    return X.drop(columns=["date"])

merged_train = _encode_dates(merged_train)
merged_test = _encode_dates(merged_test)

# define x and y
X_merged_train = merged_train.drop(columns=['bike_count', 'log_bike_count'])
Y_merged_train = merged_train['log_bike_count']

X_merged_test = merged_test.drop(columns=['bike_count', 'log_bike_count'])
Y_merged_test = merged_test['log_bike_count']

merged_train.head()

Unnamed: 0,bike_count,latitude,longitude,log_bike_count,temp,feelslike,humidity,precip,snow,snowdepth,...,is_school_holiday,is_strike,full_lockdown,partial_lockdown,school_closures,business_closures,year,month,day,weekday
0,0.0,48.846028,2.375429,0.0,13.3,13.3,77.67,0.0,0.0,0.0,...,0,0,0,0,0,0,2020,9,1,1
1,2.0,48.846028,2.375429,1.098612,13.3,13.3,77.67,0.0,0.0,0.0,...,0,0,0,0,0,0,2020,9,1,1
2,5.0,48.83436,2.377,1.791759,13.3,13.3,77.67,0.0,0.0,0.0,...,0,0,0,0,0,0,2020,9,1,1
3,1.0,48.83436,2.377,0.693147,13.3,13.3,77.67,0.0,0.0,0.0,...,0,0,0,0,0,0,2020,9,1,1
4,0.0,48.85372,2.35702,0.0,13.3,13.3,77.67,0.0,0.0,0.0,...,0,0,0,0,0,0,2020,9,1,1


### Linear Regression

In [214]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Scale the data
    ('regressor', LinearRegression())  # Apply linear regression
])

X_merged_train = X_merged_train.fillna(X_merged_train.mean())

from sklearn.model_selection import KFold

# Create a KFold object with shuffling
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Use this in cross_val_score
scores = cross_val_score(pipeline, X_merged_train, Y_merged_train, cv=kf, scoring='neg_mean_squared_error')

# Calculate RMSE for each fold
rmse_scores = np.sqrt(-scores)

print("RMSE scores for each fold:", rmse_scores)
print("Average RMSE:", np.mean(rmse_scores))



RMSE scores for each fold: [1.55264726 1.55290018 1.55622781 1.55799723 1.56364461]
Average RMSE: 1.5566834174322433


In [216]:
# Define the target variable 'y'
y = merged_train['log_bike_count'].copy()

# Drop the target variable and any other non-predictor columns to define the features 'X'
X = merged_train.drop(['log_bike_count', 'bike_count'], axis=1)  # Assuming 'bike_count' is also not a predictor

# Calculate the split index
split_index = int(len(X) * 0.7)

# Split the features and the target variable into training and cross-validation sets
X_train = X.iloc[:split_index]
X_cross_val = X.iloc[split_index:]
y_train = y.iloc[:split_index]
y_cross_val = y.iloc[split_index:]

# Your data is now split into training and cross-validation sets and is ready for model training and evaluation.

num_timesteps = 24

# Standardize your features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_cross_val_scaled = scaler.transform(X_cross_val)

# Function to create sequences of time steps
def create_sequences(data, y, time_steps=num_timesteps):
    Xs, ys = [], []
    for i in range(len(data) - time_steps):
        v = data.iloc[i:(i + time_steps)].values
        Xs.append(v)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

# Reshape the training and cross-validation data
X_train_seq, y_train_seq = create_sequences(pd.DataFrame(X_train_scaled), y_train)
X_cross_val_seq, y_cross_val_seq = create_sequences(pd.DataFrame(X_cross_val_scaled), y_cross_val)

# Initialize the XGBoost regressor model
xgb_regressor = XGBRegressor(objective='reg:squarederror')

# Fit the model to the training data
xgb_regressor.fit(X_train_scaled, y_train)

# Predict on the cross-validation data
y_pred_xgb = xgb_regressor.predict(X_cross_val_scaled)

# Calculate the RMSE for the XGBoost model
rmse_xgb = np.sqrt(mean_squared_error(y_cross_val, y_pred_xgb))
print(f"Root Mean Squared Error (RMSE) for XGBoost on Cross-Validation Set: {rmse_xgb:.2f}")



Root Mean Squared Error (RMSE) for XGBoost on Cross-Validation Set: 1.31
