# MAP 536 - Python for Data Science - Predicting Cyclist Traffic in Paris

## Exploratory Data Analysis

## Prediction

Import all necessary packages

In [93]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (
    TimeSeriesSplit,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

Load datasets & set target

In [94]:
# Read the train/test splits from the local "data" directory and pull out
# the regression target column from each of them.
data_dir = Path("data")
target_column = "log_bike_count"

train_data = pd.read_parquet(data_dir / "train.parquet")
test_data = pd.read_parquet(data_dir / "test.parquet")

y_train = train_data[target_column]
y_test = test_data[target_column]


Data preparation: feature selection, scaling + regression pipeline, and cross-validation setup

In [95]:
# Columns fed to the model; every one of them goes through the scaler below.
selected_features = ["hour", "day", "month", "latitude", "longitude"]

# Standardize all selected columns (single transformer, named "num").
numeric_scaling = ("num", StandardScaler(), selected_features)
preprocessor = ColumnTransformer(transformers=[numeric_scaling])

# Chain the scaling step with an ordinary least-squares regressor so that the
# scaler is re-fit inside each cross-validation fold.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

# 5-fold time-series splitter used for cross-validation.
# NOTE(review): TimeSeriesSplit splits by row order, so this presumably
# expects the training rows to be sorted chronologically - confirm.
tscv = TimeSeriesSplit(n_splits=5)


Prediction & RMSE (time-series cross-validation on the training set)

In [96]:
# Build the feature frame from the loaded training data so this cell works on
# a fresh kernel; no cell defined `X_train` before it was used here.
X_train = train_data.drop(columns=["log_bike_count"])

# Fail early with an explicit message if a selected column is absent, rather
# than with an opaque indexing error inside the pipeline.
missing_features = [col for col in selected_features if col not in X_train.columns]
if missing_features:
    raise KeyError(
        f"Training data is missing selected feature(s): {missing_features}. "
        "Derive or load these columns before running cross-validation."
    )

X_train_selected = X_train[selected_features]
y_train_log_count = y_train

# Cross-validation scores: the scorer returns the *negated* RMSE for each
# fold (scikit-learn scorers follow a "greater is better" convention).
cross_val_scores = cross_val_score(
    model,
    X_train_selected,
    y_train_log_count,
    cv=tscv,
    scoring="neg_root_mean_squared_error",
)

# Average RMSE: flip the sign back to obtain a positive error value.
avg_rmse = -np.mean(cross_val_scores)
print(f"Average RMSE: {avg_rmse}")


Average RMSE: 1.6132712130472
