## Modeling and Validation / Cross-Validation

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split

In [3]:
# Read the preprocessed dataset; the path is relative to the current working directory.
processed_path = Path("data") / "processed_data.parquet"
data = pd.read_parquet(processed_path)
data.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,...,log_bike_count,year,month,day_of_week,hour,site_mean_count,site_std_count,is_weekend,is_rush_hour,is_holiday
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.0,2020,9,1,2,21.785157,35.345153,0,0,0
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.693147,2020,9,1,3,21.785157,35.345153,0,0,0
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.0,2020,9,1,4,21.785157,35.345153,0,0,0
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,1.609438,2020,9,1,15,21.785157,35.345153,0,1,0
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,2.302585,2020,9,1,18,21.785157,35.345153,0,1,0


In [4]:
# Summarise column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496827 entries, 0 to 496826
Data columns (total 21 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[us]
 6   counter_installation_date  496827 non-null  datetime64[us]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   latitude                   496827 non-null  float64       
 10  longitude                  496827 non-null  float64       
 11  log_bike_count             496827 non-null  float64 

### Define Features and Target

In [5]:
# Target: the log-transformed hourly count.
y = data['log_bike_count']

# Features: every remaining column. Both the raw count and its log version are
# removed so the target cannot leak into the feature matrix.
# NOTE(review): site_mean_count / site_std_count look like per-site statistics of
# the count itself -- confirm they were not computed using rows that end up in the
# test set.
X = data.drop(
    columns=['bike_count', 'log_bike_count'],
)

In [6]:
# Hold out 20% of the rows as a test set; the seed is fixed for reproducibility.
# NOTE(review): rows are timestamped observations per counter, and a shuffled
# split mixes earlier and later timestamps across train and test -- consider a
# time-based split if the goal is forecasting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the resulting shapes as a quick sanity check.
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Training set size: (397461, 19)
Test set size: (99366, 19)


In [7]:
# The 80/20 split (random_state=42) is already performed in the previous cell and
# train_test_split is already imported at the top of the notebook, so neither is
# repeated here. Instead, verify that the split is internally consistent.
assert len(X_train) + len(X_test) == len(X), (
    "train/test rows do not add up to the full dataset"
)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test), (
    "features and target have different lengths"
)
# Features and targets must refer to the same rows, in the same order.
assert X_train.index.equals(y_train.index) and X_test.index.equals(y_test.index), (
    "features and target are misaligned"
)

In [8]:
# Columns with category / datetime dtypes (see data.info()) that are not fed to
# the model. Defined once so the train and test frames cannot drift apart.
NON_FEATURE_COLUMNS = [
    'counter_id',
    'counter_name',
    'site_name',
    'date',
    'counter_installation_date',
    'counter_technical_id',
    'coordinates',
]

# errors='ignore' skips labels that are already absent, so re-running this cell
# on frames that were already reduced does not raise.
X_train = X_train.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
X_test = X_test.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

In [9]:
# Display the training features to check which columns remain.
X_train

Unnamed: 0,site_id,latitude,longitude,year,month,day_of_week,hour,site_mean_count,site_std_count,is_weekend,is_rush_hour,is_holiday
237173,100056226,48.830331,2.400551,2021,2,6,20,23.401103,26.561832,1,0,0
470041,100063175,48.885290,2.326660,2021,4,2,22,61.485402,80.921083,0,0,0
61785,100042374,48.848400,2.275860,2021,8,1,1,50.509039,70.476523,0,0,0
152034,100047547,48.826360,2.303030,2021,8,3,1,48.719746,62.971571,0,0,0
420821,100057380,48.864620,2.314440,2020,10,6,8,73.285619,90.637563,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
259178,100056327,48.864610,2.409690,2021,8,0,3,28.094217,34.684207,0,0,0
365838,100056335,48.862880,2.311790,2021,6,3,20,87.359706,100.798061,0,0,0
131932,100047546,48.829523,2.386990,2021,7,2,16,40.048195,58.893451,0,1,1
146867,100047547,48.826360,2.303030,2021,4,5,9,48.719746,62.971571,1,1,0


### Linear Model (Baseline)

In [10]:
# Baseline: ordinary least squares fitted on the log-transformed target.
# LinearRegression is imported with the other dependencies at the top of the notebook.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions are on the same log scale as y_train / y_test.
y_pred_log = model.predict(X_test)

# Map predictions and ground truth back to raw counts with expm1.
# NOTE(review): this assumes log_bike_count = log1p(bike_count), which matches
# the sample rows (1.0 -> 0.693147) -- confirm against the preprocessing step.
# NOTE(review): a linear model can predict negative log values, and expm1 of a
# negative number is a negative count; consider clipping predictions at 0.
y_pred = np.expm1(y_pred_log)
y_actual = np.expm1(y_test)

# RMSE computed on the original count scale.
rmse = root_mean_squared_error(y_actual, y_pred)
print(f"Baseline RMSE (original scale): {rmse}")

Baseline RMSE (original scale): 73.13552957643445
