# Create an ML model for predicting how many available bicis!

based on EDA discoveries

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

In [56]:
valenbisi = pd.read_csv("data/VALENBISI.csv")
valenbisi

Unnamed: 0,Direction,Number,Active,Free_bici,Free_stand,Total_stands,Ticket,Update_date,Latitude,Longitude,Folder_datetime
0,"Colón, 60",16,1,1,19,20,1,2024-01-19 00:09:53,39.470092,-0.370433,2024-01-19 00:15:01
1,Plaza de Tetuán,9,1,0,25,25,1,2024-01-19 00:09:53,39.474355,-0.369930,2024-01-19 00:15:01
2,Micer Mascó - Rodriguez Fornos,81,1,5,15,20,0,2024-01-19 00:09:53,39.475128,-0.360978,2024-01-19 00:15:01
3,General Elio - Llano del Real,83,1,0,25,25,1,2024-01-19 00:09:53,39.477585,-0.366970,2024-01-19 00:15:01
4,Blasco Ibañez - Mestre Ripoll,100,1,18,2,20,0,2024-01-19 00:09:53,39.471634,-0.338150,2024-01-19 00:15:01
...,...,...,...,...,...,...,...,...,...,...,...
11545222,Alcasser - Poeta Alberto Lista,265,1,5,10,15,0,2025-04-14 23:49:17,39.470973,-0.408117,2025-04-15 00:00:06
11545223,Ninot - Regino Mas,270,1,0,16,16,0,2025-04-14 23:49:17,39.500075,-0.392889,2025-04-15 00:00:06
11545224,San Francisco de Paula - Castell de Pop,274,1,13,2,15,0,2025-04-14 23:49:17,39.448070,-0.333188,2025-04-15 00:00:06
11545225,Valle de la Ballestera - Pio Baroja,244,1,9,11,20,0,2025-04-14 23:49:17,39.478506,-0.406136,2025-04-15 00:00:06


## Feature Engineering

In [None]:
# possible features to add: ######################################
# typical variations for each station (does it normally vary a lot or little)
# last number of free bikes
# last time updated

def feature_engineering(data):
    df = data.copy()
    # TIME
    # create hour of day as a sin/cosine feature
    df['hour_sin'] = np.sin(2 * np.pi * pd.to_datetime(df['Update_date']).dt.hour / 24)
    # season
    df['season'] = pd.to_datetime(df['Update_date']).dt.month % 12 // 3 + 1
    # weekday
    df['is_weekday'] = pd.to_datetime(df['Update_date']).dt.weekday < 5
    df['is_weekday'] = df['is_weekday'].astype(int)

    # since timedependent - add free-bici from timestep 

    df = df.drop(columns=['Update_date', 'Folder_datetime'])

    # dont need both free_stand and total_stand?

    # Direction is unique in feature Number
    df = df.drop(columns=['Direction'])

    return df

In [58]:
prep_valenbisi = feature_engineering(valenbisi)
prep_valenbisi 

Unnamed: 0,Number,Active,Free_bici,Free_stand,Total_stands,Ticket,Latitude,Longitude,hour_sin,season,is_weekday
0,16,1,1,19,20,1,39.470092,-0.370433,0.000000,1,1
1,9,1,0,25,25,1,39.474355,-0.369930,0.000000,1,1
2,81,1,5,15,20,0,39.475128,-0.360978,0.000000,1,1
3,83,1,0,25,25,1,39.477585,-0.366970,0.000000,1,1
4,100,1,18,2,20,0,39.471634,-0.338150,0.000000,1,1
...,...,...,...,...,...,...,...,...,...,...,...
11545222,265,1,5,10,15,0,39.470973,-0.408117,-0.258819,2,1
11545223,270,1,0,16,16,0,39.500075,-0.392889,-0.258819,2,1
11545224,274,1,13,2,15,0,39.448070,-0.333188,-0.258819,2,1
11545225,244,1,9,11,20,0,39.478506,-0.406136,-0.258819,2,1


# Models

In [3]:
import joblib

def save_model(model, X, filename):
    """Save the model to a file."""
    # .pkl
    joblib.dump((model, X.columns.tolist()), filename)
    #model, feature_names = joblib.load("model.pkl")

## XGBoost

In [59]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

#xgboost model on prep_valenbisi, with Free_bici as target variable
X = prep_valenbisi.drop(columns=['Free_bici'])
y = prep_valenbisi['Free_bici']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.1, max_depth=6)
model.fit(X_train, y_train)
# Evaluate the model
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


ImportError: sklearn needs to be installed in order to use this module

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Define features and target
X = prep_valenbisi.drop(columns=['Free_bici'])
y = prep_valenbisi['Free_bici']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² Score: {r2:.2f}")

save_model(model, X, "model.pkl")

Mean Squared Error (Random Forest): 0.95


In [4]:
#MAE # 0.98 ok