### IMPORTING NECESSARY LIBRARIES

In [134]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib as plt
import joblib
import pickle

In [136]:
df=pd.read_csv("final_merged_data.csv")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298946 entries, 0 to 298945
Data columns (total 78 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   last_reported                           298946 non-null  object 
 1   station_id                              298946 non-null  int64  
 2   num_bikes_available                     298946 non-null  int64  
 3   num_docks_available                     298946 non-null  int64  
 4   is_installed                            298946 non-null  bool   
 5   is_renting                              298946 non-null  bool   
 6   is_returning                            298946 non-null  bool   
 7   name                                    298946 non-null  object 
 8   address                                 298946 non-null  object 
 9   lat                                     298946 non-null  float64
 10  lon                                     2989

In [138]:
df.shape

(298946, 78)

### DATA PREPROCESSING

#### Converting `station_id` as categorical type for the models below so that it is not treated as a numerical value

In [142]:
df['station_id']=df['station_id'].astype('category')

#### Checking the unique values of fields to analyze the cardinalities 

In [192]:
df['station_id'].unique()

[10, 100, 109, 11, 114, ..., 87, 92, 96, 99, 70]
Length: 115
Categories (115, int64): [1, 2, 3, 4, ..., 114, 115, 116, 117]

In [147]:
df['day'].unique()


array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
      dtype=int64)

In [149]:
df['hour'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23], dtype=int64)

#### Encoding the `hour` values as cyclic using sin and cos so that the model understands the 24-hour pattern

In [152]:
# https://harrisonpim.com/blog/the-best-way-to-encode-dates-times-and-other-cyclical-features
df['Hour_sin'] = np.sin(2 * np.pi * df['hour'] / 23)
df['Hour_cos'] = np.cos(2 * np.pi * df['hour'] / 23)

#### Referred from: https://saturncloud.io/blog/pandas-get-day-of-week-from-date-type-column/#2

In [155]:
df['date']=pd.to_datetime(df[['year','month','day']],errors='coerce')
print(df['date'])

0        2024-12-01
1        2024-12-01
2        2024-12-01
3        2024-12-01
4        2024-12-01
            ...    
298941   2024-12-31
298942   2024-12-31
298943   2024-12-31
298944   2024-12-31
298945   2024-12-31
Name: date, Length: 298946, dtype: datetime64[ns]


In [157]:
# https://pandas.pydata.org/docs/reference/api/pandas.DatetimeIndex.dayofweek.html
df['day_of_week']=df['date'].dt.dayofweek

In [159]:
print(df['day_of_week'].unique())

[6 0 1 2 3 4 5]


#### Creating a field `is_weekday` to encode weekdays as `0` and weekends as `1`

In [162]:
df['is_weekday'] = df['day_of_week'].apply(lambda x:0 if x<5 else 1)

df['is_weekday'].unique()

### Feature Selection for the model
- The model is fed with features such as `station_id` a location-based feature ,`max_air_temperature_celsius` and `max_relative_humidity_percent` as weather-based features and `Hour_sin`, `Hour_cos`and `is_weekday` as time-based features
-  The aim is to create a global model to predict the `availability of bikes`

### DEFINING FEATURES AND TARGET FOR THE MODEL

In [195]:
features=['station_id','max_air_temperature_celsius','max_relative_humidity_percent','Hour_sin','Hour_cos','is_weekday']
target='num_bikes_available'


In [197]:
X = df[features]
y = df[target]

#### SPLITTING THE DATA INTO TEST AND TRAINING DATA

In [170]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### LINEAR REGRESSION MODEL

#### Referred from the source code provided in the lectures

In [201]:
model = LinearRegression()
model.fit(X_train, y_train)

In [203]:
y_pred = model.predict(X_test)

In [205]:
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"R² Score: {r2}")

Mean Absolute Error: 8.143639345584965
R² Score: 0.0003843833144947517


In [207]:
print("\nModel Coefficients:")
for feature, coef in zip(features, model.coef_):
    print(f"{feature}: {coef}")
print(f"Intercept: {model.intercept_}")


Model Coefficients:
station_id: -0.0001796111048468674
max_air_temperature_celsius: 0.015243035627137745
max_relative_humidity_percent: 0.007677486926593067
Hour_sin: 0.045566526473427585
Hour_cos: 0.2838383918796009
is_weekday: 0.2151610266791098
Intercept: 11.445657288038356


In [209]:
model_filename = "bike_availability_model.joblib"
joblib.dump(model, model_filename)

['bike_availability_model.joblib']

In [211]:
print(f"Model saved to {model_filename}")
model_filename = "bike_availability_model.pkl"
with open(model_filename, "wb") as file:
    pickle.dump(model, file)

print(f"Model saved to {model_filename}")

Model saved to bike_availability_model.joblib
Model saved to bike_availability_model.pkl


### Analysis of the model so far
- The r2 score is near to 0 and mean absolute error is very large
- The model doesnt seem to be a good fit for predicting `availability of bikes`
- Therefore also testing the random forest regressor model

### RANDOM FOREST REGRESSOR MODEL

In [217]:
regressor = RandomForestRegressor(n_estimators=100,random_state=0)
regressor = regressor.fit(X_train, y_train)
score = regressor.score( X_test,y_test)
score

0.8682956358985312

In [220]:
y_pred_rf = regressor.predict(X_test)
mac = mean_absolute_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)
print(f"MAE: {mac}, R²: {r2}")

# Feature importances:
importances = regressor.feature_importances_
for feature, imp in zip(features, importances):
    print(f"{feature}: {imp}")

MAE: 2.075444812845526, R²: 0.8682956358985312
station_id: 0.3982683511759866
max_air_temperature_celsius: 0.21825668316892174
max_relative_humidity_percent: 0.16886898528210983
Hour_sin: 0.06662066649970703
Hour_cos: 0.08651168061151267
is_weekday: 0.06147363326176229


In [222]:
importances = regressor.feature_importances_
feature_imp_df = pd.DataFrame({'Feature': features, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False) 
print(feature_imp_df)


                         Feature  Gini Importance
0                     station_id         0.398268
1    max_air_temperature_celsius         0.218257
2  max_relative_humidity_percent         0.168869
4                       Hour_cos         0.086512
3                       Hour_sin         0.066621
5                     is_weekday         0.061474


### Analysis of Random Forest Regressor
- The r2 score is : `0.86`and mae value is `2.07 ` which says that this model is a good fit comparing to the linear regressor model
- Better fit than linear regression


In [233]:
model_filename = "number_of_bikes_available_model.joblib"
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")


Model saved to number_of_bikes_available_model.joblib
Model saved to number_of_bikes_available_model.joblib


In [237]:
model_filename = "number_of_bikes_available_model.pkl"
with open(model_filename, "wb") as file:
    pickle.dump(model, file)

print(f"Model saved to {model_filename}")

Model saved to number_of_bikes_available_model.pkl
