In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error,
    r2_score,
    precision_score,
    recall_score
)
import numpy as np
import pandas as pd
import joblib
import pickle
import datetime

# Machine Learning Investigation

#### The following notebook is how we downloaded, cleaned and optimised our data before choosing the appropriate model to use in our Application.

### Downloading and Cleaning
The final merged dataset was downloaded from Brightspace and investigated.

In [39]:
# Read the merged dataset from the CSV file in the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="final_merged_data.csv")

### Dataset

This dataset has 78 keys that we can use for prediction. However, when we are calculating predictions for the user in the Application, we will be using the open weather API, thus we need to make sure explanatory variables we choose from the dataset are highly comparable to explanatory variables from the open weather API.

Of the 78 possible variables, many are very different to the variables in our API. The mismatch between the two datasets falls into 3 main categories:
- (i)               Too granular for API: Information like is_renting or is_returning is very in depth information that is not provided by even v3 of the API, although it would be useful if it was.
- (ii)            Broad information: Information like year, address, etc. is too broad and although we can generate corresponding information it is encoded better by other variables.
- (iii)            Irrelevant information / Not provided by API: Most of the information is irrelevant and correspondingly not provided by the API. Over 60 metrics are indicators of soil, earth, grass, etc. quality and temperature. These have no logical link to the number of bikes available.

In the end we identified 5 explanatory variables that are different enough to include but important when discussing the probability of bike availability for each station.


In [None]:
# The dataset stores a min and a max value per reading, whereas the Open Weather API
# reports a single value. The midpoint of min and max is used as the most comparable
# value for temperature, and the same is done for humidity and pressure.
data = data.assign(
    temperature=(data["max_air_temperature_celsius"] + data["min_air_temperature_celsius"]) / 2,
    humidity=(data["max_relative_humidity_percent"] + data["min_relative_humidity_percent"]) / 2,
    pressure=(data["max_barometric_pressure_hpa"] + data["min_barometric_pressure_hpa"]) / 2,
)

data = data[["num_bikes_available", "station_id", "year", "month", "day", "hour", "temperature", "humidity", "pressure"]]

# Time is recorded in 5 minute intervals in the dataset, but our API can only do 1-3 hour blocks depending on the call.
# Keeping only the hour would leave many rows with the same timestamp even though they differ on a minute scale.
# This can introduce bias into our model, so we group by hour and take the mean of every column for each
# station / date / hour combination.
data = data.groupby(["station_id", "year", "month", "day", "hour"]).mean().reset_index()

# The hourly mean is generally fractional; round it back to a whole number of bikes.
data["num_bikes_available"] = data["num_bikes_available"].round()

# Derive a binary weekday flag from the calendar date (further information is available in the report).
# dayofweek is 0 for Monday through 6 for Sunday, so values below 5 are weekdays (1) and
# Saturday / Sunday are weekend days (0).
calendar_date = pd.to_datetime(data[["year", "month", "day"]])
data["is_weekday"] = (calendar_date.dt.dayofweek < 5).astype("int64")

# The final dataset
data = data[["num_bikes_available", "station_id", "is_weekday", "hour", "temperature", "humidity", "pressure"]]

In [42]:
# Preview the first five rows of the cleaned dataset.
data.head(n=5)

Unnamed: 0,num_bikes_available,station_id,is_weekday,hour,temperature,humidity,pressure
0,20.0,1,0,0,13.89,84.475,1002.4225
1,20.0,1,0,5,12.235,87.95,1000.2
2,22.0,1,0,6,11.263333,89.583333,1000.108333
3,25.0,1,0,7,11.236667,89.633333,1000.151667
4,30.0,1,0,8,11.68,86.3,1000.325


### Machine Learning
#### Model selection
There are many models of differing flexibility that we could choose for the machine learning portion of this application. We use five of the smaller ones, as SVMs or neural networks would produce .pkl files that are too large for our EC2 instance. For Random Forest we choose 50 estimators following advice from Stack Overflow (https://stackoverflow.com/questions/60768008/how-to-choose-n-estimators-in-randomforestclassifier); by the same logic, we choose 5 neighbors for KNN.

In [None]:
# Candidate regressors, keyed by the label shown in the performance report.
models = dict(
    LinearRegression=LinearRegression(),
    DecisionTree=DecisionTreeRegressor(random_state=42),
    RandomForest=RandomForestRegressor(n_estimators=50, random_state=42),
    KNN=KNeighborsRegressor(n_neighbors=5),
    GradientBoosting=GradientBoostingRegressor(random_state=42),
)

# Standard regression metrics (MAE and R^2) plus precision / recall, as suggested by our lecture notes.
metrics = ["mae", "r2", "precision", "recall"]

# For every model, one initially empty list of scores per metric.
model_performance = {
    model_name: dict((metric_name, []) for metric_name in metrics)
    for model_name in models
}

# Distinct station ids present in the cleaned dataset.
unique_ids = data["station_id"].unique()

### Metrics
##### MAE (Mean Absolute Error):
- Represents the average of the absolute errors between predicted and actual values.
- A lower MAE is better as it indicates that the model's predictions are closer to the actual values.

##### R² (R-squared):
- Indicates how well the model's predictions match the actual data.
- An R² closer to 1 indicates a better fit, with 0 implying no explanatory power and negative values suggesting worse performance than a simple mean model.

##### Precision:
- The proportion of true positive predictions out of all positive predictions made by the model.
- Higher precision indicates fewer false positives, meaning the model is good at identifying only relevant predictions.

##### Recall:
- The proportion of true positive predictions out of all actual positive instances.
- Higher recall means fewer false negatives, indicating that the model is good at identifying all positive instances.

In [None]:
# Explanatory variables used by every per-station model.
feature_columns = ["is_weekday", "hour", "temperature", "humidity", "pressure"]

for station_id in unique_ids:
    # For evaluation, a separate model is trained and tested for each unique station.
    # For each station, the relevant features are extracted and the data is partitioned
    # with a 70-30 train-test split.
    id_data = data[data["station_id"] == station_id]

    X = id_data[feature_columns]
    y = id_data["num_bikes_available"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Binary "at least one bike available" target for precision / recall.
    # It does not depend on the model, so it is computed once per station.
    y_test_bin = (y_test > 0).astype(int)

    for name, model in models.items():
        # The same estimator instance is refitted on each station's training data.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # NOTE(review): the target was rounded to whole bikes during cleaning, but y_pred is
        # continuous, so any prediction above 0 (e.g. 0.2 bikes) counts as "available".
        # Confirm that this threshold is intended.
        y_pred_bin = (y_pred > 0).astype(int)

        station_scores = {
            "mae": mean_absolute_error(y_test, y_pred),
            "r2": r2_score(y_test, y_pred),
            # zero_division=0 scores a station as 0 instead of warning when a denominator is zero.
            "precision": precision_score(y_test_bin, y_pred_bin, zero_division=0),
            "recall": recall_score(y_test_bin, y_pred_bin, zero_division=0),
        }
        for metric, value in station_scores.items():
            model_performance[name][metric].append(value)

# Output average performance: the unweighted mean of each metric over all stations.
print(" Average Performance Across All Stations:")
for name, scores in model_performance.items():
    print(f" Model: {name}")
    for metric in metrics:
        avg_score = np.mean(scores[metric])
        print(f"  {metric.upper():<9}: {avg_score:.4f}")

 Average Performance Across All Stations:
 Model: LinearRegression
  MAE      : 5.6490
  R2       : 0.1550
  PRECISION: 0.9013
  RECALL   : 0.9793
 Model: DecisionTree
  MAE      : 3.5598
  R2       : 0.3448
  PRECISION: 0.9334
  RECALL   : 0.9368
 Model: RandomForest
  MAE      : 3.2262
  R2       : 0.6248
  PRECISION: 0.9014
  RECALL   : 0.9825
 Model: KNN
  MAE      : 4.0989
  R2       : 0.4495
  PRECISION: 0.9044
  RECALL   : 0.9816
 Model: GradientBoosting
  MAE      : 3.8314
  R2       : 0.5351
  PRECISION: 0.9030
  RECALL   : 0.9792


### Model Analysis
##### Linear Regression
- MAE is relatively high, indicating that the model's predictions are off by an average of ~5.65 units.
- R² is very low (0.155), meaning the model is explaining very little of the variance in the data.
- Precision and Recall are decent, but the low R² suggests that this model is not a great fit for the data. The high Recall is a positive, indicating the model does a good job of identifying most positive instances, but the MAE and low R² show that it's not very accurate.

##### Decision Tree
- The MAE is lower than Linear Regression, indicating the model is predicting values more accurately.
- The R² is higher than Linear Regression, but still not great. It shows that the model explains about 34% of the variance in the data.
- Precision is higher than Linear Regression (0.9334 vs 0.9013), meaning this model produces fewer false positives, but Recall is lower (0.9368 vs 0.9793), meaning it misses more of the relevant instances.

##### Random Forest
- MAE is lower than both Linear Regression and Decision Tree, showing better predictive accuracy.
- R² is quite strong at 0.6248, indicating that this model is explaining more than 60% of the variance in the data. This is a significant improvement.
- Precision (0.9014) is lower than Decision Tree's (0.9334) and on par with Linear Regression's, while Recall is the highest of all models at 0.9825 (almost perfect at identifying relevant instances).
- This model is performing very well across multiple metrics, particularly in terms of R², which shows it’s a much better fit than simpler models.

##### KNN
- The MAE is higher than Random Forest and Decision Tree, but lower than Linear Regression. It’s not the best in terms of accuracy.
- R² is also weaker than Random Forest, but still explains about 45% of the variance, which is decent.
- Precision and Recall are both high: Precision (0.9044) is below Decision Tree's but marginally above Random Forest's, and Recall (0.9816) is above Decision Tree's and just below Random Forest's. The model identifies most positive instances.
- While decent overall, KNN doesn't perform as well as Random Forest in terms of R².

##### Gradient Boosting
- The MAE (3.83) is slightly lower than KNN's (4.10), but higher than Random Forest and Decision Tree.
- The R² (0.5351) is better than Decision Tree and KNN, but not as good as Random Forest.
- Precision and Recall are strong, with Recall close to the highest (0.9792). However, its Precision is slightly lower than Decision Tree.
- Gradient Boosting performs better than Decision Tree in terms of R² but not MAE (3.83 vs 3.56), and it lags behind Random Forest on both R² and MAE.

##### Summary
- Random Forest is the best overall, with the lowest MAE, the highest R² and very good Recall.


### Building the Model
Using the Random Forest Regressor, we will now create a .pkl file for each station to predict its bike availability.

In [48]:
# Train one Random Forest per station and pickle it to avail_station_<station_id>.pkl
# in the current working directory.
for station_id in unique_ids:
    id_data = data[data["station_id"] == station_id]
    X = id_data[["is_weekday", "hour", "temperature", "humidity", "pressure"]]
    y = id_data["num_bikes_available"]

    # Only the 70% training part of the split is used to fit the saved model;
    # the held-out part is not used in this cell.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    model = RandomForestRegressor(n_estimators=50, random_state=42)
    model.fit(X_train, y_train)

    model_filename = f"avail_station_{station_id}.pkl"
    with open(model_filename, "wb") as file:
        pickle.dump(model, file)

    print(f"Model saved to {model_filename} for station {station_id}")



Model saved to avail_station_1.pkl for station 1
Model saved to avail_station_2.pkl for station 2
Model saved to avail_station_3.pkl for station 3
Model saved to avail_station_4.pkl for station 4
Model saved to avail_station_5.pkl for station 5
Model saved to avail_station_6.pkl for station 6
Model saved to avail_station_7.pkl for station 7
Model saved to avail_station_8.pkl for station 8
Model saved to avail_station_9.pkl for station 9
Model saved to avail_station_10.pkl for station 10
Model saved to avail_station_11.pkl for station 11
Model saved to avail_station_12.pkl for station 12
Model saved to avail_station_13.pkl for station 13
Model saved to avail_station_14.pkl for station 14
Model saved to avail_station_15.pkl for station 15
Model saved to avail_station_16.pkl for station 16
Model saved to avail_station_17.pkl for station 17
Model saved to avail_station_18.pkl for station 18
Model saved to avail_station_19.pkl for station 19
Model saved to avail_station_20.pkl for station 2