In [1]:
import numpy as np
import pandas as pd
import pickle
from statsmodels.tsa.statespace.sarimax import SARIMAX

from utils_functions import *

## Functions

In [2]:
def sarimax_forecast(riderships_data):
    """Generate a SARIMAX forecast for the next month based on ridership data.

    This function fits a SARIMAX model to the provided ridership data and 
    forecasts values for the number of days in the next month.

    Args:
        riderships_data (pd.DataFrame): A DataFrame containing a column 
            'number_of_riderships' representing the ridership time series data.

    Returns:
        pd.DataFrame: A DataFrame containing the forecasted values for the next month.
            Columns:
            - 'predicted_mean': Forecasted mean values.
            - 'lower_bound': Lower bound of the confidence interval.
            - 'upper_bound': Upper bound of the confidence interval.
    """
    all_data = riderships_data['number_of_riderships']

    sarimax_model = SARIMAX(all_data, order=(1, 1, 1), seasonal_order=(2, 0, 0, 7))
    sarimax_fitted = sarimax_model.fit()

    forecast_steps = days_in_next_month()
    forecast_obj = sarimax_fitted.get_forecast(steps=forecast_steps)
    forecast_mean = forecast_obj.predicted_mean
    conf_int = forecast_obj.conf_int(alpha=0.4)

    forecasts = pd.DataFrame({
        'predicted_mean': forecast_mean,
        'lower_bound': conf_int.iloc[:, 0],
        'upper_bound': conf_int.iloc[:, 1]
    })

    return forecasts

## Import Data

In [3]:
file_key = 'data-transformed/run-1731935541465-part-r-00000.csv'
mta_subway_df = read_s3_csv_to_dataframe(file_key)
mta_subway_df = mta_subway_df.sort_values(by=["station_complex_id", "created_date"], ascending=True)
mta_subway_df.set_index('created_date', inplace=True)
mta_subway_by_station = mta_subway_df[['station_complex_id', 'number_of_riderships']]
mta_subway_by_station = mta_subway_by_station[mta_subway_by_station.station_complex_id.isin([611])]

In [4]:
mta_subway_df.head()

Unnamed: 0_level_0,station_complex_id,station_complex,latitude,longitude,georeference,number_of_riderships,year_period
created_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-01-01,8,"5 Av/59 St (N,R,W)",40.764812,-73.97335,POINT (-73.97335 40.764812),8909.0,2023
2023-01-02,8,"5 Av/59 St (N,R,W)",40.764812,-73.97335,POINT (-73.97335 40.764812),7475.0,2023
2023-01-03,8,"5 Av/59 St (N,R,W)",40.764812,-73.97335,POINT (-73.97335 40.764812),9291.0,2023
2023-01-04,8,"5 Av/59 St (N,R,W)",40.764812,-73.97335,POINT (-73.97335 40.764812),10697.0,2023
2023-01-05,8,"5 Av/59 St (N,R,W)",40.764812,-73.97335,POINT (-73.97335 40.764812),10391.0,2023


In [5]:
mta_subway_by_station.head()

Unnamed: 0_level_0,station_complex_id,number_of_riderships
created_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01,610,36101.0
2023-01-02,610,40441.0
2023-01-03,610,87808.0
2023-01-04,610,93349.0
2023-01-05,610,90989.0


In [6]:
mta_subway_by_station.station_complex_id.unique()

array([610])

## Make Predictions to all stations

In [7]:
forecasts = mta_subway_by_station.groupby('station_complex_id').apply(sarimax_forecast)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.12217D+01    |proj g|=  1.39618D-01

At iterate    5    f=  1.11628D+01    |proj g|=  7.07077D-03

At iterate   10    f=  1.11575D+01    |proj g|=  2.06515D-03

At iterate   15    f=  1.11563D+01    |proj g|=  2.02076D-04

At iterate   20    f=  1.11563D+01    |proj g|=  3.16447D-05

At iterate   25    f=  1.11562D+01    |proj g|=  4.89619D-04

At iterate   30    f=  1.09077D+01    |proj g|=  4.37725D-02

At iterate   35    f=  1.08752D+01    |proj g|=  3.42860D-02

At iterate   40    f=  1.08706D+01    |proj g|=  1.05500D-02
  ys=-3.421E-01  -gs= 4.652E-03 BFGS update SKIPPED

At iterate   45    f=  1.07212D+01    |proj g|=  6.72501D-02

At iterate   50    f=  1.07161D+01    |proj g|=  1.36648D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations




In [8]:
forecasts = forecasts.reset_index()
forecasts.columns = ['station_complex_id', 'created_date', 'predicted_mean', 'lower_bound', 'upper_bound']
forecasts.set_index('created_date', inplace=True)
data_and_predictions = pd.concat([mta_subway_by_station, forecasts], axis=0)

In [9]:
mta_subway_df_unique = mta_subway_df.reset_index()[["station_complex_id", "station_complex", "latitude", "longitude"]].drop_duplicates(subset=["station_complex_id"])

In [10]:
final_data_to_save = pd.merge(data_and_predictions.reset_index(), mta_subway_df_unique, on="station_complex_id", how="left")

In [11]:
final_data_to_save.tail()

Unnamed: 0,created_date,station_complex_id,number_of_riderships,predicted_mean,lower_bound,upper_bound,station_complex,latitude,longitude
665,2024-10-27,610,,44786.563292,30625.658088,58947.468495,"Grand Central-42 St (S,4,5,6,7)",40.751778,-73.976845
666,2024-10-28,610,,65033.626645,50872.606416,79194.646875,"Grand Central-42 St (S,4,5,6,7)",40.751778,-73.976845
667,2024-10-29,610,,97517.407805,82429.587484,112605.228125,"Grand Central-42 St (S,4,5,6,7)",40.751778,-73.976845
668,2024-10-30,610,,105677.729576,90424.429542,120931.02961,"Grand Central-42 St (S,4,5,6,7)",40.751778,-73.976845
669,2024-10-31,610,,105719.562226,90434.714653,121004.4098,"Grand Central-42 St (S,4,5,6,7)",40.751778,-73.976845


## Save Predictions

In [12]:
final_data_to_save.dtypes

created_date            datetime64[ns]
station_complex_id               int64
number_of_riderships           float64
predicted_mean                 float64
lower_bound                    float64
upper_bound                    float64
station_complex                 object
latitude                       float64
longitude                      float64
dtype: object

In [13]:
save_dataframe_to_s3(final_data_to_save, bucket_name, 'predictions/riderships_predictions.csv')

DataFrame successfully saved to s3://mta-subway/predictions/riderships_predictions.csv
