In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import csv
from sklearn.metrics import mean_absolute_error
from datetime import datetime
import pickle
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima

In [36]:
# Create the MAE log as a header-only CSV with the three columns later cells append to.
# NOTE(review): to_csv replaces any file already at this path, so re-running this
# cell discards previously logged rows — confirm that a reset is intended.
df = pd.DataFrame(columns=['Date', 'Incident Type', 'MAE'])
df.to_csv('mae_tracking.csv', index=False)


In [15]:
# Read the incident table and collect the distinct values of its "Incidents" column;
# the per-type training loop further down iterates over `incident_types`.
data = pd.read_csv("../data/data_v0.2_intermediate(for checking)_with_status.csv")
incident_types = data["Incidents"].unique()

In [39]:
DEFAULT_DATA_PATH = "../data/data_v0.2_intermediate(for checking)_with_status.csv"


def fetch_data(incident_type=False, path=DEFAULT_DATA_PATH):
    """Load the incident table, optionally restricted to one incident type.

    Parameters
    ----------
    incident_type : str or False, default False
        Value to match against the ``Incidents`` column. Any falsy value
        (the default) returns every row unfiltered.
    path : str, default DEFAULT_DATA_PATH
        CSV file to read. Exposed as a parameter so the function can be
        pointed at another extract (or a test fixture) without editing it.

    Returns
    -------
    pandas.DataFrame
        The rows read from ``path``; when filtered, an independent copy of
        the matching rows.
    """
    # to edit to load from sql server
    data = pd.read_csv(path)
    if incident_type:
        # .copy() detaches the filtered rows from the parent frame so that
        # callers assigning columns on the result do not write into a slice.
        data = data.loc[data['Incidents'] == incident_type].copy()
    return data
# Unfiltered table (every incident type).
# NOTE(review): this name shadows the built-in all() for the rest of the session.
all = fetch_data()


Unnamed: 0,IncidentID,Open,Add,Close,Year,DayOfYear,Month,DayOfWeek,time_period,Building,Location,Latitude,Longitude,Incidents,User
0,1,2019-02-13 13:05:00,2.701401,2019-02-16 05:55:01,2019,44,2,2,day_weekday,SRC MINI GRAND STAND,YIH,1.298646,103.77774,LOST AND FOUND,sec3
1,2,2019-03-02 13:55:00,2.455224,2019-03-05 00:50:31,2019,61,3,5,weekend,PGP RESIDENCES (BLOCK 13),PGP,1.290868,103.780276,LOST AND FOUND,sec3
2,3,2019-03-18 03:05:00,2.855855,2019-03-20 23:37:26,2019,77,3,0,day_weekday,PGP RESIDENCES CANTEEN 2,PGP,1.29062,103.781695,LOST AND FOUND,sec3
3,4,2018-11-16 16:42:00,2.284904,2018-11-18 23:32:16,2018,320,11,4,day_weekday,"KENT VALE APARTMENT BLK G, 115",KV,1.302494,103.770315,LOST AND FOUND,sec4
4,5,2018-09-28 14:03:00,6.85828,2018-10-05 10:38:55,2018,271,9,4,day_weekday,YALE FITNESS CENTER,UTown,1.307479,103.772585,DAMAGED PROPERTY,sec1


In [10]:
def engineer_features(data):
    """Aggregate incidents into weekly counts and split off the final year.

    The input frame is left untouched: the work is done on a derived frame,
    so the function can be called repeatedly on the same DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``Open`` column parseable by ``pd.to_datetime`` and
        an ``IncidentID`` column; non-null ``IncidentID`` values are counted.

    Returns
    -------
    weekly_data : pandas.Series
        Count of ``IncidentID`` per ``'W'`` resampling bin, indexed by ``Open``.
    train : pandas.Series
        Weeks whose label is on or before (last week - 1 year).
    test : pandas.Series
        Weeks whose label is after (last week - 1 year).

    Raises
    ------
    IndexError
        If ``data`` has no rows, because there is no last week to split on.
    """
    # Column selection, assign, sort_values and set_index each return a new
    # object, so the caller's frame keeps its original columns, dtypes and order.
    indexed = (
        data[['Open', 'IncidentID']]
        .assign(Open=lambda frame: pd.to_datetime(frame['Open']))
        .sort_values('Open')
        .set_index('Open')
    )

    weekly_data = indexed['IncidentID'].resample('W').count()

    # Hold out everything after the date one year before the last observed week.
    last_date = weekly_data.index[-1]
    split_date_1_year = last_date - pd.DateOffset(years=1)

    in_train = weekly_data.index <= split_date_1_year
    train = weekly_data.loc[in_train]
    test = weekly_data.loc[~in_train]

    return weekly_data, train, test

In [40]:
# Weekly incident counts across all incident types, split into train and test series.
weekly_data, train, test = engineer_features(all)

In [25]:
def train_and_evaluate(model, train, test):
    """Fit ``model`` on ``train`` and score its forecast against ``test``.

    The model is fitted in place, asked for ``len(test)`` periods, and the
    mean absolute error between ``test`` and that forecast is computed.

    Returns
    -------
    tuple
        ``(model, mae)`` — the same model object after fitting, and the MAE.
    """
    model.fit(train)

    horizon = len(test)
    forecast_values = model.predict(n_periods=horizon)

    return model, mean_absolute_error(test, forecast_values)

def save_model(model, filename):
    """Pickle ``model`` to ``filename``, replacing any existing file there."""
    with open(filename, mode='wb') as sink:
        pickle.dump(model, sink)

In [None]:
# Train, score, log and persist one seasonal model per incident type.
for incident_type in incident_types:
    data = fetch_data(incident_type)
    weekly_data, train, test = engineer_features(data)

    # Run the order search on the training split only, so the MAE computed
    # on `test` below is a genuine hold-out score.
    # (Previously this referenced `weekly_incidents`, a name never defined.)
    model = auto_arima(train,
                       start_p=0, start_q=0,
                       max_p=0, max_q=0,      # non-seasonal AR/MA terms fixed at 0
                       m=52,                  # 52 weekly periods per seasonal cycle
                       start_P=0, start_Q=0,
                       max_P=2, max_Q=2,
                       seasonal=True,
                       d=0,
                       D=1,
                       trace=True,
                       error_action='ignore',
                       suppress_warnings=True,
                       stepwise=True)

    trained_model, mae = train_and_evaluate(model, train, test)

    # The log has three columns (Date, Incident Type, MAE); the row must
    # supply all three or the .loc assignment fails on a length mismatch.
    mae_df = pd.read_csv('mae_tracking.csv')
    mae_df.loc[len(mae_df)] = [datetime.now(), incident_type, mae]
    mae_df.to_csv('mae_tracking.csv', index=False)

    # Refit on the full weekly series (train + test) before saving.
    trained_model.fit(weekly_data)
    save_model(trained_model, f'sarima_model_{incident_type}.pkl')

In [21]:
# Single-type run of the pipeline: search, evaluate, log the MAE, refit, save.
incident_type = "LOST AND FOUND"
data = fetch_data(incident_type)
weekly_data, train, test = engineer_features(data)

# Stepwise search over seasonal orders only; non-seasonal p and q are pinned to 0.
model = auto_arima(
    train,
    # non-seasonal part
    start_p=0, max_p=0,
    start_q=0, max_q=0,
    d=0,
    # seasonal part, 52 periods per cycle
    seasonal=True,
    m=52,
    D=1,
    start_P=0, max_P=2,
    start_Q=0, max_Q=2,
    # search behaviour
    stepwise=True,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
)

trained_model, mae = train_and_evaluate(model, train, test)

# Append one (timestamp, type, score) row to the MAE log.
mae_df = pd.read_csv('mae_tracking.csv')
mae_df.loc[len(mae_df)] = [datetime.now(), incident_type, mae]
mae_df.to_csv('mae_tracking.csv', index=False)

# Refit on the full weekly series, then persist under a per-type filename.
trained_model.fit(weekly_data)
save_model(trained_model, f'sarima_model_{incident_type}.pkl')

Performing stepwise search to minimize aic
 ARIMA(0,0,0)(0,1,0)[52] intercept   : AIC=1193.538, Time=0.34 sec
 ARIMA(0,0,0)(1,1,0)[52] intercept   : AIC=1158.547, Time=2.54 sec
 ARIMA(0,0,0)(0,1,1)[52] intercept   : AIC=inf, Time=7.39 sec
 ARIMA(0,0,0)(0,1,0)[52]             : AIC=1198.328, Time=0.22 sec
 ARIMA(0,0,0)(2,1,0)[52] intercept   : AIC=1150.969, Time=10.25 sec
 ARIMA(0,0,0)(3,1,0)[52] intercept   : AIC=inf, Time=32.32 sec
 ARIMA(0,0,0)(2,1,1)[52] intercept   : AIC=inf, Time=37.51 sec
 ARIMA(0,0,0)(1,1,1)[52] intercept   : AIC=inf, Time=12.48 sec
 ARIMA(0,0,0)(3,1,1)[52] intercept   : AIC=inf, Time=34.70 sec
 ARIMA(0,0,0)(2,1,0)[52]             : AIC=1164.676, Time=5.82 sec

Best model:  ARIMA(0,0,0)(2,1,0)[52] intercept
Total fit time: 143.727 seconds


NameError: name 'train_and_evaluate' is not defined

In [23]:
# Evaluate the `model` currently in the kernel on the current `train`/`test`
# split, then append a (timestamp, incident type, MAE) row to the log.
# NOTE(review): relies on `model`, `train`, `test` and `incident_type` left over
# from an earlier cell — it will not run on a fresh kernel by itself.
trained_model, mae = train_and_evaluate(model, train, test)

mae_df = pd.read_csv('mae_tracking.csv')
mae_df.loc[len(mae_df)] = [datetime.now(),incident_type, mae]
mae_df.to_csv('mae_tracking.csv', index=False)



ValueError: could not convert string to float: '2018-08-07 09:26:34'

In [24]:
# Refit the evaluated model on the full weekly series (train and test together).
trained_model.fit(weekly_data)


NameError: name 'save_model' is not defined

In [28]:
# Persist the refitted model under a filename derived from the incident type.
save_model(trained_model, f'sarima_model_{incident_type}.pkl')

In [31]:
def forecast(incident_type, n_periods=52, output_path='forecast_plot.png'):
    """Plot a forecast from the saved model for ``incident_type`` and save it as an image.

    Parameters
    ----------
    incident_type : str
        Selects the model file ``sarima_model_{incident_type}.pkl``.
    n_periods : int, default 52
        Number of periods to forecast; also shown in the plot title.
    output_path : str, default 'forecast_plot.png'
        Where the figure is written. Pass a distinct path per incident type
        to avoid each call overwriting the previous plot.

    Returns
    -------
    None
        The only result is the image written to ``output_path``.

    Raises
    ------
    FileNotFoundError
        If no pickled model exists for ``incident_type``.
    """
    # NOTE(security): pickle.load can execute arbitrary code from the file.
    # Only load model files this project wrote itself.
    with open(f'sarima_model_{incident_type}.pkl', 'rb') as pkl:
        model = pickle.load(pkl)

    future_forecast = model.predict(n_periods=n_periods)

    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.plot(future_forecast, color='blue')
        ax.set_title(f'Future Forecast for {n_periods} Weeks')
        ax.set_xlabel('Time')
        ax.set_ylabel('Predicted Count')
        fig.savefig(output_path)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)