# imports

In [2]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import pmdarima as pm
import warnings
from pmdarima.arima import ARIMA
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

In [3]:
# Installs a blanket "ignore" filter: every warning category is silenced for
# the rest of the session, including anything emitted while models are fitted.
warnings.filterwarnings("ignore")

# Loading the dataset

In [2]:
# Parquet file that is passed to load_data below.
INPUT_PATH = 'data/labels.parquet'
# Parquet file that save_val_predicted writes the predictions to.
OUTPUT_PATH = 'data/arima_predict.parquet'

In [3]:
def load_data(path):
    """Read the parquet file at ``path`` with the pyarrow engine and return it."""
    return pd.read_parquet(path, engine='pyarrow')

In [4]:
# Table read from INPUT_PATH; preprocessing reads its Location and Demand columns.
rides_df = load_data(INPUT_PATH)

# Preprocessing 

In [5]:
def preprocessing(rides_df):
    """Reshape long-format demand rows into one column per location.

    Args:
        rides_df: DataFrame with a ``Location`` and a ``Demand`` column. Any
            index is accepted. Rows of one location keep their relative order;
            whether that order is chronological depends on the input file and
            should be confirmed against whatever produced it.

    Returns:
        DataFrame with one ``LocationID_<id>`` column per distinct location,
        in order of first appearance, holding that location's demand values.
        An empty input gives an empty frame with no columns.

    Raises:
        ValueError: if the locations do not all have the same number of rows,
            because the columns could not be aligned into one frame.
    """
    # Grouping replaces a row-by-row scan that assumed the first location was 1
    # and that the index was exactly 0..n-1; both assumptions made otherwise
    # valid inputs fail. sort=False keeps the locations in input order.
    demand_by_location = rides_df.groupby('Location', sort=False)['Demand']

    loc_ts = {
        f'LocationID_{location}': demand.tolist()
        for location, demand in demand_by_location
    }

    return pd.DataFrame(loc_ts)

In [6]:
# Build the one-column-per-location table, print its shape, and display the
# first rows as the cell's output.
loc_labels_df = preprocessing(rides_df)
print(f'loc_labels_df shape : {loc_labels_df.shape}')
loc_labels_df.head()

loc_labels_df shape : (120, 262)


Unnamed: 0,LocationID_1,LocationID_2,LocationID_3,LocationID_4,LocationID_5,LocationID_6,LocationID_7,LocationID_8,LocationID_9,LocationID_10,...,LocationID_256,LocationID_257,LocationID_258,LocationID_259,LocationID_260,LocationID_261,LocationID_262,LocationID_263,LocationID_264,LocationID_265
0,40.0,0.0,0.0,174.0,3.0,1.0,126.0,1.0,0.0,47.0,...,125.0,5.0,5.0,1.0,48.0,510.0,837.0,1942.0,1176.0,124.0
1,31.0,0.0,2.0,32.0,2.0,3.0,39.0,0.0,2.0,36.0,...,13.0,2.0,4.0,1.0,15.0,432.0,758.0,1328.0,1110.0,59.0
2,27.0,0.0,2.0,51.0,2.0,1.0,48.0,0.0,1.0,53.0,...,17.0,2.0,5.0,0.0,14.0,338.0,1344.0,1746.0,1249.0,54.0
3,7.0,0.0,1.0,43.0,2.0,1.0,38.0,0.0,1.0,46.0,...,17.0,2.0,4.0,5.0,22.0,432.0,1479.0,1967.0,1294.0,51.0
4,15.0,0.0,2.0,42.0,2.0,5.0,49.0,1.0,3.0,43.0,...,12.0,2.0,3.0,4.0,20.0,436.0,1552.0,2035.0,1393.0,45.0


# ARIMA MODEL

## train 

In [7]:
# Column labels of the per-location table; predict_all_location iterates over them.
col = loc_labels_df.columns
# Fraction of rows that split_data assigns to the training part; the rest is
# used for testing.
# NOTE(review): 0.2 means only the first 20% of each series is used for the
# initial fit — confirm this is intended and not a swapped train/test ratio.
train_size_ratio = 0.2

In [8]:
def split_data(loc_labels_df, location, train_size_ratio):
    """Split one location's series into leading train and trailing test parts.

    Args:
        loc_labels_df: DataFrame with one column per location.
        location: label of the column to split.
        train_size_ratio: fraction of rows (0..1) that go to the training part;
            the row count is truncated to an integer.

    Returns:
        Tuple ``(val_tr, val_te)`` of Series. ``val_tr`` holds the first
        ``int(len * ratio)`` rows and ``val_te`` the remaining ones. The two
        parts never share a row and keep their original index labels.
    """
    series = loc_labels_df[location]
    size = int(len(series) * train_size_ratio)

    # Positional slicing is end-exclusive. The label-based ``.loc[0:size]``
    # used before included row ``size`` in the training part as well as in
    # the test part, and only worked on a 0..n-1 index.
    val_tr = series.iloc[:size]
    val_te = series.iloc[size:]
    return (val_tr, val_te)

In [9]:
def arima_forecast(val_tr, val_te):
    """Walk-forward, one-step-ahead ARIMA forecast over ``val_te``.

    The (p, d, q) order is chosen once by ``auto_arima`` on the training
    values. For every test row a fresh ARIMA of that order is fitted on all
    values seen so far, the next value is forecast, and the true observation
    is then appended to the history.

    Args:
        val_tr: iterable of training values used for the order search and as
            the initial history.
        val_te: pandas Series of test observations.

    Returns:
        Tuple ``(actual, predictions, history, fitted)``:
        ``actual`` is ``val_te`` with a fresh 0..n-1 index, ``predictions`` is
        a list of forecasts converted with ``int()``, ``history`` is the
        training values followed by all test observations, and ``fitted`` is
        the fitted values of the last model that was fitted.
    """
    history = list(val_tr)
    predictions = []
    model = pm.arima.auto_arima(
        history, start_p=1, start_q=1,
        test='kpss', max_p=8, max_q=8,
        seasonal=False, m=1,
        d=None, start_P=0,
        suppress_warnings=False, trace=False)
    # The selected order does not change between steps, so look it up once
    # instead of on every iteration.
    order = model.get_params().get("order")

    best_arima_model_fit = None
    for t in range(len(val_te)):
        best_arima_model_fit = pm.arima.ARIMA(order=order).fit(history)
        output = best_arima_model_fit.predict(n_periods=1)
        yhat = output[0]
        # int() truncates toward zero rather than rounding to nearest.
        predictions.append(int(yhat))
        history.append(val_te.iloc[t])

    # With an empty test set the loop never runs; fit once on the training
    # history so the return below still has a model to take fitted values from.
    if best_arima_model_fit is None:
        best_arima_model_fit = pm.arima.ARIMA(order=order).fit(history)

    # reset_index() moves the old index into column 0; column 1 holds the values.
    val_te = val_te.reset_index()
    return (val_te.iloc[:, 1], predictions,
            history, best_arima_model_fit.fittedvalues())


Train and predict for all location IDs.


In [10]:
def rmse(val_te, predictions):
    """Return the root-mean-squared error of ``predictions`` against ``val_te``."""
    squared_error = mean_squared_error(val_te, predictions)
    return sqrt(squared_error)

In [11]:
def mape(val_te, predictions):
    """Return the mean absolute percentage error of ``predictions`` against ``val_te``.

    NOTE(review): the demand table shown earlier in this notebook contains
    zero values; check how ``mean_absolute_percentage_error`` treats zeros in
    ``val_te`` before comparing this score across locations.
    """
    percentage_error = mean_absolute_percentage_error(val_te, predictions)
    return percentage_error

In [12]:
def predict_all_location(loc_labels_df, col, train_size_ratio):
    """Split, forecast and score every location column in ``col``.

    Args:
        loc_labels_df: DataFrame with one column per location.
        col: iterable of column labels to process.
        train_size_ratio: ratio forwarded to ``split_data``.

    Returns:
        Tuple ``(val_test, val_predicted, val_fit, all_loc_rmse, all_loc_mape)``:
        three DataFrames keyed by location (observed test values, forecasts,
        fitted values) followed by the per-location RMSE and MAPE lists, in
        the order of ``col``.
    """
    rmse_scores = []
    mape_scores = []
    observed_by_loc = {}
    forecast_by_loc = {}
    fitted_by_loc = {}

    for location in col:
        train_part, test_part = split_data(loc_labels_df, location, train_size_ratio)
        # The third element (the accumulated history) is not used here.
        observed, forecast, _history, fitted = arima_forecast(train_part, test_part)

        location_rmse = rmse(observed, forecast)
        location_mape = mape(observed, forecast)
        rmse_scores.append(location_rmse)
        mape_scores.append(location_mape)

        observed_by_loc[location] = observed
        forecast_by_loc[location] = forecast
        fitted_by_loc[location] = fitted

    observed_df = pd.DataFrame(observed_by_loc)
    fitted_df = pd.DataFrame(fitted_by_loc)
    forecast_df = pd.DataFrame(forecast_by_loc)
    return (observed_df, forecast_df, fitted_df, rmse_scores, mape_scores)

In [None]:
# Runs the full pipeline for every column in `col`. arima_forecast refits a
# model for each test row of each location, so this is the expensive cell.
val_test, val_predicted, val_fit, all_loc_rmse, all_loc_mape = predict_all_location(loc_labels_df, col, train_size_ratio)

## evaluation

In [None]:
def minmax_error(val_predicted, val_test):
    """Summarise the absolute forecast error per column.

    Args:
        val_predicted: DataFrame of forecasts.
        val_test: DataFrame of observed values, aligned with ``val_predicted``.

    Returns:
        DataFrame indexed by the input columns with a ``min`` and a ``max``
        column holding the smallest and largest absolute error of each column.
    """
    abs_error = (val_predicted - val_test).abs()
    smallest = abs_error.min(axis=0)
    largest = abs_error.max(axis=0)
    # keys= labels the two concatenated Series as the output columns.
    return pd.concat([smallest, largest], axis=1, sort=False, keys=['min', 'max'])

In [None]:
# Smallest and largest absolute forecast error per location; show the first rows.
min_max_error = minmax_error(val_predicted, val_test)
min_max_error.head()

# Save file

In [24]:
def save_val_predicted(dataset, path):
    """Write ``dataset`` to ``path`` as a parquet file, leaving out the index.

    Args:
        dataset: object exposing ``to_parquet`` (a pandas DataFrame in this
            notebook).
        path: destination file path.

    Returns:
        None.
    """
    # to_parquet's return value is not needed when writing to a path, so it
    # is no longer bound to an unused local.
    dataset.to_parquet(path, index=False)

In [25]:
# Persist the per-location forecasts to OUTPUT_PATH.
save_val_predicted(val_predicted, OUTPUT_PATH)