# imports

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import pmdarima as pm
import warnings
from pmdarima.arima import ARIMA
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

In [2]:
warnings.filterwarnings("ignore")

# loading Dataset

In [3]:
INPUT_PATH = 'data/labels.parquet'
OUTPUT_PATH = 'data/arima_predict.parquet'

In [4]:
def load_data(path):
    dataset = pd.read_parquet(path, engine = 'pyarrow') 
    return dataset

In [5]:
rides_df = load_data(INPUT_PATH)

# Preprocessing 

In [6]:
def preprocessing(rides_df):
    loc_ts={}
    demand=[]
    pre_Location=1
    
    for i in range(len(rides_df)):
        
      if rides_df.Location[i]!=pre_Location:
        loc_ts[f'LocationID_{pre_Location}']=demand
        demand=[]
          
      demand.append(rides_df.Demand[i])
      pre_Location = rides_df.Location[i]
        
    loc_ts[f'LocationID_{pre_Location}'] = demand
    location_labels_df = pd.DataFrame(loc_ts)
    
    return location_labels_df

In [7]:
location_labels_df = preprocessing(rides_df)
print(f'location_labels_df shape : {location_labels_df.shape}')
location_labels_df.head()

location_labels_df shape : (120, 262)


Unnamed: 0,LocationID_1,LocationID_2,LocationID_3,LocationID_4,LocationID_5,LocationID_6,LocationID_7,LocationID_8,LocationID_9,LocationID_10,...,LocationID_256,LocationID_257,LocationID_258,LocationID_259,LocationID_260,LocationID_261,LocationID_262,LocationID_263,LocationID_264,LocationID_265
0,40.0,0.0,0.0,174.0,3.0,1.0,126.0,1.0,0.0,47.0,...,125.0,5.0,5.0,1.0,48.0,510.0,837.0,1942.0,1176.0,124.0
1,31.0,0.0,2.0,32.0,2.0,3.0,39.0,0.0,2.0,36.0,...,13.0,2.0,4.0,1.0,15.0,432.0,758.0,1328.0,1110.0,59.0
2,27.0,0.0,2.0,51.0,2.0,1.0,48.0,0.0,1.0,53.0,...,17.0,2.0,5.0,0.0,14.0,338.0,1344.0,1746.0,1249.0,54.0
3,7.0,0.0,1.0,43.0,2.0,1.0,38.0,0.0,1.0,46.0,...,17.0,2.0,4.0,5.0,22.0,432.0,1479.0,1967.0,1294.0,51.0
4,15.0,0.0,2.0,42.0,2.0,5.0,49.0,1.0,3.0,43.0,...,12.0,2.0,3.0,4.0,20.0,436.0,1552.0,2035.0,1393.0,45.0


# ARIMA MODEL

## train 

In [20]:
column = location_labels_df.columns
train_size_ratio = 0.12

In [9]:
def split_data(location_labels_df, location, train_size_ratio):
    size = int(len(location_labels_df) * train_size_ratio)
    train_data = location_labels_df.loc[0:size, location]
    test_data = location_labels_df.loc[size:, location]
    return (train_data, test_data)

In [10]:
def arima_forecast(train_data, test_data):
    history = [x for x in train_data]
    predictions = []
    model = pm.arima.auto_arima(
                              history, start_p=1, start_q=1,
                              test='kpss', max_p=8, max_q=8,
                              seasonal=False, m=1,
                              d=None, start_P=0,
                              suppress_warnings=False, trace=False)
    for t in range(len(test_data)):
        best_arima_model_fit = pm.arima.ARIMA(order=model.get_params().get("order")).fit(history)
        output = best_arima_model_fit.predict(n_periods=1)
        yhat = output[0]
        predictions.append(int(yhat))
        obs = test_data.iloc[t]
        history.append(obs)
    
    test_data = test_data.reset_index()
    return (test_data.iloc[:,1], predictions,
          history, best_arima_model_fit.fittedvalues())


train and predict for all locationIDs


In [11]:
def rmse(test_data, predictions):
  rmse = sqrt(mean_squared_error(test_data, predictions))
  return rmse

In [12]:
def mape(test_data, predictions):
  mape = mean_absolute_percentage_error(test_data, predictions)
  return mape

In [19]:
def predict_all_location(location_labels_df, column, train_size_ratio):
    all_loc_rmse = []
    all_loc_mape = []
    predicted_data_all_loc = {}
    test_data_all_loc = {}
    fitted_data_all_loc = {}
    for loc in column:
        train_data, test_data = split_data (location_labels_df, loc, train_size_ratio)
        result = arima_forecast(train_data, test_data)
        val_rmse = rmse (result[0], result[1])
        val_mape = mape (result[0], result[1])
        all_loc_rmse.append(val_rmse)
        all_loc_mape.append(val_mape)
        test_data_all_loc[loc] = result[0]
        predicted_data_all_loc[loc] = result[1]
        fitted_data_all_loc[loc] = result[3]
    
    test_data_all_loc = pd.DataFrame(test_data_all_loc)
    fitted_data_all_loc = pd.DataFrame(fitted_data_all_loc)
    predicted_data_all_loc = pd.DataFrame(predicted_data_all_loc)
    return (test_data_all_loc, predicted_data_all_loc, fitted_data_all_loc, all_loc_rmse, all_loc_mape)

In [14]:
test_data_all_loc, predicted_data_all_loc, fitted_data_all_loc, all_loc_rmse, all_loc_mape = predict_all_location(location_labels_df, column, train_size_ratio)

## evaluation

In [15]:
def evaluation(test_data_all_loc, predicted_data_all_loc):
    predicted_all_location = predicted_data_all_loc.sum(axis=1)/len(predicted_data_all_loc)
    test_all_location = test_data_all_loc.sum(axis=1)/len(test_data_all_loc)
    mape_error = mape(test_all_location,predicted_all_location)
    rmse_error = rmse(test_all_location,predicted_all_location)
    print(f'mape: {mape_error}')
    print(f'rmse: {rmse_error}')

In [16]:
evaluation(test_data_all_loc, predicted_data_all_loc)

mape: 0.06724386070445987
rmse: 79.4183008629171


# Save file

In [17]:
def save_val_predicted(dataset, path):
    labels_df = dataset.to_parquet(path, index=False)

In [18]:
save_val_predicted(predicted_data_all_loc, OUTPUT_PATH)