### Analysis of pmdarima (Python's auto\_arima implementation) performance with COVID-19 data from Brazil


In [1]:
import pmdarima as pmd
import numpy as np
import pandas as pd

In [2]:
# CSV with the COVID-19 records; the path is relative to the notebook's working directory.
filename = 'covid.csv'
# low_memory=False asks pandas to parse the file in one pass instead of in chunks,
# which avoids mixed-dtype inference on columns (see the pandas.read_csv documentation).
# The cells below read the 'regiao', 'estado', 'codmun', 'data' and 'casosAcumulado' columns.
full_df = pd.read_csv(filename, low_memory=False)

In [3]:
def filter_by_state(df, state, filter_zero=False):
    """
    Select the state-level rows of a single state, one row per date

    A row is kept when its "codmun" (county code) is missing and its
    "estado" (state) equals the requested state.

    Args:
        df (pandas.DataFrame): Frame with "codmun", "estado", "data" and
            "casosAcumulado" columns
        state (str): State abbreviation to keep. Example: SP
        filter_zero (bool): When True, rows whose "casosAcumulado"
            (cumulative cases) is 0 are discarded as well

    Returns:
        New dataframe holding the first matching row of every "data" (date)
    """
    without_county = df['codmun'].isna()
    same_state = df['estado'] == state
    rows = df.loc[without_county & same_state]

    if filter_zero:
        has_cases = rows['casosAcumulado'] != 0
        rows = rows.loc[has_cases]

    # Only the first row of each date survives
    return rows.drop_duplicates(subset='data')


def filter_brazil(df, filter_zero=False):
    """
    Select the country-level rows, one row per date

    A row is kept when its "regiao" (region) equals 'Brasil'.

    Args:
        df (pandas.DataFrame): Frame with "regiao", "data" and
            "casosAcumulado" columns
        filter_zero (bool): When True, rows whose "casosAcumulado"
            (cumulative cases) is 0 are discarded as well

    Returns:
        New dataframe holding the first matching row of every "data" (date)
    """
    is_country_row = df['regiao'] == 'Brasil'
    rows = df.loc[is_country_row]

    if filter_zero:
        has_cases = rows['casosAcumulado'] != 0
        rows = rows.loc[has_cases]

    # Only the first row of each date survives
    return rows.drop_duplicates(subset='data')

In [4]:
def predict(df, start_date, end_date, pred_days):
    """
    Predicts COVID-19 cumulative cases using auto_arima

    The cumulative cases between start_date and end_date (both inclusive) are
    split so that the last pred_days observations form the hold-out set.
    auto_arima is fitted on the remaining observations and its forecast is
    scored against the hold-out set with sMAPE.

    The input dataframe is left untouched, so the same frame can be passed to
    this function more than once.

    Args:
        df (pandas.DataFrame): Needs a "data" column with dates formatted as
            year-month-day and a "casosAcumulado" (cumulative cases) column
        start_date (str): First day of the series. Example: 2020-03-13
        end_date (str): Last day of the series. Example: 2020-04-19
        pred_days (int): Number of trailing days held out and forecasted

    Returns:
        Dict with the following keys: prediction, smape and arima_order

    Raises:
        ValueError: If pred_days is smaller than 1 or leaves no observation
            to train on
    """
    # Build a date-indexed copy: converting and indexing "data" in place would
    # strip the column from the caller's frame and break a second call
    indexed = df.assign(
        data=pd.to_datetime(df['data'], format='%Y-%m-%d')
    ).set_index('data')
    cases_array = indexed.loc[start_date:end_date, 'casosAcumulado'].values

    # With pred_days == 0 the slices below would silently give an empty
    # training set ([:-0]) and use the whole series as test set ([-0:])
    if pred_days < 1 or pred_days >= len(cases_array):
        raise ValueError(
            'pred_days must be at least 1 and smaller than the %d available '
            'observations, got %s' % (len(cases_array), pred_days)
        )

    train, test = cases_array[:-pred_days], cases_array[-pred_days:]
    fit = pmd.auto_arima(train.astype('float32'))
    prediction = fit.predict(n_periods=pred_days)
    res = pmd.metrics.smape(test, prediction)
    return {
        'prediction': prediction,
        'smape': res,
        'arima_order': fit.order
    }

### Short-term forecasting COVID-19 cumulative confirmed cases: Perspectives for Brazil
https://www.sciencedirect.com/science/article/pii/S0960077920302538

In [5]:
from collections import namedtuple

# One record per evaluated state: its abbreviation plus the first and last day
# (year-month-day) of the series handed to predict()
StateData = namedtuple('StateData', 'state start_date end_date')

# NOTE(review): presumably these are the per-state windows of the article linked
# in the heading above -- confirm against the paper
states_data = [
    StateData(state, start_date, end_date)
    for state, start_date, end_date in (
        ('AM', '2020-03-13', '2020-04-19'),
        ('BA', '2020-03-06', '2020-04-19'),
        ('CE', '2020-03-16', '2020-04-19'),
        ('MG', '2020-03-08', '2020-04-19'),
        ('PR', '2020-03-12', '2020-04-18'),
        ('RJ', '2020-03-05', '2020-04-19'),
        ('RN', '2020-03-12', '2020-04-18'),
        ('RS', '2020-03-10', '2020-04-19'),
        ('SC', '2020-03-12', '2020-04-19'),
        ('SP', '2020-02-25', '2020-04-19'),
    )
]

In [6]:
print('Results for 1 day')
# Hold out the last day of every state's window and score the forecast for it
for state_code, window_start, window_end in states_data:
    forecast = predict(
        filter_by_state(full_df, state_code),
        window_start,
        window_end,
        pred_days=1,
    )
    print('%s - ARIMA%s - sMAPE: %s' % (state_code, forecast['arima_order'], forecast['smape']))

Results for 1 day
AM - ARIMA(1, 2, 0) - sMAPE: 2.853771123088597
BA - ARIMA(0, 2, 1) - sMAPE: 7.024365409817703
CE - ARIMA(1, 2, 0) - sMAPE: 3.6361667360872687
MG - ARIMA(0, 2, 3) - sMAPE: 2.873270035554388
PR - ARIMA(0, 2, 1) - sMAPE: 4.088562098078861
RJ - ARIMA(2, 2, 2) - sMAPE: 1.7162745845322167
RN - ARIMA(0, 2, 1) - sMAPE: 4.347176624348275
RS - ARIMA(0, 2, 3) - sMAPE: 1.0041421940773971
SC - ARIMA(0, 2, 1) - sMAPE: 2.6997299797208862
SP - ARIMA(0, 2, 1) - sMAPE: 3.660998626710932


In [7]:
print('Results for 3 days')
# Hold out the last 3 days of every state's window and score the forecast for them
for state_code, window_start, window_end in states_data:
    forecast = predict(
        filter_by_state(full_df, state_code),
        window_start,
        window_end,
        pred_days=3,
    )
    print('%s - ARIMA%s - sMAPE: %s' % (state_code, forecast['arima_order'], forecast['smape']))

Results for 3 days
AM - ARIMA(1, 2, 0) - sMAPE: 1.3183252603310427
BA - ARIMA(2, 2, 0) - sMAPE: 2.203271727286451
CE - ARIMA(2, 2, 0) - sMAPE: 6.487606123636027
MG - ARIMA(0, 2, 2) - sMAPE: 4.4317986104711125
PR - ARIMA(0, 2, 1) - sMAPE: 2.91911589673431
RJ - ARIMA(1, 2, 2) - sMAPE: 1.5306286737158497
RN - ARIMA(1, 2, 1) - sMAPE: 7.069237053244531
RS - ARIMA(0, 2, 3) - sMAPE: 0.5155951693431876
SC - ARIMA(0, 2, 1) - sMAPE: 1.1454918156355303
SP - ARIMA(2, 2, 1) - sMAPE: 10.139626416451057


In [8]:
print('Predictions for 6 days')
# Hold out the last 6 days of every state's window and score the forecast for them
for state_code, window_start, window_end in states_data:
    forecast = predict(
        filter_by_state(full_df, state_code),
        window_start,
        window_end,
        pred_days=6,
    )
    print('%s - ARIMA%s - sMAPE: %s' % (state_code, forecast['arima_order'], forecast['smape']))

Predictions for 6 days
AM - ARIMA(1, 2, 0) - sMAPE: 7.092452040482091
BA - ARIMA(2, 2, 2) - sMAPE: 11.56505517325585
CE - ARIMA(0, 2, 2) - sMAPE: 17.760496276764385
MG - ARIMA(2, 2, 0) - sMAPE: 6.453437760618954
PR - ARIMA(0, 2, 1) - sMAPE: 8.34343040734601
RJ - ARIMA(1, 2, 3) - sMAPE: 3.03632740534907
RN - ARIMA(1, 1, 0) - sMAPE: 19.62120992742239
RS - ARIMA(2, 2, 1) - sMAPE: 11.05747133723962
SC - ARIMA(0, 2, 1) - sMAPE: 3.4715926249892965
SP - ARIMA(1, 2, 1) - sMAPE: 19.753858814467925


### Short-term forecasting of daily COVID-19 cases in Brazil by using the Holt’s model

https://www.scielo.br/scielo.php?script=sci_arttext&pid=S0037-86822020000100643

In [9]:
# States evaluated in this section, all over the same date window
states = ['SP', 'MG', 'RJ']
start_date = '2020-01-01'
end_date = '2020-05-03'

print('Predictions for 8 days')

# Results for Brazil (country-level rows, zero-case rows kept)
br_df = filter_brazil(full_df)
res = predict(br_df, start_date, end_date, pred_days=8)
print('BR - ARIMA%s - sMAPE: %s' % (res['arima_order'], res['smape']))

# Results per state; filter_zero=True drops the rows whose cumulative cases are 0
for state in states:
    state_res = predict(
        filter_by_state(full_df, state, filter_zero=True),
        start_date,
        end_date,
        pred_days=8,
    )
    print('%s - ARIMA%s - sMAPE: %s' % (state, state_res['arima_order'], state_res['smape']))

Predictions for 8 days
BR - ARIMA(1, 2, 0) - sMAPE: 1.6804520500749773
SP - ARIMA(1, 2, 1) - sMAPE: 5.623936162071448
MG - ARIMA(0, 2, 3) - sMAPE: 3.6264291829236317
RJ - ARIMA(2, 2, 3) - sMAPE: 8.321438703701347


##### Calculating sMAPE from the values given in the article

In [10]:
# Observed and forecasted cumulative cases for the 8 evaluated days, one pair of
# lists per series, as given in the article
brazil_real = [61888, 66501, 71886, 78162, 85380, 91589, 96559, 101147]
brazil_forecasted = [63598.77, 68898.82, 74198.86, 79498.91, 84798.95, 90099.00, 95399.05, 100699.09]

SP_real = [20715, 21696, 24041, 26158, 28698, 30374, 31174, 31772]
SP_forecasted = [21288.91, 22573.96, 23859.01, 25144.06, 26429.11, 27714.16, 28999.21, 30284.26]

MG_real = [1548, 1586, 1649, 1758, 1827, 1935, 2023, 2118]
MG_forecasted = [1537.80, 1600.34, 1662.89, 1725.44, 1787.99, 1850.54, 1913.09, 1975.64]

RJ_real = [7111, 7944, 8504, 8869, 9453, 10166, 10546, 11139]
RJ_forecasted = [7169.33, 7570.99, 7972.66, 8374.33, 8775.99, 9177.66, 9579.32, 9980.99]

# Score every series with the same sMAPE implementation used by predict()
for label, real, forecasted in (
    ('sMAPE for BR:', brazil_real, brazil_forecasted),
    ('sMAPE for SP:', SP_real, SP_forecasted),
    ('sMAPE for MG:', MG_real, MG_forecasted),
    ('sMAPE for RJ:', RJ_real, RJ_forecasted),
):
    print(label, pmd.metrics.smape(real, forecasted))

sMAPE for BR: 1.888280586913778
sMAPE for SP: 5.1030513515506275
sMAPE for MG: 2.928723180486233
sMAPE for RJ: 7.003955308533456
