In [74]:

import pandas as pd
import numpy as np
from pandas import read_csv
from pmdarima.arima import auto_arima
from datetime import datetime
import matplotlib.pyplot as plt
from pmdarima.arima import ADFTest

In [75]:
values = read_csv("../../Time_series_data.csv")

In [76]:
print(values.shape[0])

105


In [77]:
x = values['Month'][values.shape[0]-1]
print(x)

2021-09


In [78]:
values['Month'] = pd.to_datetime(values['Month'], errors = 'coerce')
values.set_index('Month', inplace=True)

In [79]:
print(values)

             Sales
Month             
2013-01-01  2815.0
2013-02-01  2672.0
2013-03-01  2755.0
2013-04-01     NaN
2013-05-01     NaN
...            ...
2021-05-01  4618.0
2021-06-01  5312.0
2021-07-01  4298.0
2021-08-01  1413.0
2021-09-01  5877.0

[105 rows x 1 columns]


In [80]:
values['Sales'].fillna(values['Sales'].mean(), inplace=True)

In [81]:
values

Unnamed: 0_level_0,Sales
Month,Unnamed: 1_level_1
2013-01-01,2815.000000
2013-02-01,2672.000000
2013-03-01,2755.000000
2013-04-01,4798.582524
2013-05-01,4798.582524
...,...
2021-05-01,4618.000000
2021-06-01,5312.000000
2021-07-01,4298.000000
2021-08-01,1413.000000


In [26]:
missing_values_count = values.isna().sum().sum()
if missing_values_count != 0:
    values = values[~(values.isna().any(axis=1))]
    print(values)

             Sales
Month             
2013-01-01  2815.0
2013-02-01  2672.0
2013-03-01  2755.0
2013-06-01  3036.0
2013-07-01  2282.0
...            ...
2021-05-01  4618.0
2021-06-01  5312.0
2021-07-01  4298.0
2021-08-01  1413.0
2021-09-01  5877.0

[103 rows x 1 columns]


In [27]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(values, test_size=0.2, shuffle=False)

arima_model = auto_arima(train, start_p=0, d=1, start_q=0,
                        max_p=5, max_d=5, max_q=5, start_P=0,
                        D=1, start_Q=0, max_P=5, max_D=5,
                        max_Q=5, m=12, seasonal=True,
                        error_action='warn', trace=True,
                        supress_warnings=True, stepwise=True,
                        random_state=20, n_fits=50)

Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=1152.595, Time=0.04 sec


 ARIMA(1,1,0)(1,1,0)[12]             : AIC=1139.407, Time=0.37 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=1121.789, Time=0.27 sec
 ARIMA(0,1,1)(0,1,0)[12]             : AIC=1120.064, Time=0.08 sec
 ARIMA(0,1,1)(1,1,0)[12]             : AIC=1121.650, Time=0.22 sec
 ARIMA(0,1,1)(1,1,1)[12]             : AIC=1122.862, Time=0.51 sec
 ARIMA(1,1,1)(0,1,0)[12]             : AIC=1121.748, Time=0.13 sec
 ARIMA(0,1,2)(0,1,0)[12]             : AIC=1121.705, Time=0.14 sec
 ARIMA(1,1,0)(0,1,0)[12]             : AIC=1140.055, Time=0.05 sec
 ARIMA(1,1,2)(0,1,0)[12]             : AIC=1121.772, Time=0.35 sec
 ARIMA(0,1,1)(0,1,0)[12] intercept   : AIC=inf, Time=0.12 sec

Best model:  ARIMA(0,1,1)(0,1,0)[12]          
Total fit time: 2.292 seconds


In [32]:
test.index

DatetimeIndex(['2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01',
               '2020-05-01', '2020-06-01', '2020-07-01', '2020-08-01',
               '2020-09-01', '2020-10-01', '2020-11-01', '2020-12-01',
               '2021-01-01', '2021-02-01', '2021-03-01', '2021-04-01',
               '2021-05-01', '2021-06-01', '2021-07-01', '2021-08-01',
               '2021-09-01'],
              dtype='datetime64[ns]', name='Month', freq=None)

In [28]:
prediction = pd.DataFrame(
    arima_model.predict(n_periods=30),index=test.index )
prediction.columns = ['predicted_values']
# prediction.reset_index(inplace=True)



In [29]:
print(prediction)

            predicted_values
Month                       
2020-01-01               NaN
2020-02-01               NaN
2020-03-01               NaN
2020-04-01               NaN
2020-05-01               NaN
2020-06-01               NaN
2020-07-01               NaN
2020-08-01               NaN
2020-09-01               NaN
2020-10-01               NaN
2020-11-01               NaN
2020-12-01               NaN
2021-01-01               NaN
2021-02-01               NaN
2021-03-01               NaN
2021-04-01               NaN
2021-05-01               NaN
2021-06-01               NaN
2021-07-01               NaN
2021-08-01               NaN
2021-09-01               NaN


In [30]:
index_future_dates = pd.date_range(
    start=x, end='2023-01-1', freq='MS')

In [31]:
prediction_arima = pd.DataFrame(
    arima_model.predict(n_periods=26), index = index_future_dates)
prediction_arima.columns = ['predicted_values']  
print(prediction_arima)

            predicted_values
2021-09-01               NaN
2021-10-01               NaN
2021-11-01               NaN
2021-12-01               NaN
2022-01-01               NaN
2022-02-01               NaN
2022-03-01               NaN
2022-04-01               NaN
2022-05-01               NaN
2022-06-01               NaN
2022-07-01               NaN
2022-08-01               NaN
2022-09-01               NaN
2022-10-01               NaN
2022-11-01               NaN
2022-12-01               NaN
2023-01-01               NaN


