In [None]:
import pandas as pd

# Read the tourist-arrivals CSV into a DataFrame.
# NOTE(review): the path is absolute; presumably it points at a mounted
# dataset directory — confirm before running elsewhere.
df = pd.read_csv(
    filepath_or_buffer='/dataset/2015-2024-monthly-tourist-arrivals-sl-csv.csv',
)
# Print the column names, dtypes and non-null counts.
df.info()

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Remove the column at position 3 (the fourth one), mutating `df` itself.
# NOTE(review): the drop is positional and in place, so running this cell a
# second time removes whichever column then sits at position 3.
df.drop(columns=[df.columns[3]], inplace=True)
df.head(n=5)

In [None]:
from datetime import datetime

# English month name -> calendar month number, used to turn the textual
# 'Month' column into a real date.
month_map = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}

# Translate month names in one vectorised pass instead of row by row.
month_numbers = df['Month'].map(month_map)
unknown_months = df.loc[month_numbers.isna(), 'Month'].unique()
if len(unknown_months) > 0:
    # Same failure type as a direct dictionary lookup on an unmapped name.
    raise KeyError(f"Unrecognised month name(s): {list(unknown_months)}")

# Assemble a first-of-month timestamp for every row from year/month/day parts.
df['Date'] = pd.to_datetime(
    pd.DataFrame({'year': df['Year'], 'month': month_numbers, 'day': 1})
)

df_v1 = df[['Date', 'Arrivals']].copy()

# Move February rows to the last day of the month (28th, or 29th in leap
# years). MonthEnd(0) rolls a date forward to its month end, so the leap-year
# rule does not have to be spelled out by hand.
# NOTE(review): only February is shifted; every other month stays on day 1,
# which leaves the index irregularly spaced — confirm this is intended.
is_february = df_v1['Date'].dt.month == 2
df_v1.loc[is_february, 'Date'] = (
    df_v1.loc[is_february, 'Date'] + pd.offsets.MonthEnd(0)
)

df_v1 = df_v1.set_index('Date')

df_v1.head()


In [None]:
# True if any cell of the date-indexed frame is missing.
df_v1.isna().values.any()

In [None]:
# Summary statistics for the numeric columns of the date-indexed frame.
df_v1.describe()

In [None]:
# Keep the arrivals as a one-column DataFrame (list selector, not a Series).
time_series = df_v1.loc[:, ['Arrivals']]
# Show the last five rows.
time_series.tail(n=5)

In [None]:
import matplotlib.pylab as plt
# Line chart of the arrivals series, drawn on explicit figure/axes objects.
fig, ax = plt.subplots()
ax.plot(time_series)
ax.set_ylabel('Total Number of Tourists Arrivals')
ax.grid()
fig.tight_layout()
plt.show()

In [9]:
from statsmodels.tsa.stattools import adfuller

def test_stationarity(timeseries):

    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value

    critical_value = dftest[4]['5%']
    test_statistic = dftest[0]
    alpha = 1e-3
    pvalue = dftest[1]
    if pvalue < alpha and test_statistic < critical_value:
        print("X is stationary")
        return True
    else:
        print("X is not stationary")
        return False

In [None]:
# Difference the arrivals repeatedly until the stationarity check stops
# returning False; `d` counts how many differences were applied.
d = 0
time_series_diff = time_series['Arrivals']
while True:
    if test_stationarity(time_series_diff) is not False:
        break
    # First difference; the leading NaN it creates is dropped.
    time_series_diff = time_series_diff.diff().dropna()
    d += 1

In [None]:
# Display the number of differences counted by the stationarity loop.
d

In [None]:
import statsmodels.api as sm

# Split the arrivals into cycle and trend with the Hodrick-Prescott filter.
# hpfilter's default lamb=1600 is the value suggested for quarterly data; the
# statsmodels documentation suggests 129600 for monthly observations, which is
# what this dataset holds.
ts_cycle, ts_trend = sm.tsa.filters.hpfilter(time_series['Arrivals'], lamb=129600)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Autocorrelation of the trend component over the first 12 lags.
plot_acf(ts_trend, lags=12)
plt.show()
# Partial autocorrelation of the trend component over the first 12 lags.
plot_pacf(ts_trend, lags=12)
plt.show()

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Model orders: p (autoregressive) and q (moving-average) are fixed by hand;
# d is the differencing count produced by the stationarity loop.
p = 9
q = 1
model = SARIMAX(time_series, order=(p, d, q))
# The optimiser is chosen with `method`; a `solver` keyword is not a fit()
# argument and would be swallowed by **kwargs, leaving the default in use.
model_fit = model.fit(disp=1, method='powell')

# statsmodels treats integer start/end as zero-based positions, so start=1
# skips the first observation and end=len(time_series) reaches one step past
# the last in-sample point.
fcast = model_fit.get_prediction(start=1, end=len(time_series))
ts_p = fcast.predicted_mean
ts_ci = fcast.conf_int()

# Overlay the predictions on the observed series.
plt.plot(ts_p, label='prediction')
plt.plot(time_series, color='red', label='actual')
# Shade the confidence band, leaving out its first row.
plt.fill_between(ts_ci.index[1:],
                 ts_ci.iloc[1:, 0],
                 ts_ci.iloc[1:, 1], color='k', alpha=.2)

plt.ylabel('Total Number of Tourists Arrivals')
plt.legend()
plt.tight_layout()
plt.grid()
plt.show()