In [1]:
import pandas as pd
import matplotlib.pylab as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'statsmodels'

In [None]:
# Load dataset
# Read the monthly tourist-arrivals CSV into `df` and preview its first rows.
csv_path = '/dataset/2015-2024-monthly-tourist-arrivals-sl-csv.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Handle missing values: keep only the 'Arrivals' column, then drop every row
# in which it is missing (rows are removed here, not filled).
arrivals_only = df[['Arrivals']]
df_v1 = arrivals_only.dropna()

In [None]:
# Plot the time series data to visualize trends
# Draw the cleaned arrivals frame on an explicit Axes object.
fig, ax = plt.subplots()
ax.plot(df_v1)
ax.set_ylabel('Total Number of Tourists Arrivals')
ax.grid()
fig.tight_layout()
plt.show()

# Function to check stationarity with ADF test
def test_stationarity(timeseries):
    dftest = adfuller(timeseries)
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value

    print(dfoutput)
    
    critical_value = dftest[4]['5%']
    test_statistic = dftest[0]
    pvalue = dftest[1]
    
    if pvalue < 0.05 and test_statistic < critical_value:
        print("The series is stationary.")
        return True
    else:
        print("The series is not stationary.")
        return False

# Test for stationarity and apply differencing if necessary.
# Starting from the cleaned 'Arrivals' column, difference the series until
# test_stationarity() reports it as stationary. The number of differencing
# passes is kept in `d`, which is later used as the non-seasonal differencing
# order of the SARIMA model.
MAX_DIFFERENCING_ORDER = 3  # safety cap; higher orders are rarely meaningful

time_series_diff = df_v1['Arrivals']
d = 0
while not test_stationarity(time_series_diff):
    if d >= MAX_DIFFERENCING_ORDER:
        # Without a cap, a series that never passes the check would be
        # differenced over and over (losing one observation per pass) until
        # the ADF test itself can no longer be computed.
        raise RuntimeError(
            f'Series is still not stationary after {d} rounds of differencing; '
            'inspect the data (trend, seasonality, outliers) instead of '
            'differencing further.'
        )
    # Each pass leaves a leading NaN, which is dropped before re-testing.
    time_series_diff = time_series_diff.diff().dropna()
    d += 1

print(f'Differencing order: {d}')

# Plot ACF and PACF to determine p and q
# Both correlograms are drawn from the differenced series with 20 lags, ACF
# first and PACF second, each shown as its own figure.
for draw_correlogram in (plot_acf, plot_pacf):
    draw_correlogram(time_series_diff, lags=20)
    plt.show()

# Split the dataset into train and test sets
# Chronological split: the first 80% of rows are used for fitting, the
# remaining rows are held out.
train_size = int(0.8 * len(df_v1))
train = df_v1[:train_size]
test = df_v1[train_size:]

# Set SARIMA parameters based on ACF/PACF and seasonality
# Non-seasonal AR / MA orders -- modify based on your ACF/PACF plots.
p = 1
q = 1
# Seasonal orders and period, assuming yearly seasonality (m=12).
P = 1
D = 0
Q = 1
m = 12

# Build and fit the SARIMA model
# `d` is the differencing order found by the stationarity loop.
order = (p, d, q)
seasonal_order = (P, D, Q, m)
model = SARIMAX(train, order=order, seasonal_order=seasonal_order)
model_fit = model.fit(disp=False)

# Forecast for the test set
# Forecast as many steps ahead as there are held-out rows, then keep the
# point forecasts and their interval bounds.
n_test_steps = len(test)
forecast = model_fit.get_forecast(steps=n_test_steps)
confidence_intervals = forecast.conf_int()
predicted_mean = forecast.predicted_mean

# Plot actual vs predicted with confidence intervals
# Training data, held-out actuals and forecasts share one Axes; the shaded
# band spans the first and second columns of the interval frame.
lower_bound = confidence_intervals.iloc[:, 0]
upper_bound = confidence_intervals.iloc[:, 1]

fig, ax = plt.subplots()
ax.plot(train.index, train, label='Training Data')
ax.plot(test.index, test, label='Actual Test Data', color='red')
ax.plot(test.index, predicted_mean, label='Predicted Test Data', color='green')
ax.fill_between(test.index, lower_bound, upper_bound, color='gray', alpha=0.2)

ax.set_ylabel('Total Number of Tourists Arrivals')
ax.legend()
ax.grid()
fig.tight_layout()
plt.show()

# Print summary of the model
# Text report of the fitted SARIMA model.
fit_summary = model_fit.summary()
print(fit_summary)
