# 0.0 Preparation

In [26]:
# Import Pandas 
import pandas as pd

# Display floats with exactly two decimal places.
# '{:.2f}' sets the precision; the previous '{:2f}' only set a minimum field
# width of 2 and kept the default six decimals.
pd.set_option('display.float_format', '{:.2f}'.format)

# Import NumPy
import numpy as np

# Import Scipy
import scipy.cluster.hierarchy as shc
from scipy.cluster.hierarchy import dendrogram, linkage


# Import Matplotlib
import matplotlib.pyplot as plt

# Import from statsmodels
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA


# Import from scikit-learn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import AgglomerativeClustering, KMeans

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA


# Import Datetime 
from datetime import datetime as dt, timedelta


# 1.0 Data Loading, Transformation

In [5]:
# Build the raw monthly dataframe.
# Labels are "MM-YYYY" strings covering January 2022 through December 2024.
months = [
    f"{month_number:02d}-{year}"
    for year in (2022, 2023, 2024)
    for month_number in range(1, 13)
]

# Monthly tourist counts, one row per year (January -> December)
tourist_data = [
    35799, 80882, 150597, 186751, 201368, 193650, 183096, 157338, 122373, 135252, 139634, 172852,
    177860, 160772, 184970, 213736, 207512, 196960, 207696, 178589, 124491, 139008, 149625, 179200,
    172913, 174364, 191326, 182647, 205246, 183755, 159826, 121975, 145021, 162451, 162455, 142728,
]

df = pd.DataFrame({"month-year": months, "tourists": tourist_data})

# Peek at both ends of the series (head/tail default to 5 rows)
display(df.head(), df.tail())

Unnamed: 0,month-year,tourists
0,01-2022,35799
1,02-2022,80882
2,03-2022,150597
3,04-2022,186751
4,05-2022,201368


Unnamed: 0,month-year,tourists
31,08-2024,121975
32,09-2024,145021
33,10-2024,162451
34,11-2024,162455
35,12-2024,142728


In [9]:
# Split the "MM-YYYY" label into numeric month and year columns.
# The format is passed explicitly so pandas does not have to guess it, which
# avoids the format-inference warning and any day/month ambiguity.
final_df = (
    df
    .assign(date = lambda x: pd.to_datetime(x["month-year"], format="%m-%Y"))
    .assign(
        month = lambda x: x["date"].dt.month,
        year = lambda x: x["date"].dt.year
    )
    # Selecting the three output columns also discards the helper columns
    # ("month-year", "date"), so no separate drop is needed.
    [["month", "year", "tourists"]]
)

final_df

  .assign(date = lambda x: pd.to_datetime(x["month-year"]))


Unnamed: 0,month,year,tourists
0,1,2022,35799
1,2,2022,80882
2,3,2022,150597
3,4,2022,186751
4,5,2022,201368
5,6,2022,193650
6,7,2022,183096
7,8,2022,157338
8,9,2022,122373
9,10,2022,135252


In [22]:
# Goal: forecast the 2025 values

# Reshape to wide form: one row per calendar month, one column per year,
# with the tourist count as the cell value. pivot_table aggregates with the
# mean (its default, spelled out here); each (month, year) pair occurs once
# in the data built above, so the mean is just that month's count as a float.
model_df = final_df.pivot_table(
    index="month",
    columns="year",
    values="tourists",
    aggfunc="mean",
).reset_index()  # bring "month" back as a regular column

model_df

year,month,2022,2023,2024
0,1,35799.0,177860.0,172913.0
1,2,80882.0,160772.0,174364.0
2,3,150597.0,184970.0,191326.0
3,4,186751.0,213736.0,182647.0
4,5,201368.0,207512.0,205246.0
5,6,193650.0,196960.0,183755.0
6,7,183096.0,207696.0,159826.0
7,8,157338.0,178589.0,121975.0
8,9,122373.0,124491.0,145021.0
9,10,135252.0,139008.0,162451.0


In [24]:
# Forecast the 2025 value of every calendar month from that month's history
# across the available years.
#
# NOTE(review): each series has only three observations (one per year), which
# is too few to estimate an ARIMA(1, 1, 1) reliably -- statsmodels emits
# "Too few observations to estimate starting parameters". Treat the results
# as rough extrapolations, not calibrated forecasts.

# Index by month so rows are addressed by label rather than by position, and
# keep the year columns in ascending (chronological) order for the model.
yearly_history = model_df.set_index("month").sort_index(axis=1)

# Initialize dictionary to store forecasts (month -> forecasted value)
forecasts = {}

# Loop through each month and fit an ARIMA model
for month, row in yearly_history.iterrows():
    # Plain float array of this month's yearly values; a NumPy input makes
    # forecast() return an array, so positional [0] below is valid.
    ts = row.to_numpy(dtype=float)

    # Fit ARIMA model (p, d, q can be tuned)
    model = ARIMA(ts, order=(1, 1, 1))
    model_fit = model.fit()

    # Forecast 1 step ahead (the year following the last column)
    forecast = model_fit.forecast(steps=1)[0]

    # Store the forecasted value
    forecasts[month] = forecast

# Convert forecasts into a DataFrame
forecast_df = (
    pd.DataFrame(
        list(forecasts.items()),
        columns=["month", "Forecast_2025"]
    )
    # Round to the nearest whole tourist before casting; a bare astype(int)
    # truncates toward zero and biases every forecast downward.
    .assign(Forecast_2025 = lambda x: x["Forecast_2025"].round().astype(int))
)

forecast_df

  warn('Too few observations to estimate starting parameters%s.'


Unnamed: 0,month,Forecast_2025
0,1,146323
1,2,161727
2,3,192503
3,4,193003
4,5,206013
5,6,190545
6,7,167744
7,8,124315
8,9,156059
9,10,175847


In [25]:
# Show the historical yearly values next to the 2025 forecast.
# Left join on "month" keeps every row of model_df.
model_df.merge(forecast_df, how="left", on="month")

Unnamed: 0,month,2022,2023,2024,Forecast_2025
0,1,35799.0,177860.0,172913.0,146323
1,2,80882.0,160772.0,174364.0,161727
2,3,150597.0,184970.0,191326.0,192503
3,4,186751.0,213736.0,182647.0,193003
4,5,201368.0,207512.0,205246.0,206013
5,6,193650.0,196960.0,183755.0,190545
6,7,183096.0,207696.0,159826.0,167744
7,8,157338.0,178589.0,121975.0,124315
8,9,122373.0,124491.0,145021.0,156059
9,10,135252.0,139008.0,162451.0,175847
