In [None]:
import pandas as pd
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
from statsmodels.tsa.stattools import adfuller
#!pip install pmdarima --quiet
import pmdarima as pm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from pmdarima import auto_arima


In [None]:
# Load the ARI dataset with explicit dtypes, parse `truth_date` as a datetime,
# then discard the 'Unnamed: 0' column (presumably a saved index - TODO confirm)
# and renumber the rows from zero.
ari = (
    pd.read_csv(
        "data_ari.csv",
        sep=",",
        dtype={
            'location': str,
            'year_week': str,
            'value': np.float32,
            'relative_humidity_2m': np.float64,
            'temperature_2m_max': np.float64,
            'temperature_2m_min': np.float64,
        },
        parse_dates=['truth_date'],
    )
    .drop(columns=['Unnamed: 0'])
    .reset_index(drop=True)
)


In [None]:
# BE (Belgium) subset.
# Take an explicit copy: a later cell adds rolling-statistic columns to `ari_BE`,
# and assigning into a filtered view of `ari` triggers SettingWithCopyWarning.
ari_BE = ari[ari['location'] == "BE"].copy()

# Chronological hold-out split: rows dated up to and including 2023-09-01 are
# used for training, everything after that date for testing.
train = ari_BE[ari_BE['truth_date'] <= "2023-09-01"]
test = ari_BE[ari_BE['truth_date'] > "2023-09-01"]

In [None]:
# Preview the first rows of the BE hold-out split (rows dated after 2023-09-01).
test.head()

In [None]:
import matplotlib.dates as mdates

# Line plot of the full BE series (train and test periods together).
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(ari_BE['truth_date'], ari_BE['value'], label='Incidences', color='blue')
ax.set_title("Number of ARI incidences in BE by Date")
ax.set_xlabel('Date')
ax.set_ylabel('Incidences')

# One major tick per calendar year, labelled with the four-digit year.
ax.xaxis.set_major_locator(mdates.YearLocator(1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

ax.legend()
plt.show()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

# Autocorrelation of the BE series over two years of weekly lags (2 * 52 = 104).
plot_acf(x=ari_BE['value'], lags=2 * 52)
plt.show()

In [None]:
# Rolling statistics over a window of 12 consecutive observations.
# The series is treated as weekly elsewhere in this notebook, so the window is
# 12 weeks (not 12 months).
# `assign` returns a new frame instead of writing columns into `ari_BE`, which
# is a filtered subset of `ari`; this avoids pandas' SettingWithCopyWarning and
# makes the cell safe to re-run.
ari_BE = ari_BE.assign(
    rolling_avg=ari_BE["value"].rolling(window=12).mean(),
    rolling_std=ari_BE["value"].rolling(window=12).std(),
)

# Plot the original series together with its rolling mean and rolling std dev.
# The first 11 rolling values are NaN (incomplete window) and are not drawn.
plt.figure(figsize=(15, 7))
plt.plot(ari_BE["truth_date"], ari_BE["value"], color='#379BDB', label='Original Data')
plt.plot(ari_BE["truth_date"], ari_BE["rolling_avg"], color='#D22A0D', label='Rolling Mean')
plt.plot(ari_BE["truth_date"], ari_BE["rolling_std"], color='#142039', label='Rolling Std Dev')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.xlabel('Date')
plt.ylabel('Value')
plt.show()


In [None]:
# Augmented Dickey-Fuller test on the full BE series; the lag length is chosen by AIC.
print('Results of Dickey Fuller Test:')
dftest = adfuller(ari_BE['value'], autolag='AIC')

# Gather the four headline results and the critical values (dftest[4] maps a
# significance level to its threshold) into a single labelled Series.
dfoutput = pd.Series({
    'Test Statistic': dftest[0],
    'p-value': dftest[1],
    '#Lags Used': dftest[2],
    'Number of Observations Used': dftest[3],
    **{f'Critical Value ({level})': threshold
       for level, threshold in dftest[4].items()},
})

print(dfoutput)

The data is stationary, which means we can fit models without differencing the series first.

In [None]:
# Non-seasonal baseline: stepwise auto-ARIMA search on the training series.
# `seasonal` is set to False explicitly; the previous `seasonal=True` contradicted
# `m=1`, for which pmdarima documents that the search is non-seasonal.
ARIMA_model = pm.auto_arima(
    train['value'],
    start_p=0,               # search starts from p = 0 ...
    start_q=0,               # ... and q = 0
    max_p=4, max_q=4,        # upper bounds for p and q
    d=None,                  # let the search determine 'd' ...
    test='adf',              # ... using the ADF test
    m=1,                     # period of 1 -> no seasonal cycle
    seasonal=False,          # plain (non-seasonal) ARIMA
    trace=False,             # no per-candidate logging
    error_action='warn',     # report failed fits ('ignore' would silence them)
    suppress_warnings=True,
    stepwise=True,
)

In [None]:
# Render the diagnostic plots of the non-seasonal auto-ARIMA fit in a 15x12 figure.
ARIMA_model.plot_diagnostics(figsize=(15,12))
plt.show()

In [None]:
# Seasonal variant: stepwise auto-ARIMA search with a 52-period seasonal cycle.
SARIMA_model = pm.auto_arima(
    train["value"],
    # non-seasonal orders
    start_p=1, max_p=2,
    start_q=1, max_q=2,
    d=None,                   # 'd' is left to the search ...
    test='adf',               # ... which uses the ADF test
    # seasonal part
    seasonal=True,
    m=52,                     # cycle length: 52 because the data is weekly
    start_P=1,
    D=1,                      # order of seasonal differencing, fixed to 1
    # search behaviour
    stepwise=True,
    trace=False,
    error_action='ignore',    # failed candidate fits are skipped silently
    suppress_warnings=True,
)

In [None]:
# Render the diagnostic plots of the seasonal auto-ARIMA fit in a 15x12 figure.
SARIMA_model.plot_diagnostics(figsize=(15,12))
plt.show()

In [None]:
# Display the fit summary of the seasonal auto-ARIMA model selected above.
SARIMA_model.summary()

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Both inputs are converted to NumPy arrays and compared element-wise.
    Each error is divided by the corresponding actual value, so a zero in
    ``y_true`` yields an infinite or NaN result.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = (actual - predicted) / actual
    return np.abs(relative_errors).mean() * 100

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA

# Work on a date-sorted copy of the dataset. `assign` builds a new frame, so the
# shared `ari` frame is no longer modified in place (the previous `df = ari`
# was only an alias, and writing `df['truth_date']` wrote into `ari` as well).
df = (
    ari
    .assign(truth_date=pd.to_datetime(ari['truth_date']))
    .sort_values(by='truth_date')
)

# Select a single location (Czech Republic, "CZ") and index it by date.
df_single = df[df["location"] == "CZ"].set_index("truth_date")

# Chronological train-test split (first 80% of rows train, last 20% test).
# NOTE: this rebinds `train` / `test`, replacing the BE split created earlier
# in the notebook; every later cell that reads them sees the CZ data.
split_point = int(len(df_single) * 0.8)
train, test = df_single.iloc[:split_point], df_single.iloc[split_point:]

# Rolling multi-step forecast function (4 weeks per window by default)
def rolling_forecast(ARIMA_order, train, test, step=4):
    """Evaluate an ARIMA model with a rolling, multi-step-ahead forecast.

    For each window of ``step`` consecutive test rows, the model is refitted on
    all values seen so far, the window is forecast, and then the window's
    actual values are appended to the history before the next refit.

    Parameters
    ----------
    ARIMA_order : tuple
        ``(p, d, q)`` order passed to ``ARIMA``.
    train, test : pandas.DataFrame
        Frames with a ``"value"`` column; their indexes are used as the x-axis.
    step : int, default 4
        Number of periods forecast per refit.

    Returns
    -------
    float
        MAPE (in percent) of the forecasts over the whole test set.

    Raises
    ------
    ValueError
        If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")

    history = list(train["value"])  # values the model may see; grows each window
    predictions = []

    for start in range(0, len(test), step):
        # The last window may hold fewer than `step` rows; forecast only what is
        # needed instead of producing surplus values and truncating them later.
        window_actuals = test["value"].iloc[start:start + step].tolist()

        model_fit = ARIMA(history, order=ARIMA_order).fit()
        predictions.extend(model_fit.forecast(steps=len(window_actuals)))

        # Reveal the real values of this window before the next refit
        history.extend(window_actuals)

    predicted_series = pd.Series(predictions, index=test.index)
    mape = mean_absolute_percentage_error(test["value"], predicted_series)
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

    # Plot results; labels follow `step` rather than assuming 4 weeks
    plt.figure(figsize=(15, 7))
    plt.plot(train.index, train["value"], color='blue', label='Training Data')
    plt.plot(test.index, test["value"], color='orange', label='Actual Test Data')
    plt.plot(predicted_series, color='green', linestyle='dashed',
             label=f'Rolling {step}-Week Forecast')
    plt.legend(loc='upper left')
    plt.title(f"Rolling ARIMA - {step}-Week Forecasting")
    plt.xlabel("Date")
    plt.ylabel("Value")
    plt.show()
    return mape
# Evaluate ARIMA(2, 1, 2) with forecasts issued four weeks at a time
mape_arima = rolling_forecast(ARIMA_order=(2, 1, 2), train=train, test=test, step=4)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
import numpy as np

# Columns supplied to SARIMAX as exogenous regressors
exog_vars = [
    'relative_humidity_2m',
    'temperature_2m_max',
    'temperature_2m_min',
    'covid',
]

# Mean Absolute Percentage Error (MAPE) helper.
# NOTE(review): this redefines the function of the same name from an earlier
# cell with the same formula; keep the two in sync or drop one of them.
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the MAPE between ``y_true`` and ``y_pred`` as a percentage.

    Inputs are converted to NumPy arrays. Errors are taken relative to
    ``y_true``, so zeros in ``y_true`` give an infinite or NaN result.
    """
    actual, forecast = (np.array(values) for values in (y_true, y_pred))
    absolute_ratio = np.abs(np.divide(np.subtract(actual, forecast), actual))
    return np.mean(absolute_ratio) * 100

# Rolling multi-step forecast function with SARIMAX (4 weeks per window by default)
def rolling_forecast_sarimax(order, train, test, exog_vars, step=4):
    """Evaluate a SARIMAX model with exogenous regressors via rolling forecasts.

    For each window of ``step`` consecutive test rows, the model is refitted on
    all target values and exogenous rows seen so far, the window is forecast
    using the test set's exogenous values, and the window's actual values are
    then appended to the history.

    Parameters
    ----------
    order : tuple
        ``(p, d, q)`` order passed to ``SARIMAX``.
    train, test : pandas.DataFrame
        Frames holding a ``"value"`` column and every column in ``exog_vars``.
    exog_vars : list of str
        Names of the exogenous regressor columns.
    step : int, default 4
        Number of periods forecast per refit.

    Returns
    -------
    tuple
        ``(model, predicted_series, mape)``. ``model`` is the ``SARIMAX`` model
        object built for the last window (not the fitted results object), or
        ``None`` when ``test`` is empty. ``predicted_series`` is indexed like
        ``test``; ``mape`` is in percent.

    Raises
    ------
    ValueError
        If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")

    history = list(train["value"])  # values the model may see; grows each window
    # Build the full exogenous matrix once and slice it per window, instead of
    # growing a DataFrame with pd.concat on every iteration.
    exog_all = pd.concat([train[exog_vars], test[exog_vars]])
    predictions = []
    model = None  # stays None when `test` is empty

    for start in range(0, len(test), step):
        # Exogenous rows aligned with the values currently in `history`
        exog_train = exog_all.iloc[:len(history)]

        model = SARIMAX(history, order=order, exog=exog_train)
        model_fit = model.fit(disp=False)

        # The last window may hold fewer than `step` rows. Forecast exactly the
        # rows that exist rather than padding the exogenous data with repeated
        # rows and discarding the surplus predictions afterwards.
        exog_forecast = test[exog_vars].iloc[start:start + step]
        forecast = model_fit.forecast(steps=len(exog_forecast), exog=exog_forecast)
        predictions.extend(forecast)

        # Reveal the real values of this window before the next refit
        history.extend(test["value"].iloc[start:start + step].tolist())

    predicted_series = pd.Series(predictions, index=test.index)

    # Calculate MAPE
    mape = mean_absolute_percentage_error(test["value"], predicted_series)
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

    # Plot results; labels follow `step` rather than assuming 4 weeks
    plt.figure(figsize=(15, 7))
    plt.plot(train.index, train["value"], color='blue', label='Training Data')
    plt.plot(test.index, test["value"], color='orange', label='Actual Test Data')
    plt.plot(predicted_series, color='green', linestyle='dashed',
             label=f'Rolling {step}-Week Forecast')
    plt.legend(loc='upper left')
    plt.title(f"Rolling SARIMAX - {step}-Week Forecasting")
    plt.xlabel("Date")
    plt.ylabel("Value")
    plt.show()
    return model, predicted_series, mape
# Evaluate SARIMAX(2, 1, 2) with the exogenous regressors, four weeks per window
sarimax_mod, prediction_sarimax, mape_sarimax = rolling_forecast_sarimax(
    order=(2, 1, 2),
    train=train,
    test=test,
    exog_vars=exog_vars,
    step=4,
)


In [None]:
# Display the MAPE (in percent) returned by the SARIMAX rolling forecast.
mape_sarimax

In [None]:
import joblib

In [None]:
# Rebuild the BE training split explicitly. By this point in the notebook
# `train` has been rebound to the CZ series by the rolling-forecast cells,
# while the model file written below is named for BE.
be_rows = ari[ari['location'] == "BE"]
train_be = be_rows[be_rows['truth_date'] <= "2023-09-01"]
exog_cols = ['relative_humidity_2m', 'temperature_2m_max', 'temperature_2m_min', 'covid']

# Seasonal auto-ARIMA search with exogenous regressors.
# - `X` is pmdarima's keyword for exogenous data (the old `exogenous` name was
#   deprecated and later removed).
# - `n_jobs` is omitted: a stepwise search is not run in parallel.
ARIMA_model = pm.auto_arima(train_be['value'],
                      X=train_be[exog_cols],
                      test='adf', # use the ADF test to find the optimal 'd'
                      max_p=3, max_q=3, # maximum p and q
                      m=52, # 52-period seasonal cycle (weekly data)
                      d=None, # let the search determine 'd'
                      seasonal=True, # seasonal model
                      trace=True, # log every candidate model
                      error_action='warn', # report failed fits ('ignore' silences these)
                      suppress_warnings=True,
                      stepwise=True)
model_path = "arima_model_be_v2.pkl"

# Save the model
joblib.dump(ARIMA_model, model_path)
print(f"Model saved at: {model_path}")

In [None]:
# Display the fit summary of the auto-ARIMA model selected in the previous cell.
ARIMA_model.summary()

In [None]:
# Render the diagnostic plots of the exogenous seasonal auto-ARIMA fit in a 15x12 figure.
ARIMA_model.plot_diagnostics(figsize=(15,12))
plt.show()