Implementing and Managing a Cached Function for Optimised Performance

In [3]:
from functools import lru_cache

@lru_cache(maxsize=128)
def cached_function(args):
    """Placeholder for a computation whose results are memoised.

    The ``@lru_cache`` decorator keeps up to 128 results (``maxsize=128``).
    Calling the function again with the same argument returns the stored
    result instead of recomputing it, which avoids redundant work for
    repeated inputs.

    Args:
        args: The input used as the cache key; ``lru_cache`` requires it to
            be hashable.

    Returns:
        None. The real implementation still has to be inserted here.
    """
    return None

# Discard every result currently memoised for 'cached_function'.
# Do this when cached results are no longer valid or when memory has to be freed;
# the next call for any argument is computed again and re-cached.
cached_function.cache_clear()


Implementation of a segmented approach for forecasting asylum applications using ML models

In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import random
from sklearn.preprocessing import RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import explained_variance_score, mean_squared_error, mean_absolute_error, median_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LSTM, Attention, Input, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
import matplotlib.pyplot as plt
import matplotlib

# Render every figure with Times New Roman
matplotlib.rcParams.update({'font.family': 'Times New Roman'})

# Seed NumPy, TensorFlow and Python's own generator with one shared value
random_seed = 42
for seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
    seed_fn(random_seed)

# Keras-level seeding plus deterministic TensorFlow ops, so repeated runs match
tf.keras.utils.set_random_seed(random_seed)
tf.config.experimental.enable_op_determinism()

# Folder that receives the saved graphs and result files (created if missing)
output_dir = "Segmented Approach Outputs"
os.makedirs(output_dir, exist_ok=True)

# Read the raw panel, parse 'year_month' as datetimes and order rows by country and month
data = (
    pd.read_csv("final_thesis_data.csv")
    .assign(year_month=lambda frame: pd.to_datetime(frame['year_month']))
    .sort_values(by=['country', 'year_month'])
)

# Collapse to one row per country / month / sex / age group by summing all other columns
# NOTE(review): index-like columns (e.g. 'rule_of_law') are summed as well - confirm that
# each key combination occurs only once in the input, otherwise those values are inflated.
data_agg = (
    data
    .groupby(['country', 'year_month', 'sex', 'age_group'])
    .sum()
    .reset_index()
)

# Source column -> list of lags (in rows, i.e. months within a group) to derive from it.
# A lag of 0 yields an unshifted copy stored under the '<column>_lag_0' name.
variables_to_lag = {
    "illegal_border_crossings": [3],
    "push_factor_index": [3],
    "push_factor_index_high_level": [2],
    "deaths_civilians": [12],
    "gdp_per_capita_current_usd": [12],
    "gdp_per_capita_growth": [12],
    "regime_end_type": [0],
    "state_fiscal_source_revenue": [0],
    "state_authority_over_territory": [0],
    "political_polarisation": [0],
    "political_violence": [0],
    "domestic_autonomy": [0],
    "rule_of_law": [0],
}

# Shift each source column inside its (country, sex, age_group) series and
# store the result as '<column>_lag_<n>'
for var, lags in variables_to_lag.items():
    series_by_group = data_agg.groupby(['country', 'sex', 'age_group'])[var]
    for lag in lags:
        data_agg[f"{var}_lag_{lag}"] = series_by_group.shift(lag)

# Define rolling window sizes
rolling_windows = [3, 6, 12, 24]

# Rolling mean and standard deviation of PAST asylum applications.
# 'asy_applications' is also the forecast target, so every series is shifted by one
# month inside its (country, sex, age_group) group before the window is applied.
# Without the shift the feature for month t would contain the value being predicted
# for month t (target leakage). The first month of each group has no history and is
# left as NaN for the imputation step that follows.
for window in rolling_windows:
    data_agg[f'asy_applications_rolling_mean_{window}'] = data_agg.groupby(['country', 'sex', 'age_group'])['asy_applications'].transform(lambda x: x.shift(1).rolling(window=window, min_periods=1).mean())
    data_agg[f'asy_applications_rolling_sd_{window}'] = data_agg.groupby(['country', 'sex', 'age_group'])['asy_applications'].transform(lambda x: x.shift(1).rolling(window=window, min_periods=1).std())

# Rolling mean and standard deviation of push_factor_index. This is a predictor,
# not the target, so the window keeps the current month.
# NOTE(review): confirm the current month's push_factor_index is actually known at
# forecast time; if not, shift it as well.
for window in rolling_windows:
    data_agg[f'push_factor_index_rolling_mean_{window}'] = data_agg.groupby(['country', 'sex', 'age_group'])['push_factor_index'].transform(lambda x: x.rolling(window=window, min_periods=1).mean())
    data_agg[f'push_factor_index_rolling_sd_{window}'] = data_agg.groupby(['country', 'sex', 'age_group'])['push_factor_index'].transform(lambda x: x.rolling(window=window, min_periods=1).std())

# Exponentially weighted mean / std of past asylum applications (span of 12 months),
# shifted by one month for the same leakage reason as the rolling features above
data_agg['asy_applications_ewm_mean'] = data_agg.groupby(['country', 'sex', 'age_group'])['asy_applications'].transform(lambda x: x.shift(1).ewm(span=12, adjust=False).mean())
data_agg['asy_applications_ewm_std'] = data_agg.groupby(['country', 'sex', 'age_group'])['asy_applications'].transform(lambda x: x.shift(1).ewm(span=12, adjust=False).std())

# Add temporal features for seasonality: the month number plus its sine / cosine
# encoding, so that December and January end up next to each other
data_agg['month'] = data_agg['year_month'].dt.month
data_agg['month_sin'] = np.sin(2 * np.pi * data_agg['month'] / 12)
data_agg['month_cos'] = np.cos(2 * np.pi * data_agg['month'] / 12)

# Feature columns fed to every model, followed by the column being forecast
predictors = [
    # Lagged explanatory variables
    "illegal_border_crossings_lag_3",
    "push_factor_index_lag_3",
    "push_factor_index_high_level_lag_2",
    "deaths_civilians_lag_12",
    "gdp_per_capita_current_usd_lag_12",
    "gdp_per_capita_growth_lag_12",
    "regime_end_type_lag_0",
    "state_fiscal_source_revenue_lag_0",
    "state_authority_over_territory_lag_0",
    "political_polarisation_lag_0",
    "political_violence_lag_0",
    "domestic_autonomy_lag_0",
    "rule_of_law_lag_0",
    # Rolling mean / sd for each window: asylum applications first, then the push factor index
    *(
        f"{series}_rolling_{stat}_{window}"
        for series in ("asy_applications", "push_factor_index")
        for window in (3, 6, 12, 24)
        for stat in ("mean", "sd")
    ),
    # Exponentially weighted statistics of asylum applications
    "asy_applications_ewm_mean",
    "asy_applications_ewm_std",
    # Cyclical month encoding
    "month_sin",
    "month_cos",
]
target_var = "asy_applications"

# Fill missing predictor values with IterativeImputer (at most 10 imputation rounds, seeded for repeatability).
# The imputer is fitted on every row of data_agg and the imputed values overwrite the predictor columns.
# NOTE(review): the forecast months are still part of data_agg at this point, so their rows
# take part in the fit - consider fitting on the training period only.
imputer = IterativeImputer(max_iter=10, random_state=random_seed)
data_agg[predictors] = imputer.fit_transform(data_agg[predictors])

# Scale predictors and target with separate RobustScaler instances.
# scaler_y is kept separate so predictions can later be mapped back to the
# original scale with scaler_y.inverse_transform.
# NOTE(review): like the imputer, both scalers are fitted on all rows, including the forecast months.
scaler_x = RobustScaler()
scaler_y = RobustScaler()
data_agg[predictors] = scaler_x.fit_transform(data_agg[predictors])
# The scaler expects a 2-D array: reshape the target to one column, then flatten the result back to 1-D
data_agg[target_var] = scaler_y.fit_transform(data_agg[target_var].values.reshape(-1, 1)).flatten()

# Residuals collected across all forecast months end up in this frame
all_residuals = pd.DataFrame()

# Validation explained-variance scores, one independent list per model type
variance_scores = {model_name: [] for model_name in ('XGBoost', 'LSTM', 'NN')}

# Per-group performance metric DataFrames are accumulated here
metrics_per_group_list = list()

# Function to create and train the LSTM model with L2 regularisation, dropout rate, and attention mechanism
def create_and_train_lstm_with_attention(X_train, y_train, X_val, y_val, units_1=32, units_2=32, dropout_rate=0.5, learning_rate=0.001, l2_lambda=0.01):
    """Build, fit and score a two-layer LSTM regressor with self-attention.

    Architecture: two stacked LSTM layers (both returning sequences, each
    followed by dropout), an ``Attention`` layer that uses the LSTM output as
    both query and value, a ReLU dense layer, a flatten step and a single
    linear output unit. LSTM kernels and the output layer carry an L2 penalty.

    Training uses Adam on mean squared error for at most 50 epochs with a
    batch size of 64. Early stopping watches ``val_loss`` with a patience of
    10 and restores the best weights.

    Side effect: the explained variance on the validation data is appended to
    the module-level ``variance_scores['LSTM']`` list.

    Args:
        X_train: 3-D training inputs; axes 1 and 2 define the model's input shape.
        y_train: Training targets.
        X_val: 3-D validation inputs.
        y_val: Validation targets.
        units_1: Units of the first LSTM layer.
        units_2: Units of the second LSTM layer and of the dense layer after attention.
        dropout_rate: Dropout rate applied after each LSTM layer.
        learning_rate: Adam learning rate.
        l2_lambda: L2 regularisation factor.

    Returns:
        Tuple ``(model, history)`` with the fitted Keras model and the object
        returned by ``model.fit``.
    """
    # Input shape comes straight from the training array: (axis 1, axis 2)
    sequence_input = Input(shape=(X_train.shape[1], X_train.shape[2]))

    # Two regularised LSTM blocks, each followed by dropout
    hidden = sequence_input
    for n_units in (units_1, units_2):
        hidden = LSTM(n_units, return_sequences=True, kernel_regularizer=l2(l2_lambda))(hidden)
        hidden = Dropout(dropout_rate)(hidden)

    # Self-attention over the LSTM output, then a dense projection flattened to 2-D
    attended = Attention()([hidden, hidden])
    attended = Dense(units_2, activation='relu')(attended)
    flattened = Flatten()(attended)

    # Single linear unit for the regression output
    prediction = Dense(1, kernel_regularizer=l2(l2_lambda), activation='linear')(flattened)

    model = Model(sequence_input, prediction)
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')

    # Stop once validation loss has not improved for 10 epochs and keep the best weights
    stopper = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    history = model.fit(
        X_train,
        y_train,
        epochs=50,
        batch_size=64,
        validation_data=(X_val, y_val),
        callbacks=[stopper],
        verbose=0,
    )

    # Record how much validation variance the fitted model explains
    validation_predictions = model.predict(X_val).flatten()
    variance_scores['LSTM'].append(explained_variance_score(y_val, validation_predictions))

    return model, history

# Function to create and train a neural network model with optimal hyperparameters, L2 regularisation, and dropout
def create_best_nn_model(X_train, y_train, X_val, y_val, dropout_rate=0.3, units_1=128, units_2=64, learning_rate=0.001, l2_lambda=0.01):
    """Build, fit and score a feed-forward regression network.

    Architecture: two ReLU dense layers, each followed by batch normalisation
    and dropout, and a single linear output unit. All dense layers carry an
    L2 penalty.

    Training uses Adam on mean squared error for at most 200 epochs with a
    batch size of 64. Early stopping watches ``val_loss`` with a patience of
    10 and restores the best weights.

    Side effect: the explained variance on the validation data is appended to
    the module-level ``variance_scores['NN']`` list.

    Args:
        X_train: 2-D training inputs; axis 1 defines the input width.
        y_train: Training targets.
        X_val: 2-D validation inputs.
        y_val: Validation targets.
        dropout_rate: Dropout rate applied after each hidden block.
        units_1: Units of the first dense layer.
        units_2: Units of the second dense layer.
        learning_rate: Adam learning rate.
        l2_lambda: L2 regularisation factor.

    Returns:
        Tuple ``(model, history)`` with the fitted Keras model and the object
        returned by ``model.fit``.
    """
    # Layer stack in forward order; the Input layer has to come first
    layer_stack = [
        Input(shape=(X_train.shape[1],)),
        Dense(units_1, activation='relu', kernel_regularizer=l2(l2_lambda)),
        BatchNormalization(),
        Dropout(dropout_rate),
        Dense(units_2, activation='relu', kernel_regularizer=l2(l2_lambda)),
        BatchNormalization(),
        Dropout(dropout_rate),
        Dense(1, kernel_regularizer=l2(l2_lambda), activation='linear'),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')

    # Stop once validation loss has not improved for 10 epochs and keep the best weights
    stopper = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    history = model.fit(
        X_train,
        y_train,
        epochs=200,
        batch_size=64,
        validation_data=(X_val, y_val),
        callbacks=[stopper],
        verbose=0,
    )

    # Record how much validation variance the fitted model explains
    validation_predictions = model.predict(X_val).flatten()
    variance_scores['NN'].append(explained_variance_score(y_val, validation_predictions))

    return model, history

# Function to train an XGBoost model with optimal hyperparameters
def train_xgboost_model(X_train, y_train, X_val, y_val):
    """Fit an XGBoost regressor with fixed hyperparameters and score it.

    The validation data is passed as the evaluation set, so boosting stops
    early after 10 rounds without improvement.

    Side effect: the explained variance on the validation data is appended to
    the module-level ``variance_scores['XGBoost']`` list.

    Args:
        X_train: Training inputs.
        y_train: Training targets.
        X_val: Validation inputs, also used as the evaluation set.
        y_val: Validation targets.

    Returns:
        Tuple ``(model, val_explained_variance)``. The second element mirrors
        the two-element return of the neural-network training functions.
    """
    hyperparameters = dict(
        learning_rate=0.2,
        max_depth=7,
        n_estimators=100,
        objective='reg:squarederror',
        random_state=random_seed,
        early_stopping_rounds=10,
    )
    regressor = xgb.XGBRegressor(**hyperparameters)

    # The validation data serves as the evaluation set that drives early stopping
    regressor.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Record how much validation variance the fitted model explains
    score = explained_variance_score(y_val, regressor.predict(X_val))
    variance_scores['XGBoost'].append(score)

    return regressor, score

# Function to split data for training and validation based on quarters
def split_train_val(data, quarter='Q1'):
    """Split a frame into training and validation rows by calendar quarter.

    Every row whose ``year_month`` falls in the requested quarter (in any
    year) goes to the validation set; all remaining rows form the training
    set.

    The split uses a boolean row mask rather than index membership, so it
    stays correct when the index contains duplicate labels. Both results are
    independent copies, so callers can add columns to them without touching
    ``data`` and without chained-assignment warnings.

    Side effect (kept for backward compatibility): a ``quarter`` column
    holding the quarterly period of ``year_month`` is written to ``data``
    itself, and therefore also appears in both returned frames.

    Args:
        data: DataFrame with a datetime ``year_month`` column.
        quarter: Quarter label to hold out, one of 'Q1' to 'Q4'.

    Returns:
        Tuple ``(train_data, val_data)`` of DataFrames.
    """
    # Convert dates to quarterly periods; their string form looks like '2023Q4'
    data['quarter'] = data['year_month'].dt.to_period('Q')

    # Row-wise mask: True where the period label ends with the requested quarter
    in_validation_quarter = data['quarter'].astype(str).str.endswith(quarter)

    val_data = data[in_validation_quarter].copy()
    train_data = data[~in_validation_quarter].copy()
    return train_data, val_data

# Function to calculate confidence intervals for predictions
def calculate_confidence_intervals(predictions, confidence_level=0.95):
    """Summarise repeated predictions as a mean with a normal-theory interval.

    The mean and standard deviation are taken along axis 0 (across the
    repeated prediction runs). The interval is ``mean +/- z * std``, where
    ``z`` is the two-sided standard-normal quantile for ``confidence_level``.

    Args:
        predictions: Array-like whose first axis indexes the repeated runs.
        confidence_level: Two-sided coverage, strictly between 0 and 1.
            The default of 0.95 gives z of approximately 1.96.

    Returns:
        Tuple ``(mean_preds, lower_bound, upper_bound)``.

    Raises:
        ValueError: If ``confidence_level`` is not strictly between 0 and 1.
    """
    # Local import keeps this function self-contained (standard library only)
    from statistics import NormalDist

    if not 0.0 < confidence_level < 1.0:
        raise ValueError(
            f"confidence_level must be strictly between 0 and 1, got {confidence_level!r}"
        )

    mean_preds = np.mean(predictions, axis=0)
    std_dev = np.std(predictions, axis=0)

    # Two-sided interval: leave (1 - confidence_level) / 2 in each tail
    z_score = NormalDist().inv_cdf(0.5 + confidence_level / 2.0)

    lower_bound = mean_preds - z_score * std_dev
    upper_bound = mean_preds + z_score * std_dev
    return mean_preds, lower_bound, upper_bound

# Function for bootstrapping predictions for a model
def bootstrap_predictions(model_func, X_train, y_train, X_val, y_val, X_test, n_iterations=20):
    """Refit a model on bootstrap resamples and collect its test predictions.

    Each iteration draws ``len(X_train)`` row indices with replacement from
    NumPy's global random generator, trains a fresh model through
    ``model_func`` on those rows and predicts ``X_test``. An iteration whose
    resample is empty is skipped.

    When ``model_func`` is named ``create_and_train_lstm_with_attention``, the
    resampled training inputs, the validation inputs and the test inputs are
    reshaped from (rows, features) to (rows, 1, features) first.

    Args:
        model_func: Callable ``(X_train, y_train, X_val, y_val)`` returning a
            2-tuple whose first element has a ``predict`` method.
        X_train: 2-D NumPy array of training inputs.
        y_train: NumPy array of training targets.
        X_val: 2-D NumPy array of validation inputs.
        y_val: Validation targets, passed through unchanged.
        X_test: 2-D NumPy array of inputs to predict.
        n_iterations: Number of bootstrap rounds.

    Returns:
        NumPy array holding one flattened prediction vector per completed round.
    """
    def _add_step_axis(matrix):
        # (rows, features) -> (rows, 1, features)
        return np.reshape(matrix, (matrix.shape[0], 1, matrix.shape[1]))

    collected = []
    for _round in range(n_iterations):
        # Sample row positions with replacement
        sampled_rows = np.random.choice(range(len(X_train)), size=len(X_train), replace=True)
        X_boot = X_train[sampled_rows]
        y_boot = y_train[sampled_rows]

        # Nothing to train on: move to the next round
        if X_boot.size == 0 or y_boot.size == 0:
            continue

        # Only the LSTM trainer gets three-dimensional inputs
        if model_func.__name__ == 'create_and_train_lstm_with_attention':
            X_boot = _add_step_axis(X_boot)
            X_val_input = _add_step_axis(X_val)
            X_test_input = _add_step_axis(X_test)
        else:
            X_val_input = X_val
            X_test_input = X_test

        fitted_model, _ignored = model_func(X_boot, y_boot, X_val_input, y_val)
        collected.append(fitted_model.predict(X_test_input).flatten())

    return np.array(collected)

# Function to compute weights inversely proportional to RMSE
def calculate_weights(rmse_values):
    """Turn RMSE values into ensemble weights that sum to one.

    Each weight is proportional to ``1 / RMSE``, so a model with a lower
    error gets a larger share.

    A zero RMSE would make ``1 / RMSE`` infinite and turn every weight into
    NaN. In that case the whole weight is split equally among the models
    with zero RMSE and every other model gets a weight of 0.

    Args:
        rmse_values: Sequence of non-negative RMSE values, one per model.

    Returns:
        NumPy float array of weights in the same order as ``rmse_values``.
    """
    rmse = np.asarray(rmse_values, dtype=float)

    # Perfect models take all the weight; this avoids inf / inf = NaN
    is_perfect = rmse == 0
    if is_perfect.any():
        return is_perfect / is_perfect.sum()

    inv_rmse = 1.0 / rmse
    return inv_rmse / np.sum(inv_rmse)

# Function to plot combined residuals over time
def plot_combined_residuals(all_residuals):
    """Save one residual scatter figure per country, one panel per sex / age group.

    Each panel plots residuals against ``year_month`` for the train,
    validation and test rows (told apart by the ``data_split`` column), with
    a dashed zero line. Figures are written to
    ``{output_dir}/Residuals_{country}.png``, where ``output_dir`` is the
    module-level output folder.

    The panel grid has two columns and as many rows as the number of groups
    requires. The minimum is two rows, so countries with up to four groups
    keep the 2x2 layout and 15x10 figure size. With a fixed 2x2 grid, a fifth
    group would make ``plt.subplot`` raise.

    Args:
        all_residuals: DataFrame with the columns ``country``, ``sex``,
            ``age_group``, ``year_month``, ``Residuals`` and ``data_split``.
            An empty frame produces no figures.
    """
    # Nothing collected yet: the frame may not even have the expected columns
    if all_residuals.empty:
        return

    # (data_split value, legend label, marker, colour)
    split_styles = (
        ('train', 'Train', 'o', 'blue'),
        ('val', 'Validation', 'x', 'green'),
        ('test', 'Test', '^', 'red'),
    )
    n_cols = 2

    for country in all_residuals['country'].unique():
        country_rows = all_residuals[all_residuals['country'] == country]
        sex_age_groups = country_rows.groupby(['sex', 'age_group'])

        # Enough rows for every group, but never fewer than the original two
        n_rows = max(2, (sex_age_groups.ngroups + n_cols - 1) // n_cols)
        plt.figure(figsize=(15, 5 * n_rows))

        # Each group frame already holds exactly this country's rows for one sex / age group
        for i, ((sex, age_group), group_data) in enumerate(sex_age_groups, start=1):
            plt.subplot(n_rows, n_cols, i)

            for split_name, label, marker, color in split_styles:
                split_rows = group_data[group_data['data_split'] == split_name]
                plt.scatter(split_rows['year_month'], split_rows['Residuals'], alpha=0.5, label=label, marker=marker, color=color)

            plt.title(f'Residuals - {country}, {sex}, {age_group}')
            plt.xlabel('Date')
            plt.ylabel('Residuals')
            plt.axhline(0, color='red', linestyle='--')
            plt.legend()
            plt.grid(True)

        plt.tight_layout()
        plt.savefig(f'{output_dir}/Residuals_{country}.png')
        # Close the figure so memory does not grow with the number of countries
        plt.close()

# Function to plot average feature importances for each month
def plot_aggregated_feature_importances(xgboost_models, predictors, output_dir, month):
    """Plot and save the mean XGBoost feature importance across several models.

    The ``feature_importances_`` vectors of all models are averaged and drawn
    as a horizontal bar chart with the most important feature at the top. The
    figure is saved to
    ``{output_dir}/Aggregated_Feature_Importances_{month}.png``.

    Args:
        xgboost_models: Fitted models exposing ``feature_importances_``. The
            sequence may be flat or contain nested lists / tuples of models
            (for example one list per cross-validation fold); nested entries
            are flattened one level.
        predictors: Feature names, in the column order the models were trained on.
        output_dir: Folder that receives the PNG file.
        month: Label used in the plot title and in the file name.

    Raises:
        ValueError: If no model is supplied.
    """
    # Accept both a flat list of models and a list of per-fold model lists
    flat_models = []
    for entry in xgboost_models:
        if isinstance(entry, (list, tuple)):
            flat_models.extend(entry)
        else:
            flat_models.append(entry)

    if not flat_models:
        raise ValueError("xgboost_models must contain at least one fitted model")

    # Mean importance of every feature across the models
    importances = np.array([model.feature_importances_ for model in flat_models])
    mean_importances = np.mean(importances, axis=0)

    # Sorting indices for plotting (highest to lowest)
    sorted_indices = np.argsort(mean_importances)[::-1]

    # Creating the plot
    plt.figure(figsize=(10, 8))
    plt.barh(np.array(predictors)[sorted_indices], mean_importances[sorted_indices], color='skyblue')
    plt.xlabel('Average Feature Importance')
    plt.ylabel('Features')
    plt.title(f'Average Feature Importance for {month}')
    plt.gca().invert_yaxis()  # barh draws the first bar at the bottom; flip so the top feature is first
    plt.tight_layout()

    # Save the plot to the specified directory and release the figure
    plt.savefig(f'{output_dir}/Aggregated_Feature_Importances_{month}.png')
    plt.close()

# Function to calculate explained variance at the aggregated level per month
def calculate_aggregated_explained_variance_per_month(results_df, prediction_column):
    """Score one prediction column after summing results to country / month level.

    Numeric columns of ``results_df`` are summed per ``country`` and
    ``year_month``. The explained variance is then computed between the
    summed ``True Values`` and the summed ``prediction_column``.

    Args:
        results_df: DataFrame with ``country``, ``year_month``,
            ``True Values`` and the prediction column.
        prediction_column: Name of the column holding the predictions to score.

    Returns:
        One-row DataFrame with the columns ``Date`` (the ``year_month`` of the
        first row of ``results_df``) and ``Explained Variance``.
    """
    # Totals per country and month
    country_month_totals = (
        results_df
        .groupby(['country', 'year_month'])
        .sum(numeric_only=True)
        .reset_index()
    )

    score = explained_variance_score(
        country_month_totals['True Values'],
        country_month_totals[prediction_column],
    )

    # The first row's date labels the result
    first_date = results_df['year_month'].iloc[0]
    return pd.DataFrame({'Date': [first_date], 'Explained Variance': [score]})

# One empty (Date, Explained Variance) frame per model, filled month by month
(
    explained_variance_lstm_df,
    explained_variance_nn_df,
    explained_variance_xgb_df,
    explained_variance_ensemble_df,
) = (pd.DataFrame(columns=['Date', 'Explained Variance']) for _ in range(4))

# Forecast results for all months are accumulated here
all_results_df = pd.DataFrame()

# Wide table: one explained-variance column per model, one row per date
explained_variance_df = pd.DataFrame(
    columns=['Date'] + [f'Explained Variance {model_name}' for model_name in ('XGBoost', 'LSTM', 'NN', 'Ensemble')]
)

# Months for which forecasts are produced
prediction_dates = ['2024-01-01', '2024-02-01', '2024-03-01']

# Five expanding-window splits for time-series cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Loop through each prediction date and generate forecasts
for date in prediction_dates:
    print(f"Processing predictions for {date}")
    
    # Split the data into training and testing sets
    train_data, val_data = split_train_val(data_agg, quarter='Q4')
    test_data = data_agg[data_agg['year_month'] == date]
    
    X_train = train_data[predictors]
    y_train = train_data[target_var]
    X_val = val_data[predictors]
    y_val = val_data[target_var]
    X_test = test_data[predictors]
    y_test = test_data[target_var]
    
    # Perform walk-forward validation
    lstm_history = []
    nn_history = []
    xgboost_models = []

    # Convert to NumPy arrays for model input
    X_train_np = X_train.to_numpy()
    y_train_np = y_train.to_numpy()
    X_val_np = X_val.to_numpy()
    y_val_np = y_val.to_numpy()
    X_test_np = X_test.to_numpy()
    y_test_np = y_test.to_numpy()

    # Initialise lists to store RMSE values
    lstm_rmse_train = []
    nn_rmse_train = []
    xgboost_rmse_train = []

    lstm_rmse_val = []
    nn_rmse_val = []
    xgboost_rmse_val = []

    # Perform time-series cross-validation
    for train_index, val_index in tscv.split(X_train_np):
        X_train_cv, X_val_cv = X_train_np[train_index], X_train_np[val_index]
        y_train_cv, y_val_cv = y_train_np[train_index], y_train_np[val_index]
        
        X_train_lstm = np.reshape(X_train_cv, (X_train_cv.shape[0], 1, X_train_cv.shape[1]))
        X_val_lstm = np.reshape(X_val_cv, (X_val_cv.shape[0], 1, X_val_cv.shape[1]))
        
        # Number of models to train for ensemble
        num_models = 5
        
        lstm_models = []
        for i in range(num_models):
            # Set a new random seed for each model based on the loop index
            seed = random_seed + i
            np.random.seed(seed)
            tf.random.set_seed(seed)
            random.seed(seed)
            tf.keras.utils.set_random_seed(seed)

            # Train LSTM model
            model, history = create_and_train_lstm_with_attention(X_train_lstm, y_train_cv, X_val_lstm, y_val_cv)
            lstm_models.append(model)
            lstm_history.append(history)

            # Compute RMSE for train and validation sets
            train_pred_lstm = model.predict(X_train_lstm).flatten()
            val_pred_lstm = model.predict(X_val_lstm).flatten()

            lstm_rmse_train.append(np.sqrt(mean_squared_error(y_train_cv, train_pred_lstm)))
            lstm_rmse_val.append(np.sqrt(mean_squared_error(y_val_cv, val_pred_lstm)))
        
        nn_models = []
        for i in range(num_models):
            seed = random_seed + i
            np.random.seed(seed)
            tf.random.set_seed(seed)
            random.seed(seed)
            tf.keras.utils.set_random_seed(seed)
            
            # Train NN model
            model, history = create_best_nn_model(X_train_cv, y_train_cv, X_val_cv, y_val_cv)
            nn_models.append(model)
            nn_history.append(history)

            # Compute RMSE for train and validation sets
            train_pred_nn = model.predict(X_train_cv).flatten()
            val_pred_nn = model.predict(X_val_cv).flatten()

            nn_rmse_train.append(np.sqrt(mean_squared_error(y_train_cv, train_pred_nn)))
            nn_rmse_val.append(np.sqrt(mean_squared_error(y_val_cv, val_pred_nn)))

        xgboost_models_this_month = []
        for i in range(num_models):
            seed = random_seed + i
            np.random.seed(seed)
            tf.random.set_seed(seed)
            random.seed(seed)
            tf.keras.utils.set_random_seed(seed)
            
            # Train XGBoost model
            model, _ = train_xgboost_model(X_train_cv, y_train_cv, X_val_cv, y_val_cv)
            xgboost_models_this_month.append(model)  # Append only the model

            # Compute RMSE for train and validation sets
            train_pred_xgb = model.predict(X_train_cv)
            val_pred_xgb = model.predict(X_val_cv)

            xgboost_rmse_train.append(np.sqrt(mean_squared_error(y_train_cv, train_pred_xgb)))
            xgboost_rmse_val.append(np.sqrt(mean_squared_error(y_val_cv, val_pred_xgb)))

        xgboost_models.append(xgboost_models_this_month)

    # Calculate average RMSE over all models for training and validation
    avg_lstm_rmse_train = np.mean(lstm_rmse_train)
    avg_nn_rmse_train = np.mean(nn_rmse_train)
    avg_xgboost_rmse_train = np.mean(xgboost_rmse_train)

    avg_lstm_rmse_val = np.mean(lstm_rmse_val)
    avg_nn_rmse_val = np.mean(nn_rmse_val)
    avg_xgboost_rmse_val = np.mean(xgboost_rmse_val)

    # Calculate weights based on the average RMSE
    train_weights = calculate_weights([avg_xgboost_rmse_train, avg_lstm_rmse_train, avg_nn_rmse_train])
    val_weights = calculate_weights([avg_xgboost_rmse_val, avg_lstm_rmse_val, avg_nn_rmse_val])

    # Generate bootstrapped predictions for each model
    lstm_bootstrap_preds = bootstrap_predictions(
        create_and_train_lstm_with_attention, X_train_np, y_train_np, X_val_np, y_val_np, X_test_np
    )
    nn_bootstrap_preds = bootstrap_predictions(
        create_best_nn_model, X_train_np, y_train_np, X_val_np, y_val_np, X_test_np
    )
    xgboost_bootstrap_preds = bootstrap_predictions(
        train_xgboost_model, X_train_np, y_train_np, X_val_np, y_val_np, X_test_np
    )

    # Calculate mean and confidence intervals for each model's predictions
    lstm_mean, lstm_lower_ci, lstm_upper_ci = calculate_confidence_intervals(lstm_bootstrap_preds)
    nn_mean, nn_lower_ci, nn_upper_ci = calculate_confidence_intervals(nn_bootstrap_preds)
    xgboost_mean, xgboost_lower_ci, xgboost_upper_ci = calculate_confidence_intervals(xgboost_bootstrap_preds)

    # Calculate CI widths and choose the model with the widest CI for ensemble predictions
    lstm_ci_width = lstm_upper_ci - lstm_lower_ci
    nn_ci_width = nn_upper_ci - nn_lower_ci
    xgboost_ci_width = xgboost_upper_ci - xgboost_lower_ci

    # Create an array to hold ensemble predictions using the model with the widest CI
    ensemble_mean_widest_ci = np.zeros_like(lstm_mean)
    ensemble_lower_ci = np.zeros_like(lstm_lower_ci)  # Define these arrays to store ensemble CI data
    ensemble_upper_ci = np.zeros_like(lstm_upper_ci)

    # Selecting the model with the widest CI for each instance
    for i in range(len(ensemble_mean_widest_ci)):
        if lstm_ci_width[i] >= nn_ci_width[i] and lstm_ci_width[i] >= xgboost_ci_width[i]:
            ensemble_mean_widest_ci[i] = lstm_mean[i]
            ensemble_lower_ci[i] = lstm_lower_ci[i]
            ensemble_upper_ci[i] = lstm_upper_ci[i]
        elif nn_ci_width[i] >= lstm_ci_width[i] and nn_ci_width[i] >= xgboost_ci_width[i]:
            ensemble_mean_widest_ci[i] = nn_mean[i]
            ensemble_lower_ci[i] = nn_lower_ci[i]
            ensemble_upper_ci[i] = nn_upper_ci[i]
        else:
            ensemble_mean_widest_ci[i] = xgboost_mean[i]
            ensemble_lower_ci[i] = xgboost_lower_ci[i]
            ensemble_upper_ci[i] = xgboost_upper_ci[i]

    # Rescale predictions back to original scale
    y_test_rescaled = scaler_y.inverse_transform(y_test_np.reshape(-1, 1)).flatten()
    ensemble_pred_rescaled = np.round(scaler_y.inverse_transform(ensemble_mean_widest_ci.reshape(-1, 1)).flatten())
    
    results_df = test_data[['country', 'sex', 'age_group', 'year_month']].copy()
    results_df['True Values'] = np.round(y_test_rescaled[:results_df.shape[0]])
    results_df['Ensemble Prediction'] = np.round(ensemble_pred_rescaled[:results_df.shape[0]])

    # Add lower and upper CIs for each model and the ensemble
    results_df['XGBoost Lower CI'] = np.round(scaler_y.inverse_transform(xgboost_lower_ci.reshape(-1, 1)).flatten()[:results_df.shape[0]])
    results_df['XGBoost Upper CI'] = np.round(scaler_y.inverse_transform(xgboost_upper_ci.reshape(-1, 1)).flatten()[:results_df.shape[0]])

    results_df['LSTM Lower CI'] = np.round(scaler_y.inverse_transform(lstm_lower_ci.reshape(-1, 1)).flatten()[:results_df.shape[0]])
    results_df['LSTM Upper CI'] = np.round(scaler_y.inverse_transform(lstm_upper_ci.reshape(-1, 1)).flatten()[:results_df.shape[0]])

    results_df['NN Lower CI'] = np.round(scaler_y.inverse_transform(nn_lower_ci.reshape(-1, 1)).flatten()[:results_df.shape[0]])
    results_df['NN Upper CI'] = np.round(scaler_y.inverse_transform(nn_upper_ci.reshape(-1, 1)).flatten()[:results_df.shape[0]])
    
    results_df['Ensemble Lower CI'] = np.round(scaler_y.inverse_transform(ensemble_lower_ci.reshape(-1, 1)).flatten()[:results_df.shape[0]])
    results_df['Ensemble Upper CI'] = np.round(scaler_y.inverse_transform(ensemble_upper_ci.reshape(-1, 1)).flatten()[:results_df.shape[0]])

    results_df['Residuals'] = results_df['True Values'] - results_df['Ensemble Prediction']

    # Store individual model predictions
    results_df['XGBoost Prediction'] = np.round(scaler_y.inverse_transform(xgboost_mean.reshape(-1, 1)).flatten()[:results_df.shape[0]])
    results_df['LSTM Prediction'] = np.round(scaler_y.inverse_transform(lstm_mean.reshape(-1, 1)).flatten()[:results_df.shape[0]])
    results_df['NN Prediction'] = np.round(scaler_y.inverse_transform(nn_mean.reshape(-1, 1)).flatten()[:results_df.shape[0]])
    
    # Calculate residuals for training and validation sets
    train_pred_xgb_list = [model.predict(X_train_np) for model in xgboost_models_this_month]
    train_pred_xgb = np.mean(train_pred_xgb_list, axis=0)
    
    train_pred_lstm_list = [model.predict(np.reshape(X_train_np, (X_train_np.shape[0], 1, X_train_np.shape[1]))).flatten() for model in lstm_models]
    train_pred_lstm = np.mean(train_pred_lstm_list, axis=0)
    
    train_pred_nn_list = [model.predict(X_train_np).flatten() for model in nn_models]
    train_pred_nn = np.mean(train_pred_nn_list, axis=0)

    # Compute weighted average for train predictions
    ensemble_train_pred = (train_weights[0] * train_pred_xgb +
                           train_weights[1] * train_pred_lstm +
                           train_weights[2] * train_pred_nn)

    # Create train meta-features without confidence intervals
    train_meta_features = pd.DataFrame({
        'XGBoost': train_pred_xgb,
        'LSTM': train_pred_lstm,
        'NN': train_pred_nn
    }, dtype='float32')  # Ensure inputs are floats
    
    train_pred_rescaled = np.round(scaler_y.inverse_transform(ensemble_train_pred.reshape(-1, 1)).flatten())
    y_train_rescaled = scaler_y.inverse_transform(y_train_np.reshape(-1, 1)).flatten()
    train_residuals = y_train_rescaled - train_pred_rescaled
    train_data['Residuals'] = train_residuals

    val_pred_xgb_list = [model.predict(X_val_np) for model in xgboost_models_this_month]
    val_pred_xgb = np.mean(val_pred_xgb_list, axis=0)

    val_pred_lstm_list = [model.predict(np.reshape(X_val_np, (X_val_np.shape[0], 1, X_val_np.shape[1]))).flatten() for model in lstm_models]
    val_pred_lstm = np.mean(val_pred_lstm_list, axis=0)
    
    val_pred_nn_list = [model.predict(X_val_np).flatten() for model in nn_models]
    val_pred_nn = np.mean(val_pred_nn_list, axis=0)

    # Compute weighted average for validation predictions
    ensemble_val_pred = (val_weights[0] * val_pred_xgb +
                         val_weights[1] * val_pred_lstm +
                         val_weights[2] * val_pred_nn)

    val_pred_rescaled = np.round(scaler_y.inverse_transform(ensemble_val_pred.reshape(-1, 1)).flatten())
    y_val_rescaled = scaler_y.inverse_transform(y_val_np.reshape(-1, 1)).flatten()
    val_residuals = y_val_rescaled - val_pred_rescaled
    val_data['Residuals'] = val_residuals

    # Append residuals to the all_residuals DataFrame
    test_residuals = results_df['True Values'] - results_df['Ensemble Prediction']
    test_data['Residuals'] = test_residuals
    
    # Add a column to identify data splits
    train_data['data_split'] = 'train'
    val_data['data_split'] = 'val'
    test_data['data_split'] = 'test'
    
    all_residuals = pd.concat([
        all_residuals,
        train_data[['country', 'sex', 'age_group', 'year_month', 'Residuals', 'data_split']],
        val_data[['country', 'sex', 'age_group', 'year_month', 'Residuals', 'data_split']],
        test_data[['country', 'sex', 'age_group', 'year_month', 'Residuals', 'data_split']]
    ])


    groups = results_df.groupby(['country', 'sex', 'age_group'])
    for (country, sex, age_group), group_data in groups:
        print(f"Residuals for Country: {country}, Sex: {sex}, Age Group: {age_group}")
        print(group_data[['year_month', 'Residuals']])
    
    results_df.to_csv(f'{output_dir}/forecasts_with_uncertainty_group_level_{date}.csv', index=False)
    
    metrics_per_group = []
    groups = results_df.groupby(['country', 'sex', 'age_group'])

    for (country, sex, age_group), group_data in groups:
        y_true_group = group_data['True Values'].values
        y_pred_group = group_data['Ensemble Prediction'].values
        y_pred_xgb_group = group_data['XGBoost Prediction'].values
        y_pred_lstm_group = group_data['LSTM Prediction'].values
        y_pred_nn_group = group_data['NN Prediction'].values
        
        if not np.any(np.isnan(y_true_group)) and not np.any(np.isnan(y_pred_group)):
            mse_group = mean_squared_error(y_true_group, y_pred_group)
            rmse_group = np.sqrt(mse_group)  
            mae_group = mean_absolute_error(y_true_group, y_pred_group)
            mdae_group = median_absolute_error(y_true_group, y_pred_group)
            mape_group = mean_absolute_percentage_error(y_true_group + 1e-6, y_pred_group)

            mse_xgb_group = mean_squared_error(y_true_group, y_pred_xgb_group)
            rmse_xgb_group = np.sqrt(mse_xgb_group)
            mae_xgb_group = mean_absolute_error(y_true_group, y_pred_xgb_group)
            mdae_xgb_group = median_absolute_error(y_true_group, y_pred_xgb_group)
            mape_xgb_group = mean_absolute_percentage_error(y_true_group + 1e-6, y_pred_xgb_group)

            mse_lstm_group = mean_squared_error(y_true_group, y_pred_lstm_group)
            rmse_lstm_group = np.sqrt(mse_lstm_group)
            mae_lstm_group = mean_absolute_error(y_true_group, y_pred_lstm_group)
            mdae_lstm_group = median_absolute_error(y_true_group, y_pred_lstm_group)
            mape_lstm_group = mean_absolute_percentage_error(y_true_group + 1e-6, y_pred_lstm_group)

            mse_nn_group = mean_squared_error(y_true_group, y_pred_nn_group)
            rmse_nn_group = np.sqrt(mse_nn_group)
            mae_nn_group = mean_absolute_error(y_true_group, y_pred_nn_group)
            mdae_nn_group = median_absolute_error(y_true_group, y_pred_nn_group)
            mape_nn_group = mean_absolute_percentage_error(y_true_group + 1e-6, y_pred_nn_group)
            
            metrics_per_group.append({
                'Country': country,
                'Sex': sex,
                'Age Group': age_group,
                'MSE Ensemble': np.round(mse_group),
                'RMSE Ensemble': np.round(rmse_group),
                'MAE Ensemble': np.round(mae_group),
                'MDAE Ensemble': np.round(mdae_group),
                'MAPE Ensemble': np.round(mape_group),
                'MSE XGBoost': np.round(mse_xgb_group),
                'RMSE XGBoost': np.round(rmse_xgb_group),
                'MAE XGBoost': np.round(mae_xgb_group),
                'MDAE XGBoost': np.round(mdae_xgb_group),
                'MAPE XGBoost': np.round(mape_xgb_group),
                'MSE LSTM': np.round(mse_lstm_group),
                'RMSE LSTM': np.round(rmse_lstm_group),
                'MAE LSTM': np.round(mae_lstm_group),
                'MDAE LSTM': np.round(mdae_lstm_group),
                'MAPE LSTM': np.round(mape_lstm_group),
                'MSE NN': np.round(mse_nn_group),
                'RMSE NN': np.round(rmse_nn_group),
                'MAE NN': np.round(mae_nn_group),
                'MDAE NN': np.round(mdae_nn_group),
                'MAPE NN': np.round(mape_nn_group)
            })

    metrics_group_df = pd.DataFrame(metrics_per_group)
    metrics_per_group_list.append(metrics_group_df)

    metrics_group_df.to_csv(f'{output_dir}/performance_metrics_per_group_{date}.csv', index=False)

    # Aggregate to country level by summing every numeric metric column of the
    # per-group table.
    # NOTE(review): the per-group values were already rounded to whole numbers
    # with np.round before being stored, and the "RMSE"/"MAPE"/"MDAE" columns
    # written here are sums of per-group values, not the error of the pooled
    # country-level data. TODO confirm that a sum is the intended aggregate.
    metrics_per_country = metrics_group_df.groupby('Country').sum(numeric_only=True).reset_index()
    metrics_per_country.to_csv(f'{output_dir}/performance_metrics_per_country_{date}.csv', index=False)

    # Aggregate to total by summing up the country-level metrics (a sum of sums,
    # stored as a single-row table).
    # NOTE(review): the inverse-RMSE weighting cell later in this notebook reads
    # files named 'performance_metrics_overall_*.csv' and uses their
    # 'RMSE XGBoost' / 'RMSE LSTM' / 'RMSE NN' columns, so it presumably consumes
    # these summed values - verify that output_dir matches the folder it reads.
    overall_metrics = metrics_per_country.sum(numeric_only=True).to_dict()
    overall_metrics_df = pd.DataFrame([overall_metrics])
    overall_metrics_df.to_csv(f'{output_dir}/performance_metrics_overall_{date}.csv', index=False)

    # Plot feature importances for each month
    plot_aggregated_feature_importances(xgboost_models_this_month, predictors, output_dir, date)

    # Calculate explained variance for XGBoost, LSTM, NN, and Ensemble
    explained_variance_xgb = explained_variance_score(y_test_rescaled, xgboost_mean)
    explained_variance_lstm = explained_variance_score(y_test_rescaled, lstm_mean)
    explained_variance_nn = explained_variance_score(y_test_rescaled, nn_mean)
    explained_variance_ensemble = explained_variance_score(y_test_rescaled, ensemble_pred_rescaled)

    # Add explained variance to DataFrame
    explained_variance_df = pd.concat([explained_variance_df, pd.DataFrame({
        'Date': [date],
        'Explained Variance XGBoost': [explained_variance_xgb],
        'Explained Variance LSTM': [explained_variance_lstm],
        'Explained Variance NN': [explained_variance_nn],
        'Explained Variance Ensemble': [explained_variance_ensemble]
    })], ignore_index=True)

    print(f"Explained Variance - XGBoost: {explained_variance_xgb:.4f}")
    print(f"Explained Variance - LSTM: {explained_variance_lstm:.4f}")
    print(f"Explained Variance - NN: {explained_variance_nn:.4f}")
    print(f"Explained Variance - Ensemble: {explained_variance_ensemble:.4f}")

    # Calculate and store explained variance at the aggregated level per month
    explained_variance_lstm_df = pd.concat([explained_variance_lstm_df, calculate_aggregated_explained_variance_per_month(results_df, 'LSTM Prediction')])
    explained_variance_nn_df = pd.concat([explained_variance_nn_df, calculate_aggregated_explained_variance_per_month(results_df, 'NN Prediction')])
    explained_variance_xgb_df = pd.concat([explained_variance_xgb_df, calculate_aggregated_explained_variance_per_month(results_df, 'XGBoost Prediction')])
    explained_variance_ensemble_df = pd.concat([explained_variance_ensemble_df, calculate_aggregated_explained_variance_per_month(results_df, 'Ensemble Prediction')])

    # Aggregate true values and predicted values at the country and month level
    country_month_aggregation = results_df.groupby(['country', 'year_month']).sum(numeric_only=True).reset_index()
    country_month_aggregation.to_csv(f'{output_dir}/country_level_aggregation_{date}.csv', index=False)

    # Aggregate true values and predicted values overall by summing up the country-month-level metrics
    overall_aggregation = country_month_aggregation.sum(numeric_only=True).to_dict()
    overall_aggregation_df = pd.DataFrame([overall_aggregation])
    overall_aggregation_df.to_csv(f'{output_dir}/overall_aggregation_{date}.csv', index=False)

# Persist the explained-variance tables: first the combined per-model table,
# then one file per model with the figures aggregated per month.
explained_variance_df.to_csv(f'{output_dir}/explained_variance_by_model_per_month.csv', index=False)

aggregated_variance_by_model = {
    'LSTM': explained_variance_lstm_df,
    'NN': explained_variance_nn_df,
    'XGBoost': explained_variance_xgb_df,
    'Ensemble': explained_variance_ensemble_df,
}
for model_tag, variance_table in aggregated_variance_by_model.items():
    variance_table.to_csv(f'{output_dir}/explained_variance_per_month_{model_tag}.csv', index=False)

# The residuals gathered over all forecasts are drawn in a single call.
plot_combined_residuals(all_residuals)

# One learning-curve figure per network family. Every stored training history
# contributes a solid training-loss line and a dashed validation-loss line,
# numbered from 1 in the legend.
learning_curve_specs = (
    ('LSTM', lstm_history, 'Learning Curves for LSTM Models'),
    ('NN', nn_history, 'Learning Curves for Neural Network Models'),
)
for model_tag, histories, figure_title in learning_curve_specs:
    plt.figure(figsize=(14, 8))
    for run_number, run_history in enumerate(histories, start=1):
        losses = run_history.history
        plt.plot(losses['loss'], label=f'{model_tag} Train {run_number}', linestyle='-')
        plt.plot(losses['val_loss'], label=f'{model_tag} Val {run_number}', linestyle='--')
    plt.title(figure_title)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(f"{output_dir}/{model_tag}_Learning_Curves.png")
    plt.show()


Calculation of model weights based on inverse RMSE for each forecasted month

In [None]:
import os
import pandas as pd

# Directory containing the monthly 'performance_metrics_overall_<date>.csv' files.
directory = 'Segmented Approach Outputs'

# Directory that receives the resulting table of weights.
weights_output_dir = 'GHVT6_Outputs'

# Only CSV files whose name starts with 'performance_metrics_overall_' are used.
# Sorting the names gives chronological order provided <date> is an ISO-style
# date (presumably so, matching the '2024-01-01' dates used in other cells).
metrics_prefix = 'performance_metrics_overall_'
metrics_suffix = '.csv'
files = sorted(
    f for f in os.listdir(directory)
    if f.startswith(metrics_prefix) and f.endswith(metrics_suffix)
)


def _inverse_rmse_weights(rmse_by_model):
    """Convert per-model RMSE values into weights proportional to 1 / RMSE.

    Parameters
    ----------
    rmse_by_model : dict
        Maps a model name to its RMSE.

    Returns
    -------
    dict
        Maps each model name to its weight; the weights sum to 1. If one or
        more models have an RMSE of exactly 0 they share the whole weight
        equally (the limit of inverse-RMSE weighting), which avoids the
        inf / NaN weights a plain division would produce.

    Raises
    ------
    ValueError
        If any RMSE is missing (NaN) or negative.
    """
    invalid = {
        model: rmse for model, rmse in rmse_by_model.items()
        if pd.isna(rmse) or rmse < 0
    }
    if invalid:
        raise ValueError(f"RMSE values must be non-negative numbers, got {invalid}")

    perfect_models = [model for model, rmse in rmse_by_model.items() if rmse == 0]
    if perfect_models:
        share = 1.0 / len(perfect_models)
        return {model: (share if model in perfect_models else 0.0) for model in rmse_by_model}

    inverse_rmse = {model: 1 / rmse for model, rmse in rmse_by_model.items()}
    sum_inv_rmse = sum(inverse_rmse.values())
    return {model: inverse / sum_inv_rmse for model, inverse in inverse_rmse.items()}


# One row of weights is collected per monthly metrics file.
monthly_weights = []

for file in files:
    # The date is whatever sits between the fixed prefix and the '.csv' suffix.
    date_str = file[len(metrics_prefix):-len(metrics_suffix)]

    file_path = os.path.join(directory, file)
    metrics_df = pd.read_csv(file_path)

    # The RMSE of each model is taken from the first row of the file.
    rmse_by_model = {
        'XGBoost': metrics_df['RMSE XGBoost'].values[0],
        'LSTM': metrics_df['RMSE LSTM'].values[0],
        'NN': metrics_df['RMSE NN'].values[0],
    }

    try:
        weights = _inverse_rmse_weights(rmse_by_model)
    except ValueError as error:
        # Re-raise with the offending file so the bad month can be located.
        raise ValueError(f"{file_path}: {error}") from error

    monthly_weights.append({
        'Date': date_str,
        'XGBoost Weight': weights['XGBoost'],
        'LSTM Weight': weights['LSTM'],
        'NN Weight': weights['NN']
    })

# The list of weights is converted into a DataFrame for viewing and saving.
weights_df = pd.DataFrame(monthly_weights)
print(weights_df)

# The output folder is created on demand so that saving cannot fail on a
# missing directory.
os.makedirs(weights_output_dir, exist_ok=True)
output_path = os.path.join(weights_output_dir, 'model_weights_per_month.csv')
weights_df.to_csv(output_path, index=False)
print(f"Weights per month have been saved to {output_path}")

Analysis of correlation between model predictions

In [None]:
import numpy as np
import pandas as pd

# Collapse the bootstrapped predictions of each model into one averaged
# prediction vector by taking the mean over axis 0.
# NOTE(review): assumes axis 0 indexes the bootstrap draws and that the averaged
# result is one-dimensional - confirm against the cell that builds these arrays.
xgb_predictions, lstm_predictions, nn_predictions = (
    np.mean(bootstrap_draws, axis=0)
    for bootstrap_draws in (xgboost_bootstrap_preds, lstm_bootstrap_preds, nn_bootstrap_preds)
)

# One column per model, so that pairwise correlations can be read off directly.
predictions_df = pd.DataFrame(
    dict(XGBoost=xgb_predictions, LSTM=lstm_predictions, NN=nn_predictions)
)

# Pairwise correlation between the models' averaged predictions; values close
# to 1 mean two models produce very similar forecasts.
correlation_matrix = predictions_df.corr()

print("Correlation Matrix:")
print(correlation_matrix)


Variance Explained for ML models

In [None]:
# Forecast months whose saved group-level results are combined in this cell.
prediction_dates = ['2024-01-01', '2024-02-01', '2024-03-01']

# Each month's results are read back from the CSV that the forecasting loop
# saves for that date. Reading from disk, rather than re-appending whatever
# `results_df` happens to be in memory, guarantees that every listed month is
# included exactly once.
# NOTE(review): relies on `output_dir` still pointing at the forecasting output
# folder; several other cells reassign it.
# NOTE(review): 'year_month' is parsed as datetime on the assumption that this
# matches the in-memory results the helper below was written against - confirm.
monthly_results = [
    pd.read_csv(
        f'{output_dir}/forecasts_with_uncertainty_group_level_{date}.csv',
        parse_dates=['year_month'],
    )
    for date in prediction_dates
]

# A single concatenation builds the consolidated table for all months.
all_results_df = pd.concat(monthly_results, ignore_index=True)

# The explained variance is calculated at an aggregated level for each month,
# once per prediction column (LSTM, NN, XGBoost, and Ensemble).
explained_variance_lstm_df = calculate_aggregated_explained_variance_per_month(all_results_df, prediction_column='LSTM Prediction')
explained_variance_nn_df = calculate_aggregated_explained_variance_per_month(all_results_df, prediction_column='NN Prediction')
explained_variance_xgb_df = calculate_aggregated_explained_variance_per_month(all_results_df, prediction_column='XGBoost Prediction')
explained_variance_ensemble_df = calculate_aggregated_explained_variance_per_month(all_results_df, prediction_column='Ensemble Prediction')

explained_variance_by_model = {
    'LSTM': explained_variance_lstm_df,
    'NN': explained_variance_nn_df,
    'XGBoost': explained_variance_xgb_df,
    'Ensemble': explained_variance_ensemble_df,
}

# The results for each model are printed to the console.
for model_name, variance_table in explained_variance_by_model.items():
    print(f"Aggregated Explained Variance per month - {model_name}:")
    print(variance_table)

# The aggregated explained variance per month is saved to one CSV file per model.
for model_name, variance_table in explained_variance_by_model.items():
    variance_table.to_csv(f'{output_dir}/explained_variance_per_month_{model_name}.csv', index=False)


Calculate performance metrics across demographic groups per country of citizenship, and across countries for overall asylum applications submitted

In [None]:
import os
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

# Folder that the result files of this cell are read from and written to; it is
# created up front when it does not exist yet.
output_dir = "Segmented Approach Outputs"
os.makedirs(output_dir, exist_ok=True)

# Forecast dates whose result files are loaded and evaluated below.
prediction_dates = [
    '2024-01-01',
    '2024-02-01',
    '2024-03-01',
]

# A function is defined to calculate key performance metrics: MSE, RMSE, MAE, and MDAE.
# These metrics provide insights into the accuracy and error distribution of the prediction models.
def calculate_metrics(y_true, y_pred):
    """Return the (MSE, RMSE, MAE, MDAE) of predictions against observed values.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, passed together with ``y_true`` to the metric functions.

    Returns
    -------
    tuple
        ``(mse, rmse, mae, mdae)``: mean squared error, its square root, mean
        absolute error and median absolute error.
    """
    squared_error_mean = mean_squared_error(y_true, y_pred)
    absolute_error_mean = mean_absolute_error(y_true, y_pred)
    absolute_error_median = median_absolute_error(y_true, y_pred)
    return (
        squared_error_mean,
        squared_error_mean ** 0.5,  # RMSE is derived from the MSE
        absolute_error_mean,
        absolute_error_median,
    )

# Result-file column that holds each model's forecasts, in reporting order, and
# the names of the four values returned by calculate_metrics.
prediction_columns = {
    'Ensemble': 'Ensemble Prediction',
    'XGBoost': 'XGBoost Prediction',
    'LSTM': 'LSTM Prediction',
    'NN': 'NN Prediction',
}
metric_names = ('MSE', 'RMSE', 'MAE', 'MDAE')

# For every prediction date the saved group-level forecasts are loaded, metrics
# are computed per country and over all countries, and both tables are saved.
for date in prediction_dates:
    results_df = pd.read_csv(f'{output_dir}/forecasts_with_uncertainty_group_level_{date}.csv')

    # True values and predictions are pooled across countries for the overall metrics.
    all_true_values = []
    pooled_predictions = {model: [] for model in prediction_columns}

    # One row of metrics per country, computed across that country's groups.
    metrics_per_country = []
    grouped_results = results_df.groupby('country')

    for country, country_data in grouped_results:
        y_true = country_data['True Values'].values
        all_true_values.extend(y_true)

        country_row = {'Country': country}
        for model, column in prediction_columns.items():
            y_pred = country_data[column].values
            pooled_predictions[model].extend(y_pred)
            # Columns are named '<metric> <model>', e.g. 'RMSE XGBoost'.
            for metric_name, value in zip(metric_names, calculate_metrics(y_true, y_pred)):
                country_row[f'{metric_name} {model}'] = value
        metrics_per_country.append(country_row)

    # Per-country metrics, rounded to 2 decimal places and saved for this month.
    metrics_country_df = pd.DataFrame(metrics_per_country).round(2)
    metrics_country_df.to_csv(f'{output_dir}/performance_metrics_across_countries_{date}.csv', index=False)

    # Overall metrics: one row per model, computed on the pooled values.
    overall_metrics_df = pd.DataFrame(
        [
            (model, *calculate_metrics(all_true_values, pooled_predictions[model]))
            for model in prediction_columns
        ],
        columns=['Model', *metric_names],
    ).round(2)
    overall_metrics_df.to_csv(f'{output_dir}/overall_performance_metrics_{date}.csv', index=False)

    # The results for the current month are displayed in the console.
    print(f"Performance Metrics Across Groups Per Country for {date}:")
    print(metrics_country_df)

    print(f"\nOverall Performance Metrics Across All Countries for {date}:")
    print(overall_metrics_df)


Performance metrics per demographic group across countries of citizenship, and across countries for the grand total of asylum applications

In [None]:
import os
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

# Folder that the result files of this cell are read from and written to; it is
# created automatically when it is missing.
output_dir = "Segmented Approach Outputs"
os.makedirs(output_dir, exist_ok=True)

# Forecast dates for which performance metrics are calculated below.
prediction_dates = [
    '2024-01-01',
    '2024-02-01',
    '2024-03-01',
]

# A function is defined to calculate key performance metrics: MSE, RMSE, MAE, and MDAE.
# The function takes the true values and predicted values as input and returns the calculated metrics.
def calculate_metrics(y_true, y_pred):
    """Return the (MSE, RMSE, MAE, MDAE) of predictions against observed values.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, passed together with ``y_true`` to the metric functions.

    Returns
    -------
    tuple
        ``(mse, rmse, mae, mdae)``: mean squared error, its square root, mean
        absolute error and median absolute error.
    """
    squared_error_mean = mean_squared_error(y_true, y_pred)
    absolute_error_mean = mean_absolute_error(y_true, y_pred)
    absolute_error_median = median_absolute_error(y_true, y_pred)
    return (
        squared_error_mean,
        squared_error_mean ** 0.5,  # RMSE is derived from the MSE
        absolute_error_mean,
        absolute_error_median,
    )

# Result-file column that holds each model's forecasts, in reporting order, and
# the names of the four values returned by calculate_metrics.
prediction_columns = {
    'Ensemble': 'Ensemble Prediction',
    'XGBoost': 'XGBoost Prediction',
    'LSTM': 'LSTM Prediction',
    'NN': 'NN Prediction',
}
metric_names = ('MSE', 'RMSE', 'MAE', 'MDAE')

# For every prediction date the saved group-level forecasts are loaded and
# metrics are computed per demographic group (sex x age group) across all
# countries of citizenship, as well as over all rows together.
# The per-group table goes to its own file, so the per-country file written by
# the preceding cell is no longer overwritten by an identical copy.
for date in prediction_dates:
    results_df = pd.read_csv(f'{output_dir}/forecasts_with_uncertainty_group_level_{date}.csv')

    # True values and predictions are pooled across groups for the overall metrics.
    all_true_values = []
    all_predictions = {model: [] for model in prediction_columns}

    # One row of metrics per demographic group, computed across countries.
    metrics_per_group = []

    for (sex, age_group), group_data in results_df.groupby(['sex', 'age_group']):
        y_true = group_data['True Values'].values
        all_true_values.extend(y_true)

        group_metrics = {'Sex': sex, 'Age Group': age_group}
        for model, column in prediction_columns.items():
            y_pred = group_data[column].values
            all_predictions[model].extend(y_pred)
            # Columns are named '<metric> <model>', e.g. 'RMSE XGBoost'.
            for metric_name, value in zip(metric_names, calculate_metrics(y_true, y_pred)):
                group_metrics[f'{metric_name} {model}'] = value
        metrics_per_group.append(group_metrics)

    # Per-group metrics, rounded to 2 decimal places and saved for this month.
    metrics_group_df = pd.DataFrame(metrics_per_group).round(2)
    metrics_group_df.to_csv(
        f'{output_dir}/performance_metrics_across_demographic_groups_{date}.csv', index=False
    )

    # Overall metrics: one row per model, computed on the pooled values.
    overall_metrics_df = pd.DataFrame(
        [
            (model, *calculate_metrics(all_true_values, all_predictions[model]))
            for model in prediction_columns
        ],
        columns=['Model', *metric_names],
    ).round(2)
    overall_metrics_df.to_csv(f'{output_dir}/overall_performance_metrics_{date}.csv', index=False)

    # The results for the current month are displayed in the console.
    print(f"Performance Metrics Per Demographic Group Across Countries for {date}:")
    print(metrics_group_df)

    print(f"\nOverall Performance Metrics Across All Countries for {date}:")
    print(overall_metrics_df)


Ensemble prediction errors across countries of citizenship

In [None]:
import os
import pandas as pd

# Folder holding the country-level aggregation files; the error tables produced
# here are written to the same folder.
output_dir = "Segmented Approach Outputs"

# Forecast dates for which error calculations are performed.
prediction_dates = ['2024-01-01', '2024-02-01', '2024-03-01']

# The per-month tables are collected in a list and concatenated once at the end
# instead of growing a DataFrame inside the loop.
monthly_error_frames = []

for date in prediction_dates:
    filepath = f'{output_dir}/country_level_aggregation_{date}.csv'
    if not os.path.exists(filepath):
        # A missing month is reported instead of being dropped silently.
        print(f"Warning: {filepath} not found; {date} is left out of the error summary.")
        continue

    data_df = pd.read_csv(filepath)

    # Signed error: positive when the ensemble under-predicts the true value.
    data_df['Ensemble Error'] = data_df['True Values'] - data_df['Ensemble Prediction']

    # The table including the error column is saved under a separate name.
    data_df.to_csv(f'{output_dir}/country_level_aggregation_with_error_{date}.csv', index=False)

    monthly_error_frames.append(data_df)

# Without any input the grouping below would fail with an unrelated KeyError,
# so the actual cause is reported explicitly.
if not monthly_error_frames:
    raise FileNotFoundError(
        f"No country_level_aggregation_<date>.csv files were found in '{output_dir}' "
        f"for the dates {prediction_dates}."
    )

overall_errors = pd.concat(monthly_error_frames, ignore_index=True)

# The errors are summed per country and date, then ordered chronologically and
# by country.
country_errors = overall_errors.groupby(['country', 'year_month'])['Ensemble Error'].sum().reset_index()
country_errors = country_errors.sort_values(by=['year_month', 'country'])

# The sorted errors by country and date are saved to a CSV file.
country_errors.to_csv(f'{output_dir}/errors_by_country_and_date.csv', index=False)

# Net error over all countries and dates. The errors are signed, so over- and
# under-predictions offset each other in this sum.
total_error = overall_errors['Ensemble Error'].sum()
print(f"Total Error across all countries and dates: {total_error}")

# The net error is saved to a separate CSV file.
overall_error_df = pd.DataFrame({'Total Error': [total_error]})
overall_error_df.to_csv(f'{output_dir}/total_error_across_all_countries.csv', index=False)

# The summarised errors by country and date are displayed.
print("Errors by country and date:")
print(country_errors)


Visualising asylum application forecasts with uncertainty intervals

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

# Shared plot styling: Times New Roman as the font family and 12 as the base font size.
plt.rcParams.update({'font.family': 'Times New Roman', 'font.size': 12})

# The actual (observed) data is read from CSV; forecast data is merged onto it later in this cell.
data_agg = pd.read_csv('final_thesis_data.csv')

# Destination folder for this cell's merged output; an already existing folder is left as it is.
output_dir = "Graphs - Country-Sex-AgeGroup Forecasting Output"
os.makedirs(output_dir, exist_ok=True)

# 'year_month' is parsed into datetimes so that it can be matched against the forecast dates
# and used as a time axis.
data_agg = data_agg.assign(year_month=pd.to_datetime(data_agg['year_month']))

# Forecast date -> path of the CSV that holds the group-level forecasts (with uncertainty) for that date.
forecast_files = {
    forecast_date: f'Segmented Approach Outputs/forecasts_with_uncertainty_group_level_{forecast_date}.csv'
    for forecast_date in ('2024-01-01', '2024-02-01', '2024-03-01')
}

# The combined frame starts out as a copy of the actual data only:
# the group identifiers, the month, and the observed number of applications.
all_results_df = data_agg.loc[:, ['country', 'sex', 'age_group', 'year_month', 'asy_applications']].copy()

# Columns on which every forecast file is joined to the actual data.
merge_keys = ['country', 'sex', 'age_group', 'year_month']

# Each available forecast file contributes three date-suffixed columns
# (prediction, lower bound, upper bound) via a left join, so every actual row is kept.
for date, forecast_file in forecast_files.items():
    if not os.path.exists(forecast_file):
        # A missing file is reported and skipped; no columns are added for that date.
        print(f"Forecast file for {date} not found.")
        continue

    # The generic ensemble columns are renamed so that each forecast date keeps its own set.
    column_renames = {
        'Ensemble Prediction': f'Ensemble Prediction_{date}',
        'Ensemble Lower CI': f'Lower CI_{date}',
        'Ensemble Upper CI': f'Upper CI_{date}',
    }
    forecast_df = pd.read_csv(forecast_file).rename(columns=column_renames)

    # Every row of the file is stamped with the forecast date it belongs to.
    forecast_df['year_month'] = pd.to_datetime(date)

    all_results_df = all_results_df.merge(
        forecast_df[merge_keys + list(column_renames.values())],
        on=merge_keys,
        how='left',
    )

# The merged table (actual values plus whatever forecasts were found) is written to the output directory.
merged_data_filepath = os.path.join(output_dir, 'uncertainty_segmented_approach.csv')
all_results_df.to_csv(merged_data_filepath, index=False)
print(f"Merged data saved to {merged_data_filepath}")

# A separate directory is created for the per-country figures.
individual_plots_dir = "Forecasts' Plots with Uncertainty - Segmented Forecasting Approach"
os.makedirs(individual_plots_dir, exist_ok=True)

# The forecast dates are parsed once; they are used to check whether a group has any forecast rows.
forecast_dates = pd.to_datetime(list(forecast_files.keys()))

# Only forecast columns that are actually present in the merged data are used.
# A forecast file that was skipped during merging leaves no columns behind, and
# selecting such a column by a fixed name would raise a KeyError.
prediction_cols = [f'Ensemble Prediction_{d}' for d in forecast_files if f'Ensemble Prediction_{d}' in all_results_df.columns]
lower_ci_cols = [f'Lower CI_{d}' for d in forecast_files if f'Lower CI_{d}' in all_results_df.columns]
upper_ci_cols = [f'Upper CI_{d}' for d in forecast_files if f'Upper CI_{d}' in all_results_df.columns]
has_forecast_cols = bool(prediction_cols and lower_ci_cols and upper_ci_cols)

# One figure per country, with one subplot per (sex, age_group) combination.
for country, country_data in all_results_df.groupby('country'):
    groups = list(country_data.groupby(['sex', 'age_group']))

    # The grid is two columns wide and never smaller than 2x2; further rows are added
    # when a country has more than four groups, so that every group gets its own axes.
    n_rows = max(2, (len(groups) + 1) // 2)
    fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(18, 6 * n_rows), sharex=True, sharey=True)
    axes = axes.flatten()  # Flattened so that axes and groups can be paired one-to-one.

    for ax, ((sex, age_group), group_data) in zip(axes, groups):
        months = group_data['year_month']
        actuals = group_data['asy_applications']

        # Forecasts are drawn only if forecast columns exist and this group has rows on a forecast date.
        forecast_mask = months.isin(forecast_dates)
        show_forecast = has_forecast_cols and forecast_mask.any()
        if show_forecast:
            # The date-specific prediction columns are collapsed into one series by filling
            # across the columns and taking the last one; the band spans the lowest lower
            # bound and the highest upper bound of each row. Computed once, used for both axes.
            prediction = group_data[prediction_cols].ffill(axis=1).bfill(axis=1).iloc[:, -1]
            ci_lower = group_data[lower_ci_cols].min(axis=1)
            ci_upper = group_data[upper_ci_cols].max(axis=1)

        # Main axes: actual values over the whole period, plus the forecast line and band when available.
        ax.plot(months, actuals, label='True Values', linestyle='-', color='blue')
        if show_forecast:
            ax.plot(months, prediction, label='Predictions', linestyle='--', color='red', marker=None)
            ax.fill_between(months, ci_lower, ci_upper, alpha=0.2, color='red', label='95% CI')

        ax.set_title(f'{sex}, {age_group}')
        ax.set_xlabel('Year Month')
        ax.set_ylabel('Asylum Applications')
        ax.legend()
        ax.grid(True)

        # Inset axes: the same series without legend labels, restricted to the forecast months.
        ax_inset = inset_axes(ax, width="30%", height="30%", loc='upper center', borderpad=2)
        ax_inset.plot(months, actuals, linestyle='-', color='blue')
        if show_forecast:
            ax_inset.plot(months, prediction, linestyle='--', color='red', marker=None)
            ax_inset.fill_between(months, ci_lower, ci_upper, alpha=0.5, color='red')

        # The inset window is fixed to December 2023 - March 2024, with one tick per month.
        ax_inset.set_xlim(pd.to_datetime('2023-12-01'), pd.to_datetime('2024-03-31'))
        ax_inset.xaxis.set_major_locator(mdates.MonthLocator())
        ax_inset.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    # Date-label formatting is requested once per figure instead of once per subplot.
    fig.autofmt_xdate()

    # The layout is adjusted so that the subplots do not overlap.
    fig.tight_layout()

    # The figure is saved under the country's name and then closed explicitly, so that
    # figures do not accumulate in memory across countries.
    fig.savefig(f'{individual_plots_dir}/Forecast_Comparison_{country}.png')
    plt.close(fig)

print(f"Plots saved in directory: {individual_plots_dir}")
