Segmented Forecasting Approach - Trivial Model Analysis for Asylum Applications

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

# The raw asylum-application records are loaded and put into chronological
# order per country. 'year_month' is parsed to datetime so that the .dt
# accessors and date comparisons used below work.
data = pd.read_csv("final_thesis_data.csv")
data['year_month'] = pd.to_datetime(data['year_month'])
data = data.sort_values(by=['country', 'year_month'])

# The year and month are extracted from the 'year_month' column for later use.
data['year'] = data['year_month'].dt.year
data['month'] = data['year_month'].dt.month

# Data is aggregated to one row per (country, year_month, sex, age_group).
# numeric_only=True restricts the sum to numeric columns. Without it, recent
# pandas versions also try to "sum" any non-numeric column that is not a group
# key, which concatenates strings or raises for dtypes that cannot be summed.
data_agg = (
    data
    .groupby(['country', 'year_month', 'sex', 'age_group'])
    .sum(numeric_only=True)
    .reset_index()
)

# The group-wise sum above also added up the helper 'year' and 'month'
# columns, so both are re-derived from 'year_month' on the aggregated frame.
data_agg['month'] = data_agg['year_month'].dt.month
data_agg['year'] = data_agg['year_month'].dt.year

# Trivial (seasonal-mean) baseline, evaluated per demographic segment.
#
# For every (country, sex, age_group, calendar month) the forecast is the mean
# of the observed applications in that calendar month over the 36 months that
# immediately precede the first forecast month. The history is restricted to
# dates strictly before the forecast horizon on purpose: if the realised value
# of a forecast month enters its own average, the baseline is scored on data it
# has already seen, and a "year >= max_year - 3" window additionally spans four
# calendar years instead of three.
forecast_dates = ['2024-01-01', '2024-02-01', '2024-03-01']

forecast_start = pd.to_datetime(forecast_dates).min()
history_start = forecast_start - pd.DateOffset(years=3)
in_history = (
    (data_agg['year_month'] >= history_start)
    & (data_agg['year_month'] < forecast_start)
)

segment_keys = ['country', 'sex', 'age_group', 'month']
monthly_means_3_years = (
    data_agg.loc[in_history]
    .groupby(segment_keys)['asy_applications']
    .mean()
    .rename('trivial_prediction')
    .reset_index()
)

# The trivial predictions are merged back into the aggregated data. With a left
# merge every row of data_agg is kept; segments that have no observation inside
# the history window receive NaN as their prediction.
data_agg = pd.merge(data_agg, monthly_means_3_years, on=segment_keys, how='left')

# Lists are initialised to store the results for each forecast date.
forecast_metrics = []
all_group_forecasts = []
monthly_aggregates = []

# Loop through each forecast date.
for date in forecast_dates:
    # A copy is taken so that adding 'forecast_month' below writes to an
    # independent frame instead of a slice of data_agg.
    forecast_data = data_agg[data_agg['year_month'] == pd.Timestamp(date)].copy()

    # The sklearn metrics reject NaN, so only segments that actually have a
    # baseline prediction can be scored.
    scored = forecast_data.dropna(subset=['trivial_prediction'])
    n_without_baseline = len(forecast_data) - len(scored)
    if n_without_baseline:
        print(f"{date}: {n_without_baseline} segment(s) have no history in the "
              f"3-year window and are excluded from the metrics.")

    # Months without any scorable row are skipped entirely.
    if scored.empty:
        continue

    y_true = scored['asy_applications']
    y_pred = scored['trivial_prediction']

    # Performance metrics for the Trivial Model are calculated.
    mse_trivial = mean_squared_error(y_true, y_pred)
    rmse_trivial = np.sqrt(mse_trivial)
    mae_trivial = mean_absolute_error(y_true, y_pred)
    mdae_trivial = median_absolute_error(y_true, y_pred)

    # Share of variance explained: 1 - MSE / Var(y_true). It is undefined when
    # all true values are identical, so NaN is stored instead of dividing by 0.
    variance_true = np.var(y_true)
    if variance_true > 0:
        explained_variance = 1 - (mse_trivial / variance_true)
    else:
        explained_variance = np.nan

    # Metrics for the trivial model are stored.
    forecast_metrics.append({
        'forecast_month': date,
        'MSE_Trivial_Model': mse_trivial,
        'RMSE_Trivial_Model': rmse_trivial,
        'MAE_Trivial_Model': mae_trivial,
        'MDAE_Trivial_Model': mdae_trivial,
        'Explained Variance': explained_variance
    })

    # The forecasted month is added, and group-level forecasts are stored.
    # All segments are kept here; a missing baseline shows up as NaN.
    forecast_data['forecast_month'] = date
    group_forecasts = forecast_data[['country', 'sex', 'age_group', 'forecast_month', 'year_month', 'asy_applications', 'trivial_prediction']]
    all_group_forecasts.append(group_forecasts)

    # Forecasts and true values are totalled per month over the scored
    # segments only, so both totals cover exactly the same rows.
    monthly_sum = scored.groupby('year_month')[['asy_applications', 'trivial_prediction']].sum().reset_index()
    monthly_sum['forecast_month'] = date
    monthly_aggregates.append(monthly_sum)

    # The group-level forecasts for the current month are printed.
    print(f"Forecasts for {date}:")
    print(group_forecasts)

# The per-month results are combined into single DataFrames. pd.concat raises
# a ValueError on an empty list, which happens when none of the forecast months
# had data; an empty frame is used in that case so the export still completes.
all_group_forecasts_df = pd.concat(all_group_forecasts) if all_group_forecasts else pd.DataFrame()
monthly_aggregates_df = pd.concat(monthly_aggregates) if monthly_aggregates else pd.DataFrame()
forecast_metrics_df = pd.DataFrame(forecast_metrics)

# The results are saved in the specified directory, which is created on demand.
output_dir = "GHVT6_Outputs/Trivial Models Outputs"
os.makedirs(output_dir, exist_ok=True)

# One Excel file each for the metrics, the group-level forecasts, and the
# monthly totals of forecasts versus true values.
forecast_metrics_df.to_excel(os.path.join(output_dir, 'metrics_per_forecasted_month_trivial_segmented.xlsx'), index=False)
all_group_forecasts_df.to_excel(os.path.join(output_dir, 'group_forecasts_trivial_segmented.xlsx'), index=False)
monthly_aggregates_df.to_excel(os.path.join(output_dir, 'monthly_aggregates_trivial_vs_true_segmented.xlsx'), index=False)

print("Metrics Per Forecasted Month:")
print(forecast_metrics_df)


Aggregated Forecasting Approach - Trivial Model Analysis for Asylum Applications

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

# The dataset is loaded and preprocessed: 'year_month' is parsed to datetime
# and the rows are put into chronological order per country.
data = pd.read_csv("final_thesis_data.csv")
data['year_month'] = pd.to_datetime(data['year_month'])
data = data.sort_values(by=['country', 'year_month'])

# Year and month are extracted from the 'year_month' column for easier aggregation and analysis
data['year'] = data['year_month'].dt.year
data['month'] = data['year_month'].dt.month

# The data is aggregated to one row per (country, year_month), summing across
# all demographic groups. numeric_only=True restricts the sum to numeric
# columns: the demographic columns are no longer group keys here, and without
# the flag recent pandas versions would try to "sum" any non-numeric column
# (concatenating strings, or raising for dtypes that cannot be summed).
data_agg = (
    data
    .groupby(['country', 'year_month'])
    .sum(numeric_only=True)
    .reset_index()
)

# The group-wise sum above also added up the helper 'year' and 'month'
# columns, so both are re-derived from 'year_month' on the aggregated frame.
data_agg['month'] = data_agg['year_month'].dt.month
data_agg['year'] = data_agg['year_month'].dt.year

# Trivial (seasonal-mean) baseline, evaluated per country.
#
# For every (country, calendar month) the forecast is the mean of the observed
# applications in that calendar month over the 36 months that immediately
# precede the first forecast month. The history is restricted to dates strictly
# before the forecast horizon on purpose: if the realised value of a forecast
# month enters its own average, the baseline is scored on data it has already
# seen, and a "year >= max_year - 3" window additionally spans four calendar
# years instead of three.
forecast_dates = ['2024-01-01', '2024-02-01', '2024-03-01']

forecast_start = pd.to_datetime(forecast_dates).min()
history_start = forecast_start - pd.DateOffset(years=3)
in_history = (
    (data_agg['year_month'] >= history_start)
    & (data_agg['year_month'] < forecast_start)
)

country_keys = ['country', 'month']
monthly_means_3_years = (
    data_agg.loc[in_history]
    .groupby(country_keys)['asy_applications']
    .mean()
    .rename('trivial_prediction')
    .reset_index()
)

# The trivial model predictions are merged back into the aggregated data. With
# a left merge every row of data_agg is kept; countries that have no
# observation inside the history window receive NaN as their prediction.
data_agg = pd.merge(data_agg, monthly_means_3_years, on=country_keys, how='left')

# Lists are initialised to store the results for each forecast date
forecast_metrics = []
all_group_forecasts = []
monthly_aggregates = []

# The loop iterates over each forecast date to calculate the performance metrics for the trivial model
for date in forecast_dates:
    # A copy is taken so that adding 'forecast_month' below writes to an
    # independent frame instead of a slice of data_agg
    forecast_data = data_agg[data_agg['year_month'] == pd.Timestamp(date)].copy()

    # The sklearn metrics reject NaN, so only countries that actually have a
    # baseline prediction can be scored
    scored = forecast_data.dropna(subset=['trivial_prediction'])
    n_without_baseline = len(forecast_data) - len(scored)
    if n_without_baseline:
        print(f"{date}: {n_without_baseline} country row(s) have no history in the "
              f"3-year window and are excluded from the metrics.")

    # Months without any scorable row are skipped entirely
    if scored.empty:
        continue

    y_true = scored['asy_applications']
    y_pred = scored['trivial_prediction']

    # Performance metrics for the Trivial Model are calculated
    mse_trivial = mean_squared_error(y_true, y_pred)
    rmse_trivial = np.sqrt(mse_trivial)
    mae_trivial = mean_absolute_error(y_true, y_pred)
    mdae_trivial = median_absolute_error(y_true, y_pred)

    # Share of variance explained: 1 - MSE / Var(y_true). It is undefined when
    # all true values are identical, so NaN is stored instead of dividing by 0
    variance_true = np.var(y_true)
    if variance_true > 0:
        explained_variance = 1 - (mse_trivial / variance_true)
    else:
        explained_variance = np.nan

    # Metrics for the Trivial Model are stored in a list
    forecast_metrics.append({
        'forecast_month': date,
        'MSE_Trivial_Model': mse_trivial,
        'RMSE_Trivial_Model': rmse_trivial,
        'MAE_Trivial_Model': mae_trivial,
        'MDAE_Trivial_Model': mdae_trivial,
        'Explained Variance': explained_variance
    })

    # A column for the forecasted month is added to the data
    forecast_data['forecast_month'] = date

    # The country-level forecasts are stored for further analysis. All
    # countries are kept here; a missing baseline shows up as NaN
    group_forecasts = forecast_data[['country', 'forecast_month', 'year_month', 'asy_applications', 'trivial_prediction']]
    all_group_forecasts.append(group_forecasts)

    # Forecasts and true values are totalled per month over the scored
    # countries only, so both totals cover exactly the same rows
    monthly_sum = scored.groupby('year_month')[['asy_applications', 'trivial_prediction']].sum().reset_index()
    monthly_sum['forecast_month'] = date
    monthly_aggregates.append(monthly_sum)

    # The group-level forecasts for the current month are printed
    print(f"Forecasts for {date}:")
    print(group_forecasts)

# The per-month results are combined into single DataFrames. pd.concat raises
# a ValueError on an empty list, which happens when none of the forecast months
# had data; an empty frame is used in that case so the export still completes
all_group_forecasts_df = pd.concat(all_group_forecasts) if all_group_forecasts else pd.DataFrame()
monthly_aggregates_df = pd.concat(monthly_aggregates) if monthly_aggregates else pd.DataFrame()
forecast_metrics_df = pd.DataFrame(forecast_metrics)

# The output directory is created on demand so that this cell also works when
# it is run on its own; to_excel does not create missing directories
output_dir = "GHVT6_Outputs/Trivial Models Outputs"
os.makedirs(output_dir, exist_ok=True)

# One Excel file each for the metrics, the country-level forecasts, and the
# monthly totals of forecasts versus true values
forecast_metrics_df.to_excel(os.path.join(output_dir, 'metrics_per_forecasted_month_trivial_aggregated.xlsx'), index=False)
all_group_forecasts_df.to_excel(os.path.join(output_dir, 'group_forecasts_trivial_aggregated.xlsx'), index=False)
monthly_aggregates_df.to_excel(os.path.join(output_dir, 'monthly_aggregates_trivial_vs_true_aggregated.xlsx'), index=False)

print("Metrics Per Forecasted Month:")
print(forecast_metrics_df)
