Evaluating Time Series Stationarity in Asylum Applications Across Demographic Groups

In [5]:
import os
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import tensorflow as tf
import random
from sklearn.preprocessing import RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import explained_variance_score, mean_squared_error, mean_absolute_error, median_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit, train_test_split
import xgboost as xgb
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LSTM, Attention, Input, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
import matplotlib.pyplot as plt

# One fixed seed is pushed into every random number generator used by the
# notebook (NumPy, TensorFlow, Python's `random`) so that reruns are repeatable.
random_seed = 42
for _seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
    _seed_fn(random_seed)

# TensorFlow 2.x: the seed is also applied through the Keras utility and
# deterministic op execution is switched on.
tf.keras.utils.set_random_seed(random_seed)
tf.config.experimental.enable_op_determinism()

# Folder that receives every result file and figure; created on first run.
output_dir = "GHVT6_Outputs"
os.makedirs(output_dir, exist_ok=True)

# Load the raw CSV, parse 'year_month' into real datetimes and order the rows
# by country and then by month so each country's records are chronological.
data = (
    pd.read_csv("final_thesis_data.csv")
    .assign(year_month=lambda frame: pd.to_datetime(frame['year_month']))
    .sort_values(by=['country', 'year_month'])
)

# Collapse the data to one row per (country, year_month, sex, age_group),
# summing the remaining columns within each of those groups.
data_agg = (
    data
    .groupby(['country', 'year_month', 'sex', 'age_group'])
    .sum()
    .reset_index()
)

# A function is defined to perform the Augmented Dickey-Fuller (ADF) test on a given time series.
# The ADF test is used to determine whether a time series is stationary.
# The function returns the results of the test as a dictionary, including the test statistic, p-value, number of lags used, and other relevant metrics.
def adf_test(series, title='ADF Test', alpha=0.05):
    """Run the Augmented Dickey-Fuller (ADF) test on a single time series.

    Missing values are dropped before testing and the lag length is chosen
    by `adfuller` itself via ``autolag='AIC'``.

    Parameters
    ----------
    series : pandas.Series
        Observations in chronological order; NaNs are removed first.
    title : str, optional
        Currently unused; kept so existing calls that pass it keep working.
    alpha : float, optional
        Significance level. The series is flagged as stationary when the
        p-value is strictly below this value. Defaults to 0.05.

    Returns
    -------
    dict
        Keys: 'Test Statistic', 'p-value', 'Number of Lags Used',
        'Number of Observations Used', 'Critical Values' and 'Stationary'.
        When the series is empty or constant after dropping NaNs the test
        is not run: numeric entries are NaN, 'Critical Values' is an empty
        dict and 'Stationary' is None (undetermined).
    """
    observations = series.dropna()

    # An empty or constant series carries no information for a unit-root
    # test and adfuller rejects such input (recent statsmodels versions
    # raise ValueError). Return an explicit "undetermined" row instead, so a
    # single degenerate group does not abort a whole groupby().apply() run.
    if observations.empty or observations.nunique() <= 1:
        undetermined = float('nan')
        return {
            'Test Statistic': undetermined,
            'p-value': undetermined,
            'Number of Lags Used': undetermined,
            'Number of Observations Used': undetermined,
            'Critical Values': {},
            'Stationary': None
        }

    result = adfuller(observations, autolag='AIC')
    p_value = result[1]
    return {
        'Test Statistic': result[0],
        'p-value': p_value,
        'Number of Lags Used': result[2],
        'Number of Observations Used': result[3],
        'Critical Values': result[4],
        'Stationary': p_value < alpha
    }

# The ADF test is applied to each group of data, segmented by 'country', 'sex', and 'age_group'.
# Only the 'asy_applications' column is selected before apply(): letting apply()
# operate on the grouping columns as well is deprecated in recent pandas releases
# (presumably the source of the warning this cell used to emit), and the test
# needs nothing but that one column anyway.
# Because adf_test returns a dict, the result is a Series holding one dict per group,
# indexed by (country, sex, age_group).
adf_results = data_agg.groupby(['country', 'sex', 'age_group'])[['asy_applications']].apply(
    lambda group: adf_test(group['asy_applications'])
)

# The results of the ADF tests are converted into a DataFrame for easier analysis and interpretation.
# Each row in the DataFrame corresponds to a group, with the ADF test results presented as columns.
adf_results_df = pd.DataFrame(adf_results.tolist(), index=adf_results.index)

# The ADF test results are printed to the console for immediate inspection.
print(adf_results_df)

# The ADF test results are saved to a CSV file in the previously defined output directory.
# This allows for the results to be preserved and referenced later, outside of the runtime environment.
adf_results_df.to_csv(f'{output_dir}/adf_test_results.csv')


                              Test Statistic   p-value  Number of Lags Used  \
country     sex    age_group                                                  
Afghanistan female adult           -2.443236  0.129903                    6   
                   minor           -2.464185  0.124433                    8   
            male   adult           -2.025220  0.275626                    7   
                   minor           -2.524094  0.109722                    6   
Albania     female adult           -1.900248  0.331944                    4   
...                                      ...       ...                  ...   
Türkiye     male   minor           -3.484571  0.008392                    0   
Yemen       female adult           -1.174423  0.684500                    7   
                   minor           -1.226084  0.662276                    8   
            male   adult            1.506696  0.997549                   14   
                   minor           -4.504221  0.0001

  adf_results = data_agg.groupby(['country', 'sex', 'age_group']).apply(


In [6]:
import os
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller

# The output directory is set where all resulting files, including CSVs, will be saved.
output_dir = "GHVT6_Outputs"
os.makedirs(output_dir, exist_ok=True)

# Significance level below which a series is classified as stationary.
significance_level = 0.05

# The data is loaded from a CSV file, 'year_month' is converted to datetime, and the
# rows are aggregated by 'country', 'year_month', 'sex', and 'age_group' (summing
# the remaining columns within each group).
data_agg = (
    pd.read_csv("final_thesis_data.csv")
    .assign(year_month=lambda frame: pd.to_datetime(frame['year_month']))
    .groupby(['country', 'year_month', 'sex', 'age_group'])
    .sum()
    .reset_index()
)

# One dictionary of ADF results is collected per (country, sex, age_group) series.
adf_results = []

# The ADF test is performed on the asylum application series within each group.
# The aggregation above leaves the rows ordered by its group keys, so each
# group's rows are already in 'year_month' order when they reach the test.
for (country, sex, age_group), group_data in data_agg.groupby(['country', 'sex', 'age_group']):
    # Missing values are dropped first, matching adf_test() in the previous cell.
    series = group_data['asy_applications'].dropna()
    adf_result = adfuller(series, autolag='AIC')

    adf_results.append({
        'country': country,
        'sex': sex,
        'age_group': age_group,
        'ADF Statistic': adf_result[0],
        'p-value': adf_result[1],
        'Stationary': adf_result[1] < significance_level
    })

# The list of ADF results is converted into a DataFrame for easier manipulation and analysis.
# The columns are named explicitly so the frame keeps its layout even when no
# group was found and the list is empty.
adf_results_df = pd.DataFrame(
    adf_results,
    columns=['country', 'sex', 'age_group', 'ADF Statistic', 'p-value', 'Stationary']
)

# Boolean mask reused below; indexing with the mask and its negation replaces
# the element-wise comparisons against True and False.
is_stationary = adf_results_df['Stationary'].astype(bool)

# A count of stationary and non-stationary time series is performed.
stationary_counts = adf_results_df['Stationary'].value_counts()

# A summary of the stationarity results is printed to the console.
# This provides a quick overview of how many time series were found to be stationary or non-stationary.
print("\nSummary of Stationarity:")
print(f"Stationary: {stationary_counts.get(True, 0)}")
print(f"Non-stationary: {stationary_counts.get(False, 0)}")

# The non-stationary time series are listed and printed for immediate inspection.
non_stationary_series = adf_results_df[~is_stationary]
print("\nNon-stationary series:")
print(non_stationary_series)

# The details of the non-stationary series are saved to a CSV file in the output directory.
# This file serves as a record of which time series require further attention or differencing.
non_stationary_series.to_csv(f'{output_dir}/non_stationary_series.csv', index=False)

# The stationary time series are listed and printed for review.
stationary_series = adf_results_df[is_stationary]
print("\nStationary series:")
print(stationary_series)

# The details of the stationary series are saved to a CSV file in the output directory.
# This allows for future reference and analysis of series that do not require differencing.
stationary_series.to_csv(f'{output_dir}/stationary_series.csv', index=False)



Summary of Stationarity:
Stationary: 25
Non-stationary: 47

Non-stationary series:
                             country     sex age_group  ADF Statistic  \
0                        Afghanistan  female     adult      -2.443236   
1                        Afghanistan  female     minor      -2.464185   
2                        Afghanistan    male     adult      -2.025220   
3                        Afghanistan    male     minor      -2.524094   
4                            Albania  female     adult      -1.900248   
5                            Albania  female     minor      -1.788712   
6                            Albania    male     adult      -1.859991   
7                            Albania    male     minor      -1.673132   
11                        Bangladesh    male     minor      -2.209940   
12                          Cameroon  female     adult      -1.745368   
13                          Cameroon  female     minor      -1.996416   
14                          Cameroon    