 Country-level time series plots for 'push factor' indices

In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The data is loaded from a CSV file and preprocessed by converting the 'year_month' column to datetime format.
# The data is then sorted by 'country' and 'year_month' to ensure proper chronological order.
data = pd.read_csv("final_thesis_data.csv")
data['year_month'] = pd.to_datetime(data['year_month'])
data = data.sort_values(by=['country', 'year_month'])

# The data is aggregated by 'country', 'year_month', 'sex', and 'age_group'.
# This aggregation is performed to sum up all relevant metrics within these groups.
data_agg = data.groupby(['country', 'year_month', 'sex', 'age_group']).sum().reset_index()

# The font for all plots is globally set to Times New Roman to ensure consistency across visualisations.
plt.rcParams['font.family'] = 'Times New Roman'

# Duplicate rows are dropped to ensure no aggregation is mistakenly done on 'sex' and 'age_group'.
# This step isolates the data at the country and time level, removing unnecessary duplications.
data_no_agg = data_agg.drop_duplicates(subset=['country', 'year_month', 'push_factor_index'])

# A directory is created to save the generated plots, ensuring the output path exists.
output_dir = 'GHVT6_Outputs/GDELT PFI Plots'
os.makedirs(output_dir, exist_ok=True)

# A list of variables is defined, each of which will be plotted separately.
variables = ['push_factor_index', 'Conflict', 'Governance', 'Social', 'Economic', 'Political']

# The code loops through each variable to create and save the corresponding plot.
for var in variables:
    # A large figure is created to accommodate multiple subplots in a grid.
    fig = plt.figure(figsize=(20, 16))
    
    # A FacetGrid is used to plot each country in a separate panel, allowing for comparison across countries.
    g = sns.FacetGrid(data_no_agg, col="country", col_wrap=4, height=3.5, aspect=1.5, sharey=False, sharex=True)
    
    # The lineplot is mapped to each facet, plotting the selected variable over time.
    g.map(sns.lineplot, 'year_month', var)
    
    # The axis labels and titles are adjusted for clarity, converting underscores to spaces and capitalising words.
    g.set_axis_labels('Year', var.replace('_', ' ').title())
    g.set_titles('{col_name}')
    
    # The x-axis labels are rotated to 45 degrees for better readability, with alignment adjusted for aesthetics.
    for ax in g.axes.flat:
        ax.tick_params(axis='x', rotation=45)
        for label in ax.get_xticklabels():
            label.set_horizontalalignment('right')
    
    # A main title is set for the entire grid of plots to provide context for the visualisation.
    plt.subplots_adjust(top=0.9)
    g.fig.suptitle(f'{var.replace("_", " ").title()} Event Trends Over Time by Country', fontsize=16)
    
    # A source annotation is added at the bottom of the plot, acknowledging the data source.
    plt.figtext(0.5, 0.01, 'Source: GDELT (2024)', ha='center', fontsize=12, style='italic')
    
    # The layout is adjusted to ensure there is sufficient spacing around the plots and titles.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    
    # The plot is saved as a PNG file in the specified output directory, with the filename reflecting the variable plotted.
    output_path = os.path.join(output_dir, f'{var}_trends.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    
    # The plot is closed to free up memory, especially useful when generating multiple plots in a loop.
    plt.close()

# A confirmation message is printed, indicating that all plots have been saved in the specified directory.
print(f'Plots saved in the directory: {output_dir}')


Plots saved in the directory: GHVT6_Outputs/push_factor_plots


<Figure size 2000x1600 with 0 Axes>

<Figure size 2000x1600 with 0 Axes>

<Figure size 2000x1600 with 0 Axes>

<Figure size 2000x1600 with 0 Axes>

<Figure size 2000x1600 with 0 Axes>

<Figure size 2000x1600 with 0 Axes>