In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Year of the actuals being compared; interpolated into both parquet paths in the data-loading cell.
prediction_year: int = 2022

# Lookup table read from CSV; country_map() matches on its 'country_id' column and returns 'name'.
country_list: pd.DataFrame = pd.read_csv('../data/country_list.csv')

def country_map(country_id: int) -> str:
    """Return the name of the country with the given id.

    Looks ``country_id`` up in the module-level ``country_list`` table and
    returns the ``name`` value of the first matching row.

    Args:
        country_id: Value to match against ``country_list['country_id']``.

    Returns:
        The ``name`` entry of the first row whose ``country_id`` matches.

    Raises:
        KeyError: If no row of ``country_list`` has this ``country_id``.
    """
    matches = country_list.loc[country_list['country_id'] == country_id, 'name']
    if matches.empty:
        # A positional lookup on an empty selection would fail with an opaque
        # ``KeyError: 0``; name the offending id instead.
        raise KeyError(f'country_id {country_id} not found in country_list')
    return matches.iloc[0]

# Smoke check of the lookup: show the name stored for country_id 57.
country_map(57)

# Data

In [None]:
# Load the 'actuals_new' file for the prediction year. The index is moved into
# regular columns, and 'outcome' is renamed to 'ged_sb' so that both frames use
# the same name for the value being compared.
df_new = (
    pd.read_parquet(f'../actuals_new/cm/window=Y{prediction_year}/cm_actuals_{prediction_year}.parquet')
    .reset_index(drop=False)
    .rename(columns={'outcome': 'ged_sb'})
)

# Load the 'actuals_preliminary' file for the same year, index moved into columns as well.
df_old = (
    pd.read_parquet(f'../actuals_preliminary/cm/window=Y{prediction_year}/cm_actuals_{prediction_year}.parquet')
    .reset_index(drop=False)
)

In [None]:
# Preview the first rows of df_new to check the index reset and the 'ged_sb' rename.
df_new.head()

In [None]:
# Merge on 'month_id' and 'country_id' with an outer join, so a row that exists
# in only one frame is kept (its other-side columns become NaN). Column names
# shared by both frames get the '_new' / '_old' suffixes.
merged_df = pd.merge(df_new, df_old, on=['month_id', 'country_id'], how='outer', suffixes=('_new', '_old'))

# Keep rows whose 'ged_sb' values differ. NaN != NaN evaluates to True, so rows
# missing from one frame are kept as well. The explicit .copy() makes
# differences_df an independent frame, so adding columns below does not trigger
# pandas' SettingWithCopyWarning.
differences_df = merged_df[merged_df['ged_sb_new'] != merged_df['ged_sb_old']].copy()

# Signed difference (new minus old) and its magnitude; both are NaN where one side is missing.
differences_df['ged_sb_diff'] = differences_df['ged_sb_new'] - differences_df['ged_sb_old']
differences_df['ged_sb_diff_abs'] = differences_df['ged_sb_diff'].abs()

# Drop the helper 'index' column (presumably produced by reset_index() on a
# frame with an unnamed index). errors='ignore' keeps the cell runnable when no
# such column exists.
differences_df = differences_df.drop(columns=['index'], errors='ignore')

# Sum of the absolute differences; NaN entries are skipped by .sum().
print("Absolute sum of differences: ", differences_df['ged_sb_diff_abs'].sum())

# Display the result
differences_df.head()

# Differences

In [None]:
# Number of distinct country_ids in df_new vs df_old
ids_new = df_new['country_id'].unique()
ids_old = df_old['country_id'].unique()
print(len(ids_new))
print(len(ids_old))

# country_ids that appear in only one of the two frames. Both directions are
# reported, because a one-sided set difference hides ids missing from df_new.
print("country_ids only in df_new:", set(ids_new) - set(ids_old))
print("country_ids only in df_old:", set(ids_old) - set(ids_new))

# Per-country count of non-null values in every other column; the 'month_id'
# column of the result is the number of months available for that country.
df_new_months_per_country = df_new.groupby('country_id').count().reset_index()
df_old_months_per_country = df_old.groupby('country_id').count().reset_index()

# Compare country -> month-count mappings. Comparing the two frames with `==`
# raises ValueError as soon as they have a different number of countries,
# which is exactly the mismatch this check is meant to report.
months_new = dict(zip(df_new_months_per_country['country_id'], df_new_months_per_country['month_id']))
months_old = dict(zip(df_old_months_per_country['country_id'], df_old_months_per_country['month_id']))

print("All the same countries and months are present:")
print(months_new == months_old)

In [None]:
# Huge difference: countries with at least one row whose absolute change is 500 or more
huge_rows = differences_df['ged_sb_diff_abs'].ge(500)
diff5 = differences_df.loc[huge_rows, 'country_id'].unique()
print(f'ids: {diff5}')
print(f'names: {[country_map(country_id) for country_id in diff5]}')

In [None]:
# Big difference: countries with at least one row whose absolute change lies in
# [100, 500), leaving out countries already listed in diff5
abs_change = differences_df['ged_sb_diff_abs']
big_rows = abs_change.ge(100) & abs_change.lt(500)
big_ids = set(differences_df.loc[big_rows, 'country_id'].unique())
diff4 = list(big_ids - set(diff5))
print(f'ids: {diff4}')
print(f'names: {[country_map(country_id) for country_id in diff4]}')

In [None]:
# Mediocre difference: countries with at least one row whose absolute change
# lies in [50, 100), leaving out countries already listed in diff4 or diff5
abs_change = differences_df['ged_sb_diff_abs']
mediocre_rows = abs_change.ge(50) & abs_change.lt(100)
mediocre_ids = set(differences_df.loc[mediocre_rows, 'country_id'].unique())
for higher_tier in (diff4, diff5):
    mediocre_ids = mediocre_ids - set(higher_tier)
diff3 = list(mediocre_ids)
print(f'ids: {diff3}')
print(f'names: {[country_map(country_id) for country_id in diff3]}')

In [None]:
# Small difference: countries with at least one row whose absolute change lies
# in [10, 50), leaving out countries already listed in diff3, diff4 or diff5
abs_change = differences_df['ged_sb_diff_abs']
small_rows = abs_change.ge(10) & abs_change.lt(50)
small_ids = set(differences_df.loc[small_rows, 'country_id'].unique())
for higher_tier in (diff3, diff4, diff5):
    small_ids = small_ids - set(higher_tier)
diff2 = list(small_ids)
print(f'ids: {diff2}')
print(f'names: {[country_map(country_id) for country_id in diff2]}')

In [None]:
# Tiny difference: countries with at least one row whose absolute change is
# below 10, leaving out countries already listed in any of diff2..diff5
tiny_rows = differences_df['ged_sb_diff_abs'].lt(10)
tiny_ids = set(differences_df.loc[tiny_rows, 'country_id'].unique())
for higher_tier in (diff2, diff3, diff4, diff5):
    tiny_ids = tiny_ids - set(higher_tier)
diff1 = list(tiny_ids)
print(f'ids: {diff1}')
print(f'names: {[country_map(country_id) for country_id in diff1]}')

# Plots

In [None]:
# Country whose 'ged_sb' series from the two frames are plotted against each other.
country_id: int = 162

# Select the country's rows once per frame and order them by month, so each
# line is drawn left to right regardless of the row order in the source file.
rows_new = df_new[df_new['country_id'] == country_id].sort_values('month_id')
rows_old = df_old[df_old['country_id'] == country_id].sort_values('month_id')

x_new, y_new = rows_new['month_id'], rows_new['ged_sb']
x_old, y_old = rows_old['month_id'], rows_old['ged_sb']

fig, ax = plt.subplots()
ax.plot(x_new, y_new, label='new')
ax.plot(x_old, y_old, label='old')

ax.set_title(f'ID: {country_id}, Name: {country_map(country_id)}')
ax.set_xlabel('month')
ax.set_ylabel('fatalities')
ax.legend()

fig.tight_layout()
# Render the figure. An argument-less plt.plot() draws nothing and only echoes
# an empty list as the cell output.
plt.show()