In [1]:
import json
import numpy as np
import pandas as pd
from scipy.stats.mstats import gmean

In [2]:
df = pd.read_csv('../output/primary_data_preprocessed.csv')
df = df.set_index(['country', 'primary_index', 'decile'])

In [3]:
def weighted_geometric_mean(data: list[tuple[pd.Series | pd.DataFrame, float]]):
    # computed in log space for numerical stability
    wsum = 0
    logsum = 0
    for values, weight in data:
        if weight != 0:
            logsum += weight * np.log(values)
            wsum += weight
    
    return np.exp(logsum / wsum)

In [4]:
with open('../data/ewbi_indicators.json') as f:
    config = json.load(f)['EWBI']

In [6]:
all_codes = set()
all_secondaries = set()
for prio in config:
    print(prio['name'])
    for component in prio['components']:
        print('\t', component['name'])
        print(f'\t\t {component['name']}: {component['weight']}')
        all_secondaries.add(component['name'])
        for ind in component['indicators']:
            if ind['code']:
                print(f'\t\t\t {ind['code']}: {ind['weight']}')
                all_codes.add(ind['code'])
    print()

Agriculture and Food
	 Nutrition need
		 Nutrition need: 2/3
			 AN-SILC-1: 1
			 AN-EHIS-1: 2
	 Nutrition expense
		 Nutrition expense: 1/3
			 AE-HBS-1: 1
			 AE-HBS-2: 1
			 AE-EHIS-1: 2

Energy and Housing
	 Housing quality
		 Housing quality: 1/3
			 HQ-SILC-1: 2
			 HQ-SILC-2: 2
	 Energy
		 Energy: 1/3
			 HE-SILC-1: 2
			 HE-SILC-2: 1
	 Housing expense
		 Housing expense: 1/3
			 HH-SILC-1: 1
			 HH-HBS-1: 1
			 HH-HBS-2: 1
			 HH-HBS-3: 1
			 HH-HBS-4: 1

Equality
	 Life satisfaction
		 Life satisfaction: 1/4
			 EL-SILC-1: 2
			 EL-EHIS-1: 2
	 Security
		 Security: 1/4
			 ES-SILC-1: 1
			 ES-SILC-2: 1
	 Community
		 Community: 1/4
			 EC-SILC-1: 2
			 EC-SILC-2: 2
			 EC-HBS-1: 1
			 EC-HBS-2: 1
			 EC-EHIS-1: 2
	 Digital Skills
		 Digital Skills: 1/4
			 ED-EHIS-1: 2

Health and Animal Welfare
	 Health condition and impact
		 Health condition and impact: 1/2
			 AH-SILC-1: 2
			 AH-SILC-2: 2
			 AH-SILC-3: 2
			 AH-SILC-4: 2
			 AH-EHIS-1: 2
			 AH-EHIS-2: 2
	 Health cost an

In [7]:
print("Present in json file but not in index:", all_codes.difference(df.index.get_level_values('primary_index')))
print("Present in index but not in json file:", df.index.get_level_values('primary_index').difference(all_codes))

Present in json file but not in index: {'RU-LFS-1', 'IS-SILC-1', 'IS-SILC-2'}
Present in index but not in json file: Index([], dtype='object', name='primary_index')


In [8]:
# shouldn't be neeed in the future as preprocess shouldn't contain any NaN in the future
# but for now with some bugs it does du to some indicators being almost always 0
# in the meantime, we replace by a default value of 0.5
df = df.fillna(0.5)

In [None]:
secondary = {}
# separate countries as indicators aren't all available for all countries
for country, cdf in df.groupby('country'):
    #print("EL-SILC-1" in cdf.loc[country].index)
    #print(cdf.loc[country, 'EL-SILC-1'])
    cdf = cdf.loc[country]
    for prio in config:
        for component in prio['components']:
            logsum = 0
            wsum = 0
            for ind in component['indicators']:
                code = ind['code']
                weight = ind['weight']
                if code in cdf.index and weight != 0:
                    logsum += np.log(cdf.loc[code])
                    wsum += weight
            if wsum == 0:
                print('Missing', country, component['name'])
            else:
                secondary[country, prio['name'], component['name']] = np.exp(logsum / wsum)

secondary = pd.concat(secondary, names=('country', 'eu_priority', 'secondary_indicator'))

Missing CH Nutrition expense
Missing CH Digital Skills
Missing CH Accidents and addictive behaviour
Missing CH Education expense
Missing CZ Education
Missing EA Nutrition need
Missing EA Nutrition expense
Missing EA Housing quality
Missing EA Energy
Missing EA Housing expense
Missing EA Life satisfaction
Missing EA Security
Missing EA Community
Missing EA Digital Skills
Missing EA Health condition and impact
Missing EA Health cost and medical care
Missing EA Accidents and addictive behaviour
Missing EA Education
Missing EA Education expense
Missing EA Leisure and culture
Missing EA Type of job and market participation
Missing EA Unemployment
Missing EA Tourism
Missing ES Digital Skills
Missing ES Accidents and addictive behaviour
Missing EU Nutrition need
Missing EU Nutrition expense
Missing EU Housing quality
Missing EU Energy
Missing EU Housing expense
Missing EU Life satisfaction
Missing EU Security
Missing EU Community
Missing EU Digital Skills
Missing EU Health condition and impac

In [42]:
secondary.to_csv('../output/secondary_indicators.csv')

In [43]:
print(all_secondaries.difference(secondary.index.get_level_values('secondary_indicator')))
print(secondary.index.get_level_values('secondary_indicator').difference(all_secondaries))

set()
Index([], dtype='object', name='secondary_indicator')


In [46]:
secondary

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
country,eu_priority,secondary_indicator,decile,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
AT,Agriculture and Food,Nutrition need,1,0.580133,0.596725,0.572088,0.619895,0.647882,0.639096,0.623293,0.602288,0.575471,0.594069,0.645856,0.607507,0.551033,0.541888,0.538362,0.501497,0.588987,0.519206,0.597954,0.581347
AT,Agriculture and Food,Nutrition need,2,0.554855,0.505512,0.541583,0.524952,0.650762,0.575351,0.598363,0.515072,0.527063,0.536836,0.585900,0.587624,0.515956,0.540868,0.545272,0.560219,0.542413,0.551500,0.653489,0.615754
AT,Agriculture and Food,Nutrition need,3,0.484119,0.452037,0.500104,0.492230,0.513868,0.542108,0.538776,0.479879,0.454144,0.492706,0.514576,0.520571,0.529782,0.465616,0.453198,0.451712,0.491577,0.450536,0.619951,0.572415
AT,Agriculture and Food,Nutrition need,4,0.492833,0.447097,0.443149,0.480092,0.554084,0.500182,0.489129,0.485833,0.450664,0.505763,0.419016,0.535245,0.528114,0.540063,0.418749,0.419094,0.472274,0.582813,0.568825,0.551214
AT,Agriculture and Food,Nutrition need,5,0.483813,0.473257,0.477752,0.449609,0.586639,0.557028,0.486399,0.365106,0.442628,0.433879,0.509581,0.524551,0.403977,0.537953,0.432808,0.392374,0.465880,0.420830,0.597095,0.458470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UK,Sustainable Transport and Tourism,Tourism,6,0.724993,0.724081,0.744643,0.747607,0.730863,0.719097,0.725576,0.695329,0.687180,0.695287,0.685899,0.876431,0.889659,0.899344,0.894913,0.865060,0.695756,0.709428,0.714522,0.720596
UK,Sustainable Transport and Tourism,Tourism,7,0.745031,0.740952,0.741053,0.745918,0.708576,0.718539,0.714711,0.705579,0.690724,0.690448,0.684364,0.869476,0.874886,0.918575,0.893410,0.895256,0.702166,0.719231,0.728711,0.727596
UK,Sustainable Transport and Tourism,Tourism,8,0.760814,0.758836,0.748799,0.752156,0.742058,0.727514,0.729009,0.717453,0.719117,0.702047,0.714489,0.891014,0.908002,0.915363,0.910057,0.889404,0.895007,0.894502,0.903895,0.855655
UK,Sustainable Transport and Tourism,Tourism,9,0.769893,0.764236,0.766855,0.764215,0.749537,0.749556,0.735785,0.737055,0.712399,0.720091,0.709987,0.910451,0.884495,0.949137,0.912472,0.913598,0.904478,0.907052,0.899952,0.878517


In [57]:
priorities = {}
for country, cdf in secondary.groupby('country'):
    cdf = cdf.loc[country]
    for prio in config:
        pname = prio['name']
        if pname in cdf.index:
            cpdf = cdf.loc[pname]
            logsum = 0
            wsum = 0
            for c in prio['components']:
                name = c['name']
                weight = c['weight']
                try:
                    weight = float(weight)
                except ValueError:
                    numerator, denominator = map(int, weight.split('/'))
                    weight = float(numerator) / denominator

                if name in cpdf.index and weight != 0:
                    logsum += np.log(cpdf.loc[name])
                    wsum += weight

            if wsum == 0:
                print('Missing', country, pname)
            else:
                priorities[country, pname] = np.exp(logsum / wsum)

priorities = pd.concat(priorities, names=['country', 'eu_priority'])

In [60]:
priorities.to_csv('../output/eu_priorities.csv')

In [68]:
ewbi = {}
for country, cdf in priorities.groupby('country'):
    cdf = cdf.loc[country]
    factors = [(cdf.loc[prio], 1)  for prio in cdf.index.get_level_values('eu_priority')]
    ewbi[country] = weighted_geometric_mean(factors)
    
ewbi = pd.concat(ewbi, names=['country'])

In [71]:
ewbi.to_csv('../output/ewbi_results.csv')