In [1]:
import json
import numpy as np
import pandas as pd

In [13]:
# Read the preprocessed primary indicators and index every row by
# (country, primary_index, decile) in a single chained expression.
df = (
    pd.read_csv('../output/primary_data_preprocessed.csv')
    .set_index(['country', 'primary_index', 'decile'])
)

In [14]:
def weighted_geometric_mean(data: list[tuple[pd.Series | pd.DataFrame, float]]):
    # computed in log space for numerical stability
    wsum = 0
    logsum = 0
    for values, weight in data:
        if weight != 0:
            logsum += weight * np.log(values)
            wsum += weight
    
    return np.exp(logsum / wsum)

In [15]:
# Load the EWBI indicator hierarchy. JSON is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's locale default
# (which would mis-decode non-ASCII text on e.g. Windows).
with open('../data/ewbi_indicators.json', encoding='utf-8') as f:
    config = json.load(f)['EWBI']

In [16]:
all_codes = set()
all_secondaries = set()
for prio in config:
    print(prio['name'])
    for component in prio['components']:
        print('\t', component['name'])
        print(f'\t\t {component['name']}: {component['weight']}')
        all_secondaries.add(component['name'])
        for ind in component['indicators']:
            if ind['code']:
                print(f'\t\t\t {ind['code']}: {ind['weight']}')
                all_codes.add(ind['code'])
    print()

Agriculture and Food
	 Nutrition need
		 Nutrition need: 2/3
			 AN-SILC-1: 1
			 AN-EHIS-1: 2
	 Nutrition expense
		 Nutrition expense: 1/3
			 AE-HBS-1: 1
			 AE-HBS-2: 1
			 AE-EHIS-1: 2

Energy and Housing
	 Housing quality
		 Housing quality: 1/3
			 HQ-SILC-1: 2
			 HQ-SILC-2: 2
	 Energy
		 Energy: 1/3
			 HE-SILC-1: 2
			 HE-SILC-2: 1
			 HE-HBS-1: 1
			 HE-HBS-2: 1
	 Housing expense
		 Housing expense: 1/3
			 HH-SILC-1: 1
			 HH-HBS-1: 1
			 HH-HBS-2: 1

Equality
	 Life satisfaction
		 Life satisfaction: 1/4
			 EL-SILC-1: 2
			 EL-EHIS-1: 2
	 Security
		 Security: 1/4
			 ES-SILC-1: 1
			 ES-SILC-2: 1
	 Community
		 Community: 1/4
			 EC-SILC-1: 2
			 EC-SILC-2: 2
			 EC-HBS-1: 1
			 EC-HBS-2: 1
			 EC-EHIS-1: 2
	 Digital Skills
		 Digital Skills: 1/4
			 ED-EHIS-1: 2

Health and Animal Welfare
	 Health condition and impact
		 Health condition and impact: 1/2
			 AH-SILC-1: 2
			 AH-SILC-2: 2
			 AH-SILC-3: 2
			 AH-SILC-4: 2
			 AH-EHIS-1: 2
			 AH-EHIS-2: 2
	 Health cost an

In [17]:
# Cross-check the codes listed in the JSON config against the codes that
# actually occur in the data index, in both directions.
index_codes = df.index.get_level_values('primary_index')
print("Present in json file but not in index:", all_codes.difference(index_codes))
print("Present in index but not in json file:", index_codes.difference(all_codes))

Present in json file but not in index: {'IS-SILC-2', 'IS-SILC-1', 'RU-LFS-1'}
Present in index but not in json file: Index([], dtype='object', name='primary_index')


In [None]:
# Aggregate primary indicators into secondary indicators (one per component),
# per country, as the weighted geometric mean of the available indicators.
secondary = {}
# country -> codes that exist in the data for other countries but not this one
missing = {}
# Codes with data for at least one country. A configured code outside this set
# is absent everywhere, so it is not reported again for every single country.
available_codes = set(df.index.get_level_values('primary_index'))
# separate countries as indicators aren't all available for all countries
for country, cdf in df.groupby('country'):
    cdf = cdf.loc[country]
    for prio in config:
        for component in prio['components']:
            factors = []
            for ind in component['indicators']:
                code = ind['code']
                weight = ind['weight']
                # Blank codes and zero-weight indicators never contribute to
                # the mean, so they must not be reported as missing either.
                if not code or weight == 0:
                    continue
                if code in cdf.index:
                    factors.append((cdf.loc[code], weight))
                elif code in available_codes:
                    missing.setdefault(country, []).append(code)
                    print(f"{country},{code}")
            # Components with no usable indicator for this country are skipped.
            if factors:
                secondary[country, prio['name'], component['name']] = weighted_geometric_mean(factors)

secondary = pd.concat(secondary, names=('country', 'eu_priority', 'secondary_indicator'))

BE,EL-EHIS-1
BE,AH-EHIS-1
BE,AH-EHIS-2
BE,AC-EHIS-1
CH,AN-EHIS-1
CH,AE-HBS-1
CH,AE-HBS-2
CH,AE-EHIS-1
CH,HE-HBS-1
CH,HE-HBS-2
CH,HH-HBS-1
CH,HH-HBS-2
CH,EL-EHIS-1
CH,EC-HBS-1
CH,EC-HBS-2
CH,EC-EHIS-1
CH,ED-EHIS-1
CH,AH-EHIS-1
CH,AH-EHIS-2
CH,AC-HBS-1
CH,AC-HBS-2
CH,AC-EHIS-1
CH,AB-EHIS-1
CH,AB-EHIS-2
CH,AB-EHIS-3
CH,IE-HBS-1
CH,IE-HBS-2
CH,IC-HBS-1
CH,IC-HBS-2
CH,TT-HBS-1
CH,TT-HBS-2
CH,TS-HBS-1
CH,TS-HBS-2
CZ,EC-SILC-2
CZ,IS-SILC-3
CZ,RT-SILC-1
CZ,RT-SILC-2
EA,AN-SILC-1
EA,AN-EHIS-1
EA,AE-HBS-1
EA,AE-HBS-2
EA,AE-EHIS-1
EA,HQ-SILC-1
EA,HQ-SILC-2
EA,HE-SILC-1
EA,HE-SILC-2
EA,HE-HBS-1
EA,HE-HBS-2
EA,HH-SILC-1
EA,HH-HBS-1
EA,HH-HBS-2
EA,EL-SILC-1
EA,EL-EHIS-1
EA,ES-SILC-1
EA,ES-SILC-2
EA,EC-SILC-1
EA,EC-SILC-2
EA,EC-HBS-1
EA,EC-HBS-2
EA,EC-EHIS-1
EA,ED-EHIS-1
EA,AH-SILC-1
EA,AH-SILC-2
EA,AH-SILC-3
EA,AH-SILC-4
EA,AH-EHIS-1
EA,AH-EHIS-2
EA,AC-SILC-1
EA,AC-SILC-2
EA,AC-HBS-1
EA,AC-HBS-2
EA,AC-EHIS-1
EA,AB-EHIS-1
EA,AB-EHIS-2
EA,AB-EHIS-3
EA,IS-SILC-3
EA,IE-HBS-1
EA,IE-HBS-2
EA,IC-SILC-1
EA,

In [28]:
# Persist the secondary indicators for downstream use.
secondary.to_csv(path_or_buf='../output/secondary_indicators.csv')

In [29]:
# Sanity check: every configured component should have been computed for at
# least one country, and nothing outside the configuration should appear.
computed_secondaries = secondary.index.get_level_values('secondary_indicator')
print(all_secondaries.difference(computed_secondaries))
print(computed_secondaries.difference(all_secondaries))

set()
Index([], dtype='object', name='secondary_indicator')


In [31]:
# Aggregate secondary indicators into one score per (country, EU priority),
# using the component weights from the configuration.
priorities = {}
for country, country_secondary in secondary.groupby('country'):
    country_secondary = country_secondary.loc[country]
    for prio in config:
        pname = prio['name']
        # priorities without any computed component for this country are skipped
        if pname not in country_secondary.index:
            continue

        components = country_secondary.loc[pname]
        factors = []
        for comp in prio['components']:
            raw_weight = comp['weight']
            # weights are either plain numbers or fraction strings such as "1/3"
            try:
                weight = float(raw_weight)
            except ValueError:
                num, den = (int(part) for part in raw_weight.split('/'))
                weight = float(num) / den

            if weight != 0 and comp['name'] in components.index:
                factors.append((components.loc[comp['name']], weight))

        if not factors:
            print('Missing', country, pname)
        else:
            priorities[country, pname] = weighted_geometric_mean(factors)

priorities = pd.concat(priorities, names=['country', 'eu_priority'])

In [32]:
# Persist the per-priority scores.
priorities.to_csv(path_or_buf='../output/eu_priorities.csv')

In [33]:
# Combine the EU priorities of each country into the overall EWBI score as an
# equally weighted geometric mean.
ewbi = {}
for country, cdf in priorities.groupby('country'):
    cdf = cdf.loc[country]
    # get_level_values() returns one label per row, so a priority spanning
    # several rows would otherwise be added to the mean once per row: wasted
    # work, and unequal effective weights if priorities differ in row count.
    priority_names = cdf.index.get_level_values('eu_priority').unique()
    factors = [(cdf.loc[prio], 1) for prio in priority_names]
    ewbi[country] = weighted_geometric_mean(factors)

ewbi = pd.concat(ewbi, names=['country'])

In [34]:
# Persist the final EWBI scores.
ewbi.to_csv(path_or_buf='../output/ewbi_results.csv')