## Computation of EWBI and wellbeing sub-indicators

In [1]:
import pandas as pd

In [6]:
# Load the long-format indicator table (columns shown below: year, country, decile,
# primary_index, value, database, quintile).
# NOTE(review): the displayed frame also has an 'Unnamed: 0' column, presumably the index
# that was written to the CSV; consider index_col=0 if it is not needed.
df = pd.read_csv('../data/2025-06-05_df_final_EWBI.csv')
df

Unnamed: 0,year,country,decile,primary_index,value,database,quintile
0,2013.0,AT,1.0,EL-SILC-1,0.0645733994468491,EU-SILC,
1,2013.0,AT,2.0,EL-SILC-1,0.0607012715109848,EU-SILC,
2,2013.0,AT,3.0,EL-SILC-1,0.0472792972731356,EU-SILC,
3,2013.0,AT,4.0,EL-SILC-1,0.0399880953587883,EU-SILC,
4,2013.0,AT,5.0,EL-SILC-1,0.0313219193700967,EU-SILC,
...,...,...,...,...,...,...,...
128910,2014.0,RS,,TT-SILC-1,0.002,EU-SILC,5.0
128911,2014.0,SE,,TT-SILC-1,0.006,EU-SILC,5.0
128912,2014.0,SI,,TT-SILC-1,0.003,EU-SILC,5.0
128913,2014.0,SK,,TT-SILC-1,0.003,EU-SILC,5.0


## Preprocessing
### Data cleaning

In [7]:
# The 'database' column is not used by any later step in this notebook section
df = df.drop('database', axis=1)

In [8]:
# Some values use a comma as the decimal separator; switch it to a dot, then cast to float
df['value'] = df['value'].str.replace(',', '.').astype(float)

In [9]:
# Years are read as floats (e.g. 2013.0); store them as integers
df['year'] = df['year'].astype(int)
df['year'].unique()

array([2013, 2021, 2022, 2023, 2009, 2014, 2015, 2016, 2017, 2018, 2019,
       2020, 2004, 2005, 2006, 2007, 2008, 2010, 2011, 2012])

### Splitting quintiles into 2 deciles

In [10]:
def process_quantiles(df):
    """
    Convert quintile-level rows to decile-level rows.

    Every row with a non-null `quintile` q is emitted twice: once with
    decile 2*q and once with decile 2*q - 1. Rows without a quintile are
    kept as they are and come first in the result, followed by all the
    even-decile copies and then all the odd-decile copies.

    Prints the row counts before and after the expansion.

    Returns a new DataFrame with an integer `decile` column and without the
    `quintile` column; the input frame is not modified.
    """
    print("Initial length:", len(df))
    has_quintile = df.quintile.notna()
    from_quintiles = df[has_quintile]
    print("Number of rows with quintile:", len(from_quintiles))

    # quintile q covers deciles 2q-1 (lower half) and 2q (upper half)
    upper_half = from_quintiles.assign(decile=from_quintiles['quintile'] * 2)
    lower_half = from_quintiles.assign(decile=from_quintiles['quintile'] * 2 - 1)

    expanded = pd.concat([df[~has_quintile], upper_half, lower_half], ignore_index=True)
    print("Final length:", len(expanded))
    expanded['decile'] = expanded['decile'].astype(int)
    return expanded.drop(columns=['quintile'])

# Expand quintile-level rows into two decile-level rows each; the 'quintile' column is dropped
df = process_quantiles(df)

Initial length: 128915
Number of rows with quintile: 2615
Final length: 131530


### Fill missing values
The EU JRC methodology tells us to fill missing values (NaNs) for each indicator with the last available value and, if there is none, with the next available one. This is preferred to ignoring indicators for the years in which they are not available.

In [11]:
# Reshape to wide format: one row per (indicator, decile, country), one column per year.
# Duplicate entries for the same cell are averaged (pivot_table's default aggregation).
wide = df.pivot_table(
    index=['primary_index', 'decile', 'country'],
    columns='year',
    values='value',
    aggfunc='mean',
)
wide

Unnamed: 0_level_0,Unnamed: 1_level_0,year,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
primary_index,decile,country,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AB-EHIS-1,1,AT,,,,,,,,,,0.280373,0.244449,0.274157,,,,,,,,
AB-EHIS-1,1,BE,,,,,,,,,,0.260522,,,,,,,,,,
AB-EHIS-1,1,BG,,,,,,,,,,,0.303731,0.000000,,,,,,,,
AB-EHIS-1,1,CY,,,,,,,,,,,0.267094,,,,,,,,,
AB-EHIS-1,1,CZ,,,,,,,,,,,0.245606,0.341152,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TT-SILC-2,10,RO,,,,,,,,,0.187,,,,,,,,,,,
TT-SILC-2,10,SE,,,,,,,,,0.475,,,,,,,,,,,
TT-SILC-2,10,SI,,,,,,,,,0.303,,,,,,,,,,,
TT-SILC-2,10,SK,,,,,,,,,0.395,,,,,,,,,,,


In [12]:
# For each row, carry the last available value forward across the year columns,
# then back-fill the remaining leading gaps from the first available year
filled = wide.ffill(axis='columns').bfill(axis='columns')
filled

Unnamed: 0_level_0,Unnamed: 1_level_0,year,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
primary_index,decile,country,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AB-EHIS-1,1,AT,0.280373,0.280373,0.280373,0.280373,0.280373,0.280373,0.280373,0.280373,0.280373,0.280373,0.244449,0.274157,0.274157,0.274157,0.274157,0.274157,0.274157,0.274157,0.274157,0.274157
AB-EHIS-1,1,BE,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522,0.260522
AB-EHIS-1,1,BG,0.303731,0.303731,0.303731,0.303731,0.303731,0.303731,0.303731,0.303731,0.303731,0.303731,0.303731,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
AB-EHIS-1,1,CY,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094,0.267094
AB-EHIS-1,1,CZ,0.245606,0.245606,0.245606,0.245606,0.245606,0.245606,0.245606,0.245606,0.245606,0.245606,0.245606,0.341152,0.341152,0.341152,0.341152,0.341152,0.341152,0.341152,0.341152,0.341152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TT-SILC-2,10,RO,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000,0.187000
TT-SILC-2,10,SE,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000,0.475000
TT-SILC-2,10,SI,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000,0.303000
TT-SILC-2,10,SK,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000,0.395000


### Normalising

In [17]:
# Normalisation is done within each (indicator, decile) pair, so each group is
# scaled on its own, comparing countries with each other year by year
res = []
for _, block in filled.groupby(['primary_index', 'decile']):
    yearly_min = block.min(axis=0)
    yearly_max = block.max(axis=0)

    # Inverted min-max scaling over countries: the country with the lowest raw value
    # (the best performer, since lower raw values are better) gets 1, the highest gets 0.
    # NOTE(review): when all countries share the same value in a year (max == min)
    # the division yields NaN, which is left untouched below.
    scaled = 1 - (block - yearly_min) / (yearly_max - yearly_min)

    # Floor the scores at 0.001: zeros and anything below 0.001 become 0.001
    res.append(scaled.mask(scaled < 0.001, 0.001))

preprocessed = pd.concat(res)
preprocessed

Unnamed: 0_level_0,Unnamed: 1_level_0,year,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
primary_index,decile,country,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AB-EHIS-1,1,AT,0.175269,0.175269,0.175269,0.175269,0.175269,0.175269,0.175269,0.175269,0.175269,0.175269,0.324087,0.364936,0.364936,0.364936,0.364936,0.364936,0.364936,0.364936,0.364936,0.364936
AB-EHIS-1,1,BE,0.257505,0.257505,0.257505,0.257505,0.257505,0.257505,0.257505,0.257505,0.257505,0.257505,0.257505,0.396521,0.396521,0.396521,0.396521,0.396521,0.396521,0.396521,0.396521,0.396521
AB-EHIS-1,1,BG,0.078512,0.078512,0.078512,0.078512,0.078512,0.078512,0.078512,0.078512,0.078512,0.078512,0.078512,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
AB-EHIS-1,1,CY,0.230281,0.230281,0.230281,0.230281,0.230281,0.230281,0.230281,0.230281,0.230281,0.230281,0.230281,0.381297,0.381297,0.381297,0.381297,0.381297,0.381297,0.381297,0.381297,0.381297
AB-EHIS-1,1,CZ,0.319294,0.319294,0.319294,0.319294,0.319294,0.319294,0.319294,0.319294,0.319294,0.319294,0.319294,0.209746,0.209746,0.209746,0.209746,0.209746,0.209746,0.209746,0.209746,0.209746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TT-SILC-2,10,RO,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
TT-SILC-2,10,SE,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216,0.202216
TT-SILC-2,10,SI,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670,0.678670
TT-SILC-2,10,SK,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823,0.423823


In [18]:
# Reorder the index levels to (primary_index, country, decile), sort the rows, and write to CSV
preprocessed.swaplevel(1, 2).sort_index().to_csv('../output/primary_data_preprocessed.csv')