### Importing necessary libraries:

In [2]:
import pandas as pd 
import os 
import pymc


### Data preprocessing

In [56]:
# Folder holding the Eurostat CSV extracts ('Data', one level above the working directory)
data_path = os.path.join(os.getcwd(), '..', 'Data')

# Dataset name -> CSV file name; the frames are read in this order
csv_files = {
    'life_expectancy': 'estat_demo_mlexpec_en.csv',
    'material_deprivation_rate': 'estat_ilc_chmd03_en.csv',
    'median_and_mean_group_income': 'estat_ilc_di15_filtered_en.csv',
    'low_work_intensity_households': 'estat_ilc_lvhl16n_en.csv',
    'real_gdp': 'estat_tipsna40_en.csv',
    'population_data': 'estat_demo_pjanbroad_filtered_en.csv',
}
raw_frames = {
    name: pd.read_csv(os.path.join(data_path, file_name))
    for name, file_name in csv_files.items()
}

# The unfiltered frames stay reachable under their own names (other cells inspect them)
life_expectancy = raw_frames['life_expectancy']
material_deprivation_rate = raw_frames['material_deprivation_rate']
median_and_mean_group_income = raw_frames['median_and_mean_group_income']
low_work_intensity_households = raw_frames['low_work_intensity_households']
real_gdp = raw_frames['real_gdp']
population_data = raw_frames['population_data']

# Countries kept in every dataset
eu_countries = [
    'Austria', 'Belgium', 'Bulgaria', 'Cyprus', 'Czechia', 'Germany',
    'Denmark', 'Estonia',
    'Greece', 'Spain', 'Finland', 'France', 'Croatia', 'Hungary', 'Ireland', 'Italy',
    'Lithuania', 'Luxembourg', 'Latvia', 'Malta', 'Netherlands',
    'Poland', 'Portugal', 'Romania', 'Sweden', 'Slovenia', 'Slovakia',
]

# Working copies: EU countries only, years 2003 up to (not including) 2024.
# 2003 is the lower bound because the GDP data does not go further back.
# Rows with a NaN TIME_PERIOD fail both comparisons and are dropped as well.
dataframes_dict = {
    name: frame[
        frame['Geopolitical entity (reporting)'].isin(eu_countries)
        & frame['TIME_PERIOD'].ge(2003)
        & frame['TIME_PERIOD'].lt(2024)
    ].reset_index(drop=True)
    for name, frame in raw_frames.items()
}

# Life expectancy: keep only the rows coded 'Y_LT1' (life expectancy for newborns)
dataframes_dict['life_expectancy'] = (
    dataframes_dict['life_expectancy']
    .loc[lambda frame: frame['age'] == 'Y_LT1']
    .reset_index(drop=True)
)

# Income: keep the two adult age classes, rows with a known age class,
# and values expressed in PPS (Purchasing Power Standard)
age_groups = ['From 18 to 64 years', '65 years or over']
dataframes_dict['median_and_mean_group_income'] = (
    dataframes_dict['median_and_mean_group_income']
    .loc[lambda frame: frame['Age class'].isin(age_groups)]
    .dropna(subset=['Age class'])
    .loc[lambda frame: frame['unit'] == 'PPS']
    .reset_index(drop=True)
)

# Metadata / label columns that are not needed downstream; columns a frame
# does not have are skipped (errors='ignore')
unused_columns = [
    'STRUCTURE', 'STRUCTURE_NAME', 'STRUCTURE_ID',
    'Confidentiality status (flag)', 'CONF_STATUS',
    'Observation status (Flag) V2 structure', 'unit', 'Observation value',
    'Time', 'geo', 'age', 'Sex',
    'Country of citizenship', 'citizen', 'Unit of measure',
    'Income and living conditions indicator', 'freq', 'Time frequency', 'OBS_FLAG',
]
dataframes_dict = {
    name: frame.drop(columns=unused_columns, errors='ignore')
    for name, frame in dataframes_dict.items()
}

# The income data is only available per age class, so it is merged with the
# population data to be able to combine it into a figure for the whole population.
# NOTE(review): the population class 'From 15 to 64 years' is relabelled to match
# the income class 'From 18 to 64 years'; the weights are therefore approximate.
dataframes_dict['population_data'] = (
    dataframes_dict['population_data']
    .rename(columns={'OBS_VALUE': 'population'})
    .replace({'Age class': {'From 15 to 64 years': 'From 18 to 64 years'}})
    .loc[lambda frame: frame['Age class'] != 'Less than 15 years']
    .reset_index(drop=True)
)

# Left join: every income row is kept; rows without a population match get NaN
dataframes_dict['median_and_mean_group_income'] = dataframes_dict['median_and_mean_group_income'].merge(
    dataframes_dict['population_data'],
    on=['sex', 'Geopolitical entity (reporting)', 'TIME_PERIOD', 'Age class'],
    how='left',
)

# Keys of the groups over which the population-weighted average of OBS_VALUE is taken
group_cols = ['sex', 'Geopolitical entity (reporting)', 'TIME_PERIOD', 'indic_il']

def weighted_average(group_df):
    """Population-weighted average of ``OBS_VALUE`` for one group.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Rows of a single group; must contain the columns ``OBS_VALUE``
        (the value to average) and ``population`` (its weight).

    Returns
    -------
    pandas.Series
        One-element Series ``{'OBS_VALUE': <weighted average>}``. The value is
        NaN when the group has no usable weight, so that "no data" cannot be
        mistaken for a real value of 0.
    """
    # A row may only contribute when both its value and its weight are known.
    # Otherwise a missing OBS_VALUE would still add its population to the
    # denominator and pull the average towards zero.
    usable = group_df['OBS_VALUE'].notna() & group_df['population'].notna()
    values = group_df.loc[usable, 'OBS_VALUE']
    weights = group_df.loc[usable, 'population']

    total_population = weights.sum()

    if total_population > 0:
        weighted_avg = (values * weights).sum() / total_population
    else:
        # Undefined average: report it as missing instead of a fake 0
        weighted_avg = float('nan')

    return pd.Series({
        'OBS_VALUE': weighted_avg
    })

# Collapse the age classes into one population-weighted value per group
# (sex, country, year, indicator). Only the value and weight columns are
# handed to `weighted_average`: recent pandas versions emit a deprecation
# warning when `apply` also operates on the grouping columns.
dataframes_dict['median_and_mean_group_income'] = (
    dataframes_dict['median_and_mean_group_income']
    .groupby(group_cols)[['OBS_VALUE', 'population']]
    .apply(weighted_average)
    .reset_index()
)

# Life expectancy was already restricted to a single age code, so its
# 'Age class' column is no longer needed
dataframes_dict['life_expectancy'] = dataframes_dict['life_expectancy'].drop(columns='Age class')

# Transform median and mean into separate columns: one row per
# (sex, country, year) and one column per value of 'indic_il'
income_key_cols = [
    col for col in dataframes_dict['median_and_mean_group_income'].columns
    if col not in ['indic_il', 'OBS_VALUE']
]
dataframes_dict['median_and_mean_group_income'] = (
    dataframes_dict['median_and_mean_group_income']
    .pivot(index=income_key_cols, columns='indic_il', values='OBS_VALUE')
    .reset_index()
)

# Possible next step (currently disabled): join income with life expectancy
# pd.merge(dataframes_dict['median_and_mean_group_income'], dataframes_dict['life_expectancy'], on=['sex', 'TIME_PERIOD', 'Geopolitical entity (reporting)'])

     sex Geopolitical entity (reporting)  TIME_PERIOD  OBS_VALUE
0      F                         Austria         2003       81.5
1      F                         Austria         2004       82.1
2      F                         Austria         2005       82.2
3      F                         Austria         2006       82.8
4      F                         Austria         2007       83.1
...   ..                             ...          ...        ...
1693   T                        Slovakia         2019       77.8
1694   T                        Slovakia         2020       77.0
1695   T                        Slovakia         2021       74.6
1696   T                        Slovakia         2022       77.0
1697   T                        Slovakia         2023       78.2

[1698 rows x 4 columns]


  dataframes_dict['median_and_mean_group_income'] = dataframes_dict['median_and_mean_group_income'].groupby(group_cols).apply(weighted_average).reset_index()


Unnamed: 0,sex,Geopolitical entity (reporting),TIME_PERIOD,MED_E,MEI_E,OBS_VALUE
0,F,Austria,2003,15342.338427,17109.022013,81.5
1,F,Austria,2004,16622.214843,18347.959873,82.1
2,F,Austria,2005,17601.980989,19614.340744,82.2
3,F,Austria,2006,17802.543685,19607.809630,82.8
4,F,Austria,2007,18416.722715,20309.654523,83.1
...,...,...,...,...,...,...
1564,T,Sweden,2019,21749.500013,23594.508859,83.2
1565,T,Sweden,2020,21291.441958,22935.643461,82.4
1566,T,Sweden,2021,21823.628519,23617.433935,83.1
1567,T,Sweden,2022,21853.884176,23521.927826,83.1


Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,freq,Time frequency,unit,Unit of measure,indic_il,Income and living conditions indicator,citizen,...,geo,Geopolitical entity (reporting),TIME_PERIOD,Time,OBS_VALUE,Observation value,OBS_FLAG,Observation status (Flag) V2 structure,CONF_STATUS,Confidentiality status (flag)


In [42]:
# Raw income rows for one sex / country / year. The conditions are boolean
# Series, so they must be combined element-wise with `&`; Python's `and`
# would ask for the truth value of a whole Series and raise ValueError.
median_and_mean_group_income[
    (median_and_mean_group_income['sex'] == 'F')
    & (median_and_mean_group_income['Geopolitical entity (reporting)'] == 'Bulgaria')
    & (median_and_mean_group_income['TIME_PERIOD'] == 2003)
]
import pandas as pd

# Toy frame for the groupby demonstration: two categories, three values each
df = pd.DataFrame(
    data={
        'Category': ['A', 'A', 'B', 'B', 'A', 'B'],
        'Values': [1, 2, 3, 4, 5, 6],
    }
)

def range_func(group):
    """Return the spread (maximum minus minimum) of ``group``.

    The group is printed first so that the data handed to the aggregation
    function can be seen for each call.
    """
    print("Data in Group:")
    print(group)
    highest = group.max()
    lowest = group.min()
    return highest - lowest

# Apply the custom range aggregation to 'Values' within each 'Category'
result = df.groupby('Category')['Values'].aggregate(range_func)

# Blank line, heading, then the aggregated Series
print("\nAggregation Result:", result, sep="\n")

# Last expression of the cell: display the processed income frame
dataframes_dict['median_and_mean_group_income']

Data in Group:
0    1
1    2
4    5
Name: Values, dtype: int64
Data in Group:
2    3
3    4
5    6
Name: Values, dtype: int64

Aggregation Result:
Category
A    4
B    3
Name: Values, dtype: int64


indic_il,sex,Geopolitical entity (reporting),TIME_PERIOD,row,MED_E,MEI_E
0,F,Austria,2003,0,15342.338427,17109.022013
1,F,Austria,2004,1,16622.214843,18347.959873
2,F,Austria,2005,2,17601.980989,19614.340744
3,F,Austria,2006,3,17802.543685,19607.809630
4,F,Austria,2007,4,18416.722715,20309.654523
...,...,...,...,...,...,...
1567,T,Sweden,2019,1567,21749.500013,23594.508859
1568,T,Sweden,2020,1568,21291.441958,22935.643461
1569,T,Sweden,2021,1569,21823.628519,23617.433935
1570,T,Sweden,2022,1570,21853.884176,23521.927826


In [6]:
# Quick look at the prepared frames.
# NOTE(review): a notebook cell displays only the value of its last expression,
# so the `.head()` / `.unique()` calls before the final line show nothing when
# they share one cell; split them into separate cells if each preview is wanted.
dataframes_dict['life_expectancy'].head()
dataframes_dict['low_work_intensity_households'].head()
dataframes_dict['material_deprivation_rate'].head()
dataframes_dict['median_and_mean_group_income'].head()
dataframes_dict['population_data'].head()
# dataframes_dict['real_gdp'].head()
dataframes_dict['population_data']['TIME_PERIOD'].unique()
# NOTE(review): the preprocessing cell lists 'unit' among the columns it drops,
# so this lookup raises KeyError once that cell has run; the recorded output
# presumably comes from an earlier state of the frame. Confirm, then move or remove.
dataframes_dict['median_and_mean_group_income']['unit'].unique()

array(['PPS'], dtype=object)

In [11]:
# Toy payroll table: one (department, salary) record per employee
df = pd.DataFrame(
    [
        ('Sales', 50000),
        ('Sales', 60000),
        ('HR', 45000),
        ('HR', 48000),
        ('IT', 70000),
    ],
    columns=['Department', 'Salary'],
)

# Group the salaries by department. This only builds the SeriesGroupBy object;
# no aggregate (e.g. a mean) is computed until one is requested on it.
grouped = df.groupby(by='Department')['Salary']
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x175a0aa50>

In [30]:
# Number of raw income rows per indicator code in 'indic_il'
median_and_mean_group_income.loc[:, 'indic_il'].value_counts()

indic_il
MED_E    75603
MEI_E    75603
Name: count, dtype: int64