### Importing necessary libraries:

In [4]:
import pandas as pd 
import os 
import pymc


### Data preprocessing

In [None]:
# Data path
data_path = os.path.join(os.getcwd(), '..', 'Data')

# Load the data
life_expectancy = pd.read_csv(os.path.join(data_path, 'estat_demo_mlexpec_en.csv'))
material_deprivation_rate = pd.read_csv(os.path.join(data_path, 'estat_ilc_chmd03_en.csv'))
median_and_mean_group_income = pd.read_csv(os.path.join(data_path, 'estat_ilc_di15_filtered_en.csv'))
low_work_intensity_households = pd.read_csv(os.path.join(data_path, 'estat_ilc_lvhl16n_en.csv'))
real_gdp = pd.read_csv(os.path.join(data_path, 'estat_tipsna40_en.csv'))
population_data = pd.read_csv(os.path.join(data_path, 'estat_demo_pjanbroad_filtered_en.csv'))

# Dataframes dictionary 
dataframes_dict = {
    'life_expectancy': life_expectancy,
    'material_deprivation_rate': material_deprivation_rate,
    'median_and_mean_group_income': median_and_mean_group_income,
    'low_work_intensity_households': low_work_intensity_households,
    'real_gdp': real_gdp,
    'population_data': population_data
}

# Include only EU data and filter data older than 2003, (because gdp data is not older than 2003). Remove also NaN time period
eu_countries = ['Austria', 'Belgium', 'Bulgaria', 'Cyprus', 'Czechia', 'Germany',
       'Denmark','Estonia',
       'Greece', 'Spain', 'Finland', 'France', 'Croatia', 'Hungary', 'Ireland', 'Italy',
       'Lithuania', 'Luxembourg', 'Latvia', 'Malta', 'Netherlands',
       'Poland', 'Portugal', 'Romania', 'Sweden', 'Slovenia', 'Slovakia']

for key, df in dataframes_dict.items():
    dataframes_dict[key] = df[df['Geopolitical entity (reporting)'].isin(eu_countries)].reset_index(drop=True)
for key, df in dataframes_dict.items():
    dataframes_dict[key]['TIME_PERIOD'] = df['TIME_PERIOD'].loc[df['TIME_PERIOD'] >= 2003].reset_index(drop=True)
for key, df in dataframes_dict.items():
    dataframes_dict[key] = df.dropna(subset=['TIME_PERIOD']).reset_index(drop=True)

# Filter life expectancies for newborns
dataframes_dict['life_expectancy'] = dataframes_dict['life_expectancy'].loc[dataframes_dict['life_expectancy']['age'] == 'Y_LT1'].reset_index(drop=True)

# Generalize mean and median data
# Filter age groups present in the data
age_groups = ['From 18 to 64 years', '65 years or over']
dataframes_dict['median_and_mean_group_income'] = dataframes_dict['median_and_mean_group_income'][dataframes_dict['median_and_mean_group_income']['Age class'].isin(age_groups)].reset_index(drop=True)

# Drop NaN's
dataframes_dict['median_and_mean_group_income'] = dataframes_dict['median_and_mean_group_income'].dropna(subset=['Age class']).reset_index(drop=True)

# Filter PPS (Purchasing Power Standard) currency 
dataframes_dict['median_and_mean_group_income'] = dataframes_dict['median_and_mean_group_income'].loc[dataframes_dict['median_and_mean_group_income']['unit'] == 'PPS'].reset_index(drop=True)

# Remove unnecessary columns
for key, df in dataframes_dict.items():
    dataframes_dict[key] = df.drop(columns=['STRUCTURE', 'STRUCTURE_NAME', 'STRUCTURE_ID', 'Confidentiality status (flag)', 'CONF_STATUS', 'Observation status (Flag) V2 structure', 'unit', 'Observation value', 'Time', 'geo', 'age', 'Sex',
                                            'Country of citizenship', 'citizen', 'Unit of measure', 'Income and living conditions indicator', 'freq', 'Time frequency', 'OBS_FLAG'], errors='ignore')



dataframes_dict['population_data'].rename(columns={'OBS_VALUE': 'population'}, inplace=True)
dataframes_dict['population_data'].replace({'Age class': {'From 15 to 64 years': 'From 18 to 64 years'}}, inplace=True)
dataframes_dict['population_data'] = dataframes_dict['population_data'].loc[dataframes_dict['population_data']['Age class'] != 'Less than 15 years'].reset_index(drop=True)

print(dataframes_dict['population_data']['Geopolitical entity (reporting)'].unique())
print(dataframes_dict['population_data']['Age class'].unique())
print(dataframes_dict['median_and_mean_group_income']['Geopolitical entity (reporting)'].unique())
print(dataframes_dict['median_and_mean_group_income']['Age class'].unique())


dataframes_dict['median_and_mean_group_income'] = pd.merge(dataframes_dict['median_and_mean_group_income'],
                  dataframes_dict['population_data'],
                  on=['sex','Geopolitical entity (reporting)', 'TIME_PERIOD', 'Age class'],
                  how='left') 

# group_cols = ['sex', 'Geopolitical entity (reporting)', 'TIME_PERIOD', 'indic_il']

# # Custom function applied to DataFrame of each group
# def custom_func(group_df):
#     # Accessing multiple columns
#     max_diff = group_df['OBS_VALUE'].max() - group_df['OBS_VALUE'].min()
#     # Returning a Series
#     return pd.Series({
#         'STRUCTURE_NAME': group_df['STRUCTURE_NAME'].iloc[0],  # Taking first as an example
#         'Ranged_OBS_VALUE': max_diff  # Calculated range-related result
#     })

# # Applying the function with groupby and apply
# result = dataframes_dict['median_and_mean_group_income'].groupby(group_cols).apply(custom_func).reset_index()

# # result is your final DataFrame with aggregated 'OBS_VALUE' and single entries for other columns
# print(result)


['Austria' 'Belgium' 'Bulgaria' 'Cyprus' 'Czechia' 'Germany' 'Denmark'
 'Estonia' 'Greece' 'Spain' 'Finland' 'France' 'Croatia' 'Hungary'
 'Ireland' 'Italy' 'Lithuania' 'Luxembourg' 'Latvia' 'Malta' 'Netherlands'
 'Poland' 'Portugal' 'Romania' 'Sweden' 'Slovenia' 'Slovakia']
['From 18 to 64 years' '65 years or over']
['Austria' 'Belgium' 'Bulgaria' 'Cyprus' 'Czechia' 'Germany' 'Denmark'
 'Estonia' 'Greece' 'Spain' 'Finland' 'France' 'Croatia' 'Hungary'
 'Ireland' 'Italy' 'Lithuania' 'Luxembourg' 'Latvia' 'Malta' 'Netherlands'
 'Poland' 'Portugal' 'Romania' 'Sweden' 'Slovenia' 'Slovakia']
['From 18 to 64 years' '65 years or over']


In [11]:
import pandas as pd

# Create a sample DataFrame
df = pd.DataFrame({
    'Category': ['A', 'A', 'B', 'B', 'A', 'B'],
    'Values': [1, 2, 3, 4, 5, 6]
})

# Function to compute range
def range_func(group):
    print("Data in Group:")
    print(group)  # this line prints the values in the current group being processed
    return group.max() - group.min()

# Group by 'Category' and apply custom aggregation
result = df.groupby('Category')['Values'].agg(range_func)
print("\nAggregation Result:")
print(result)

Data in Group:
0    1
1    2
4    5
Name: Values, dtype: int64
Data in Group:
2    3
3    4
5    6
Name: Values, dtype: int64

Aggregation Result:
Category
A    4
B    3
Name: Values, dtype: int64


In [6]:
dataframes_dict['life_expectancy'].head()
dataframes_dict['low_work_intensity_households'].head()
dataframes_dict['material_deprivation_rate'].head()
dataframes_dict['median_and_mean_group_income'].head()
dataframes_dict['population_data'].head()
# dataframes_dict['real_gdp'].head()
dataframes_dict['population_data']['TIME_PERIOD'].unique()
dataframes_dict['median_and_mean_group_income']['unit'].unique()

array(['PPS'], dtype=object)

In [11]:
df = pd.DataFrame({
    'Department': ['Sales', 'Sales', 'HR', 'HR', 'IT'],
    'Salary': [50000, 60000, 45000, 48000, 70000]
})

# Group by department and calculate average salary
grouped = df.groupby('Department')['Salary']
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x175a0aa50>