### Importing necessary libraries:

In [2]:
import pandas as pd 
import os 
import pymc


### Data preprocessing

In [155]:
# Folder holding the CSV inputs: '<current working directory>/../Data'
data_path = os.path.join(os.getcwd(), '..', 'Data')


def _read_dataset(file_name):
    """Read one CSV file located in ``data_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(data_path, file_name))


# One DataFrame per input file
life_expectancy = _read_dataset('estat_demo_mlexpec_en.csv')
material_deprivation_rate = _read_dataset('estat_ilc_chmd03_en.csv')
median_and_mean_group_income = _read_dataset('estat_ilc_di15_filtered_en.csv')
low_work_intensity_households = _read_dataset('estat_ilc_lvhl16n_en.csv')
real_gdp = _read_dataset('estat_tipsna40_en.csv')
population_data = _read_dataset('estat_demo_pjanbroad_filtered_en.csv')

# Name -> DataFrame registry so later steps can process every table in one loop
dataframes_dict = dict(
    life_expectancy=life_expectancy,
    material_deprivation_rate=material_deprivation_rate,
    median_and_mean_group_income=median_and_mean_group_income,
    low_work_intensity_households=low_work_intensity_households,
    real_gdp=real_gdp,
    population_data=population_data,
)

# Keep only rows reported by EU countries with TIME_PERIOD >= 2003 (per the
# original note, the GDP data does not go back further than 2003), and drop
# rows whose TIME_PERIOD is missing.
eu_countries = ['Austria', 'Belgium', 'Bulgaria', 'Cyprus', 'Czechia', 'Germany',
       'Denmark','Estonia',
       'Greece', 'Spain', 'Finland', 'France', 'Croatia', 'Hungary', 'Ireland', 'Italy',
       'Lithuania', 'Luxembourg', 'Latvia', 'Malta', 'Netherlands',
       'Poland', 'Portugal', 'Romania', 'Sweden', 'Slovenia', 'Slovakia']

# Both conditions are combined into a single row mask computed on the same
# frame, so the row alignment is preserved and every filter actually ends up
# in the stored result. Replacing the value of an existing key while iterating
# over .items() is safe because the dict does not change size.
for key, df in dataframes_dict.items():
    is_eu = df['Geopolitical entity (reporting)'].isin(eu_countries)
    # NaN >= 2003 evaluates to False, so rows with a missing TIME_PERIOD are
    # removed by this same comparison; no separate dropna is required.
    is_recent = df['TIME_PERIOD'] >= 2003
    dataframes_dict[key] = df[is_eu & is_recent].reset_index(drop=True)

# Life expectancy: keep only the rows whose age code is 'Y_LT1'
# (per the original note: life expectancy for newborns)
dataframes_dict['life_expectancy'] = (
    dataframes_dict['life_expectancy']
    .loc[lambda d: d['age'] == 'Y_LT1']
    .reset_index(drop=True)
)

# Income table: restrict to the two age classes listed below, discard rows
# without an age class, and keep only values expressed in PPS
# (Purchasing Power Standard)
age_groups = ['From 18 to 64 years', '65 years or over']
dataframes_dict['median_and_mean_group_income'] = (
    dataframes_dict['median_and_mean_group_income']
    .loc[lambda d: d['Age class'].isin(age_groups)]
    .dropna(subset=['Age class'])
    .loc[lambda d: d['unit'] == 'PPS']
    .reset_index(drop=True)
)

# Strip columns that are not needed further on; errors='ignore' lets the same
# list be applied to every table even when some of the columns are absent
for key in list(dataframes_dict):
    df = dataframes_dict[key]
    dataframes_dict[key] = df.drop(
        columns=[
            'STRUCTURE', 'STRUCTURE_ID',
            'Confidentiality status (flag)', 'CONF_STATUS',
            'Observation status (Flag) V2 structure', 'Observation value',
            'Time', 'geo', 'age', 'Sex',
            'Country of citizenship', 'citizen',
            'Unit of measure', 'Income and living conditions indicator',
            'freq',
        ],
        errors='ignore',
    )



# Collapse the income table to one row per (sex, reporting entity, time period,
# indicator code): OBS_VALUE is averaged over the rows of each group and every
# other column keeps the first value found in the group.
group_cols = ['sex', 'Geopolitical entity (reporting)', 'TIME_PERIOD', 'indic_il']

# Aggregate the income table explicitly; a leftover loop variable such as `df`
# would point at whichever table an earlier loop visited last.
income_df = dataframes_dict['median_and_mean_group_income']

grouped = income_df.groupby(group_cols)

# Lazy per-group selection of the 'Age class' column. Nothing is aggregated
# here (despite the name); it is only a handle for inspecting the groups.
mean_values = grouped['Age class']

# Aggregation rule per non-key column: 'first' by default, 'mean' for OBS_VALUE.
# NOTE(review): 'Age class' therefore shows only the first value of each group
# although OBS_VALUE is averaged over all rows of the group - consider dropping
# that column from the result.
agg_dict = {col: 'first' for col in income_df.columns if col not in group_cols}
agg_dict['OBS_VALUE'] = 'mean'

# Final DataFrame: group keys restored as ordinary columns
result = grouped.agg(agg_dict).reset_index()
print(result)


KeyError: 'indic_il'

In [154]:
# Preview the prepared tables. A notebook cell renders only the value of its
# final expression, so every intermediate preview is printed explicitly;
# otherwise its result would be computed and silently discarded.
# ('real_gdp' is intentionally left out, as in the original cell.)
for name in (
    'life_expectancy',
    'low_work_intensity_households',
    'material_deprivation_rate',
    'median_and_mean_group_income',
    'population_data',
):
    print(f'--- {name} ---')
    print(dataframes_dict[name].head())

# Distinct time periods present in the population table
print(dataframes_dict['population_data']['TIME_PERIOD'].unique())

# Distinct units in the income table (left as the cell's final expression so
# it is still rendered as the cell output)
dataframes_dict['median_and_mean_group_income']['unit'].unique()

array(['PPS'], dtype=object)

In [148]:
# Small toy table used to look at what a pandas groupby returns
df = pd.DataFrame(
    data=[
        ('Sales', 50000),
        ('Sales', 60000),
        ('HR', 45000),
        ('HR', 48000),
        ('IT', 70000),
    ],
    columns=['Department', 'Salary'],
)

# Select 'Salary' per department. This is a lazy SeriesGroupBy object: no
# average is computed until an aggregation such as .mean() is called on it.
grouped = df.groupby(by='Department')['Salary']
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x29e1a8190>