In [1]:
import pandas as pd

# Data cleaning

In [2]:
# root of the data directory, relative to the notebook location
data_folder = '../data/'

# raw input files, expressed relative to the data directory
age_file = 'age/age_nuts3.csv'
employment_file = 'employment_rate/employment_rate_nuts2.csv'

# full paths used by the loading cells
age_path = data_folder + age_file
employment_path = data_folder + employment_file

## Age dataset

In [3]:
# read the raw age table and discard the columns that are not used afterwards
# NOTE(review): 'sex' and 'unit' are dropped without being filtered first; this
# assumes the extract holds a single value for each of them - confirm.
unused_columns = ['STRUCTURE', 'STRUCTURE_ID', 'freq', 'sex', 'unit', 'OBS_FLAG']
df_age = pd.read_csv(age_path).drop(columns=unused_columns)
df_age.head()

Unnamed: 0,age,geo,TIME_PERIOD,OBS_VALUE
0,UNK,AL,2021,0
1,UNK,AL,2022,0
2,UNK,AL0,2021,0
3,UNK,AL01,2021,0
4,UNK,AL011,2021,0


In [4]:
# filter by time
# reference year: filter_year keeps only the rows whose TIME_PERIOD equals it,
# and it is embedded in the names of the exported CSV files
year = 2022
def filter_year(df, year):
    """Return the rows of `df` observed in `year`, without the time column.

    Parameters
    ----------
    df : DataFrame with a 'TIME_PERIOD' column.
    year : value compared for equality against 'TIME_PERIOD'.

    Returns
    -------
    A new DataFrame holding only the matching rows (original index labels
    kept) and every column except 'TIME_PERIOD'. `df` itself is not modified.
    """
    in_year = df['TIME_PERIOD'].eq(year)
    return df.loc[in_year].drop(columns=['TIME_PERIOD'])

# keep only the observations of the reference year
df_age = df_age.pipe(filter_year, year)

In [5]:
# filter by max precision (NUTS 3)
def filter_precision(df):
    """Keep only the rows whose 'geo' code has the greatest length in `df`.

    The length of the 'geo' string is used as the measure of precision: the
    longest code found in the table defines the level that is kept.

    Parameters
    ----------
    df : DataFrame with a string 'geo' column.

    Returns
    -------
    A new DataFrame with the rows whose 'geo' length equals the maximum
    length (original index labels kept). Rows with a missing 'geo' are
    dropped instead of raising, and an empty or all-missing input gives an
    empty result.
    """
    # computed once; .str.len() yields NaN for missing codes where len() would raise
    code_length = df['geo'].str.len()
    # NaN never compares equal, so rows without a code are filtered out here
    return df[code_length == code_length.max()]

# keep only the rows with the longest geo codes
df_age = df_age.pipe(filter_precision)

In [6]:
# rows labelled 'UNK' (unknown age): show their distinct values, then remove them
# (the saved output shows they are all 0, so nothing is lost)
is_unknown_age = df_age['age'] == 'UNK'
print(df_age.loc[is_unknown_age, 'OBS_VALUE'].unique())
df_age = df_age[~is_unknown_age]

[0]


In [8]:
# give the region and value columns descriptive names
df_age = df_age.rename(columns={'geo': 'nuts3', 'OBS_VALUE': 'count'})

In [9]:
# list the distinct age labels still present; in the saved output the open-ended
# 'Y_GE85' appears next to 'Y85-89' and 'Y_GE90'
df_age['age'].unique()

array(['Y10-14', 'Y15-19', 'Y20-24', 'Y25-29', 'Y30-34', 'Y35-39',
       'Y40-44', 'Y45-49', 'Y5-9', 'Y50-54', 'Y55-59', 'Y60-64', 'Y65-69',
       'Y70-74', 'Y75-79', 'Y80-84', 'Y85-89', 'Y_GE85', 'Y_GE90',
       'Y_LT5'], dtype=object)

In [12]:
# age labels that make up each of the three regrouped ranges
def _five_year_bands(start, stop):
    """Return the labels 'Y<a>-<a+4>' for a = start, start+5, ... below `stop`."""
    return [f'Y{low}-{low + 4}' for low in range(start, stop, 5)]

under_20 = ['Y_LT5'] + _five_year_bands(5, 20)
between_20_64 = _five_year_bands(20, 65)
# 'Y_GE85' is left out - presumably because it overlaps 'Y85-89' + 'Y_GE90'; confirm
over_64 = _five_year_bands(65, 90) + ['Y_GE90']

In [26]:
def _sum_age_group(df, age_labels, column_name):
    """Total the 'count' column per 'nuts3' over the rows whose 'age' is in `age_labels`.

    Parameters
    ----------
    df : DataFrame with 'nuts3', 'age' and 'count' columns.
    age_labels : list of age labels to include.
    column_name : name given to the summed column in the result.

    Returns
    -------
    DataFrame with exactly two columns: 'nuts3' and `column_name`.
    """
    return (
        df.loc[df['age'].isin(age_labels)]
        # aggregate 'count' only: a bare .sum() also "sums" the string 'age'
        # column on pandas >= 2.0 (concatenating the labels), which would leak
        # stray age columns into the merged and exported table
        .groupby('nuts3')['count']
        .sum()
        .reset_index()
        .rename(columns={'count': column_name})
    )


# one table per regrouped age range, each with one row per nuts3 region
df_under_20 = _sum_age_group(df_age, under_20, 'under_20')
df_between_20_64 = _sum_age_group(df_age, between_20_64, 'between_20_64')
df_over_64 = _sum_age_group(df_age, over_64, 'over_64')

In [31]:
# join the three age-range tables on the region code; with the default inner
# join only the regions present in all three tables are kept
df_age_export = (
    df_under_20
    .merge(df_between_20_64, on='nuts3')
    .merge(df_over_64, on='nuts3')
)
df_age_export.head()

Unnamed: 0,nuts3,under_20,between_20_64,over_64
0,AL011,29897,62959,16729
1,AL012,66842,178378,46105
2,AL013,19868,40965,11935
3,AL014,28043,69084,20156
4,AL015,44214,119305,29490


In [32]:
# write the cleaned age table to csv (the index is not written)
age_export_path = data_folder + f'age/age_nuts3_{year}_clean.csv'
df_age_export.to_csv(age_export_path, index=False)

## Employment rate dataset

In [33]:
# read the raw employment-rate table and discard the columns that are not used afterwards
# NOTE(review): 'sex' and 'unit' are dropped without being filtered first; this
# assumes the extract holds a single value for each of them - confirm.
unused_columns = ['STRUCTURE', 'STRUCTURE_ID', 'freq', 'sex', 'unit', 'OBS_FLAG']
df_employment = pd.read_csv(employment_path).drop(columns=unused_columns)
df_employment.head()

Unnamed: 0,age,geo,TIME_PERIOD,OBS_VALUE
0,Y20-64,AT,2021,75.6
1,Y20-64,AT,2022,77.3
2,Y20-64,AT1,2021,73.2
3,Y20-64,AT1,2022,74.6
4,Y20-64,AT11,2021,74.9


In [34]:
# keep the rows of the reference year first, then only the rows with the
# longest geo codes (max precision, NUTS 2)
df_employment = (
    df_employment
    .pipe(filter_year, year)
    .pipe(filter_precision)
)

In [35]:
# give the region and value columns descriptive names
df_employment = df_employment.rename(columns={'geo': 'nuts2', 'OBS_VALUE': 'employment_rate'})

In [42]:
def _rate_for_age_group(df, age_label, column_name):
    """Extract the per-'nuts2' employment rate of one age group.

    Parameters
    ----------
    df : DataFrame with 'nuts2', 'age' and 'employment_rate' columns.
    age_label : value of 'age' to select.
    column_name : name given to the rate column in the result.

    Returns
    -------
    DataFrame with exactly two columns: 'nuts2' and `column_name`. A region
    whose selected rows hold no value at all gets NaN, not 0.
    """
    return (
        df.loc[df['age'] == age_label]
        # aggregate the rate only: a bare .sum() also "sums" the string 'age'
        # column on pandas >= 2.0 and leaks it into the merged table
        .groupby('nuts2')['employment_rate']
        # min_count=1 keeps an all-missing group as NaN; the default turns it
        # into 0, which reads as a 0 % employment rate
        # NOTE(review): the 0.0 shown for AT11 in the saved output likely came
        # from a missing value - confirm against the raw file.
        # NOTE(review): summing assumes one row per region and age group; with
        # several rows the rates would be added together - confirm.
        .sum(min_count=1)
        .reset_index()
        .rename(columns={'employment_rate': column_name})
    )


# one table per age group, each with one row per nuts2 region
df_between_20_64 = _rate_for_age_group(df_employment, 'Y20-64', 'employment_rate_between_20_64')
df_over_64 = _rate_for_age_group(df_employment, 'Y_GE65', 'employment_rate_over_64')

In [43]:
# join the two age-group tables on the region code; with the default inner
# join only the regions present in both tables are kept
df_employment_export = df_between_20_64.merge(df_over_64, on='nuts2')
df_employment_export.head()

Unnamed: 0,nuts2,employment_rate_between_20_64,employment_rate_over_64
0,AT11,76.5,0.0
1,AT12,78.5,4.2
2,AT13,71.2,5.7
3,AT21,75.8,4.3
4,AT22,77.6,5.1


In [44]:
# write the cleaned employment-rate table to csv (the index is not written)
employment_export_path = data_folder + f'employment_rate/employment_rate_nuts2_{year}_clean.csv'
df_employment_export.to_csv(employment_export_path, index=False)