In [1]:
import pandas as pd

# Data cleaning

In [2]:
# Base directory for every CSV read or written by this notebook; the trailing
# slash matters because paths are built by plain string interpolation.
data_folder = 'data/'
# raw input tables
age_path = f'{data_folder}age/age_nuts3.csv'
employment_path = f'{data_folder}employment_rate/employment_rate_nuts2.csv'

## Age dataset

In [6]:
# read the raw age table and discard the columns the cleaning steps do not use
df_age = pd.read_csv(age_path).drop(
    columns=['STRUCTURE', 'STRUCTURE_ID', 'freq', 'sex', 'unit', 'OBS_FLAG']
)
df_age.head()

Unnamed: 0,age,geo,TIME_PERIOD,OBS_VALUE
0,UNK,AL,2021,0
1,UNK,AL,2022,0
2,UNK,AL0,2021,0
3,UNK,AL01,2021,0
4,UNK,AL011,2021,0


In [7]:
# filter by time
# single reference year kept by filter_year; it is also embedded in the names
# of the exported "*_clean.csv" files
year = 2022
def filter_year(df, year):
    """Return the rows of `df` observed in `year`, without the year column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'TIME_PERIOD' column.
    year : scalar
        Value compared for equality against 'TIME_PERIOD'.

    Returns
    -------
    pandas.DataFrame
        New frame with the matching rows (original index kept) and the
        'TIME_PERIOD' column removed. The input frame is not modified.
    """
    in_year = df['TIME_PERIOD'].eq(year)
    return df.loc[in_year].drop(columns='TIME_PERIOD')

# NOTE: filter_year removes 'TIME_PERIOD', so running this cell a second time
# on the already-filtered frame raises KeyError; reload df_age first.
df_age = df_age.pipe(filter_year, year)

In [8]:
# filter by max precision (NUTS 3)
def filter_precision(df):
    """Keep only the rows whose 'geo' code has the greatest length in `df`.

    Longer codes are treated as more precise regions (e.g. 'AL011' is kept
    over 'AL0' and 'AL').

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'geo' column of string codes.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` whose 'geo' length equals the maximum length found
        (original index kept). Rows with a missing 'geo' are excluded. The
        input frame is not modified.
    """
    # Compute the lengths once; .str.len() yields NaN for a missing code
    # instead of raising like len() would.
    code_lengths = df['geo'].str.len()
    return df[code_lengths == code_lengths.max()]

# keep only the rows with the longest (most detailed) geo codes
df_age = df_age.pipe(filter_precision)

In [9]:
# unknown age ('UNK') is expected to always be 0, so those rows can be dropped
unknown_counts = df_age.loc[df_age['age']=='UNK', 'OBS_VALUE']
print(unknown_counts.unique())
# Fail loudly instead of silently discarding real counts if the input data or
# `year` changes; a missing value carries no count, so it is treated as 0 here.
assert unknown_counts.fillna(0).eq(0).all(), "non-zero counts found for age 'UNK'"
df_age = df_age[df_age['age']!='UNK']

[0]


In [10]:
# give the columns their final names, then write the clean table to csv
# (the row index is not exported)
df_age = df_age.rename(columns={'geo': 'nuts3', 'OBS_VALUE': 'count'})
df_age.to_csv(f'{data_folder}age/age_nuts3_{year}_clean.csv', index=False)
df_age.head()

Unnamed: 0,age,nuts3,count
3377,Y10-14,AL011,7168
3379,Y10-14,AL012,16203
3381,Y10-14,AL013,4939
3383,Y10-14,AL014,6798
3385,Y10-14,AL015,11474


## Employment rate dataset

In [11]:
# read the raw employment-rate table and discard the columns the cleaning
# steps do not use
df_employment = pd.read_csv(employment_path).drop(
    columns=['STRUCTURE', 'STRUCTURE_ID', 'freq', 'sex', 'unit', 'OBS_FLAG']
)
df_employment.head()

Unnamed: 0,age,geo,TIME_PERIOD,OBS_VALUE
0,Y20-64,AT,2021,75.6
1,Y20-64,AT,2022,77.3
2,Y20-64,AT1,2021,73.2
3,Y20-64,AT1,2022,74.6
4,Y20-64,AT11,2021,74.9


In [13]:
# filter by time, then by max precision (NUTS 2)
# NOTE: filter_year removes 'TIME_PERIOD', so re-running this cell on the
# already-filtered frame raises KeyError; reload df_employment first.
df_employment = (
    df_employment
    .pipe(filter_year, year)
    .pipe(filter_precision)
)

In [15]:
# give the columns their final names, then write the clean table to csv
# (the row index is not exported)
df_employment = df_employment.rename(
    columns={'geo': 'nuts2', 'OBS_VALUE': 'employment_rate'}
)
df_employment.to_csv(
    f'{data_folder}employment_rate/employment_rate_nuts2_{year}_clean.csv',
    index=False,
)
df_employment.head()

Unnamed: 0,age,nuts2,employment_rate
5,Y20-64,AT11,76.5
7,Y20-64,AT12,78.5
9,Y20-64,AT13,71.2
13,Y20-64,AT21,75.8
15,Y20-64,AT22,77.6
