In [47]:
import pandas as pd
import numpy as np



In [48]:
def _split_column(df_: pd.DataFrame) -> pd.DataFrame:
    """splits a column into several, based on a comma separating diff values"""
    df = df_.copy()
    new_cols = df['unit,sex,age,geo\\time'].str.split(',', expand=True)
    new_cols.columns = ['unit', 'sex', 'age', 'region']
    df = pd.concat([df, new_cols], axis=1)
    df = df.drop(['unit,sex,age,geo\\time'], axis=1)
    return df

In [49]:
def clean_data(df_: pd.DataFrame, region: str = "PT") -> pd.DataFrame:
    """Turn the raw wide table into a tidy long table for a single region.

    Steps, in order:
      1. split the composite identifier column into unit/sex/age/region;
      2. melt every remaining column into ``year``/``value`` pairs;
      3. strip non-numeric characters from ``value``;
      4. drop rows whose ``value`` is missing;
      5. cast ``year`` to int and ``value`` to float;
      6. keep only the rows whose ``region`` equals ``region``.

    Args:
        df_: Raw frame holding the composite ``unit,sex,age,geo\\time``
            column; every other column label becomes a ``year`` entry and
            must therefore be convertible to int.
        region: Region code to keep. Defaults to ``"PT"``.

    Returns:
        A new DataFrame with the columns unit, sex, age, region, year and
        value. The row index is the one produced by ``melt`` (not reset).
    """
    # _split_column works on its own copy, so the caller's frame is never
    # mutated and no extra defensive copy is needed here.
    tidy = _split_column(df_)
    tidy = tidy.melt(
        id_vars=["unit", "sex", "age", "region"],
        var_name="year",
        value_name="value",
    )
    tidy = extract_numeric_values_from_column(tidy, "value")
    tidy = tidy.dropna(subset=["value"])
    # A single astype call returns a new frame instead of assigning columns
    # on the dropna result, which avoids SettingWithCopyWarning.
    tidy = tidy.astype({"year": int, "value": float})
    return tidy[tidy["region"] == region]

In [50]:
def extract_numeric_values_from_column(df_: pd.DataFrame, column: str) -> pd.DataFrame:
    """Convert a noisy text column to numbers, keeping only digits and dots.

    Every run of characters other than digits and ``.`` is removed from the
    string values of ``column``. Values left empty become NaN, and the
    result is parsed with ``pd.to_numeric``; anything that still cannot be
    parsed is coerced to NaN.

    Args:
        df_: Frame containing ``column``. It is not modified.
        column: Label of the column to convert.

    Returns:
        A copy of ``df_`` in which ``column`` holds the parsed numbers.
    """
    digits_only = df_[column].replace(to_replace=r'[^\d\.]+', value='', regex=True)
    blanks_as_nan = digits_only.replace(to_replace='', value=np.nan)
    result = df_.copy()
    result[column] = pd.to_numeric(blanks_as_nan, errors='coerce')
    return result

In [51]:
# Data directory as a plain string; the path is absolute and machine-specific.
# NOTE(review): the next cell rebinds FILE_DIR to a pathlib.Path for the same location.
FILE_DIR = "/Users/danielneves/00_DareData/090_pod_5/assignments/life_expectancy/data"

In [52]:
from pathlib import Path
import pandas as pd

# NOTE(review): absolute, machine-specific path; the notebook only runs
# where this directory exists.
FILE_DIR = Path("/Users/danielneves/00_DareData/090_pod_5/assignments/life_expectancy/data")

# Read the raw tab-separated file, then keep only the rows with region "PT".
df = pd.read_csv(FILE_DIR / "eu_life_expectancy_raw.tsv", sep="\t")
df_cleaned = clean_data(df, region="PT")


In [53]:
# Show the cleaned frame as this cell's output.
df_cleaned

Unnamed: 0,unit,sex,age,region,year,value
3460,YR,F,Y65,PT,2021,21.7
4804,YR,F,Y_LT1,PT,2021,84.3
8276,YR,M,Y65,PT,2021,17.8
9620,YR,M,Y_LT1,PT,2021,78.0
13092,YR,T,Y65,PT,2021,19.9
...,...,...,...,...,...,...
895540,YR,T,Y83,PT,1960,4.7
895596,YR,T,Y84,PT,1960,4.5
895652,YR,T,Y9,PT,1960,62.9
895708,YR,T,Y_GE85,PT,1960,4.2
