In [18]:
import numpy as np
import pandas as pd
import argparse
import os
from pathlib import Path
from life_expectancy.loaders import load_data, save_data
from life_expectancy.cleaning import extract_numeric_values_from_column

In [26]:
# Absolute path of the project root this notebook works against.
# NOTE(review): this is machine-specific; adjust it when running elsewhere.
FILE_DIR = "/Users/danielneves/00_DareData/090_pod_5/assignments/life_expectancy"

# Absolute path of the data directory, located directly under the project root.
data_dir = FILE_DIR + "/data"

# Absolute path of the directory holding the test fixtures.
FIXTURE_dir = FILE_DIR + "/tests/fixtures"

In [3]:
def _split_column(df_: pd.DataFrame) -> pd.DataFrame:
    """Expand the comma-joined key column into four separate columns.

    The input frame is not modified; all work happens on a copy.

    Args:
        df_: Frame containing the composite column whose values hold four
            comma-separated fields.

    Returns:
        A new frame with the composite column removed and the columns
        ``unit``, ``sex``, ``age`` and ``region`` appended at the end.

    Raises:
        KeyError: if the composite column is missing from ``df_``.
        ValueError: if splitting does not yield exactly four fields.
    """
    composite_key = 'unit,sex,age,geo\\time'
    frame = df_.copy()

    # One output column per comma-separated field.
    pieces = frame[composite_key].str.split(',', expand=True)
    pieces.columns = ['unit', 'sex', 'age', 'region']

    return pd.concat([frame, pieces], axis=1).drop(columns=[composite_key])

In [4]:
# Load the raw tab-separated fixture; the location is built from FIXTURE_dir
# rather than repeating the absolute path.
df = pd.read_csv(os.path.join(FIXTURE_dir, "eu_life_expectancy_raw.tsv"), delimiter="\t")

In [5]:
# Display the raw frame as loaded from the TSV fixture.
df

Unnamed: 0,"unit,sex,age,geo\time",2021,2020,2019,2018,2017,2016,2015,2014,2013,...,1969,1968,1967,1966,1965,1964,1963,1962,1961,1960
0,"YR,F,Y1,AL",:,79.4,80.4,80.2,79.7,79.8,79.2,79.8,79.6,...,:,:,:,:,:,:,:,:,:,:
1,"YR,F,Y1,AM",:,:,79.1,79.2,78.5,78.0,77.9,:,:,...,:,:,:,:,:,:,:,:,:,:
2,"YR,F,Y1,AT",:,82.9,83.5,83.3,83.2,83.4,83.0,83.3,83.0,...,:,:,:,:,:,:,:,:,:,:
3,"YR,F,Y1,AZ",:,:,78.6,78.1,77.7,:,77.5,77.0,76.9,...,:,:,:,:,:,:,:,:,:,:
4,"YR,F,Y1,BE",:,82.3,83.6,83.2,83.2,83.2,82.6,83.1,82.5,...,74.4,74.3,74.7,74.4,74.3,74.6,73.9,74.0,74.5,73.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14443,"YR,T,Y_LT1,SM",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
14444,"YR,T,Y_LT1,TR",:,:,79.1,78.9,78.5,78.1,78.2,78.1,78.2,...,:,:,:,:,:,:,:,:,:,:
14445,"YR,T,Y_LT1,UA",:,:,73.4,73.2,73.3,72.9,72.5,71.8,:,...,:,:,:,:,:,:,:,:,:,:
14446,"YR,T,Y_LT1,UK",:,:,:,81.3,81.3,81.2,81.0,81.4,81.1,...,:,:,:,:,:,:,:,:,:,:


In [6]:
# Split the composite key column into unit / sex / age / region by reusing the
# helper defined earlier instead of repeating its steps cell by cell.
df = _split_column(df)

In [10]:
# Display the frame after the composite key column was split into
# unit / sex / age / region.
df

Unnamed: 0,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,...,1965,1964,1963,1962,1961,1960,unit,sex,age,region
0,:,79.4,80.4,80.2,79.7,79.8,79.2,79.8,79.6,:,...,:,:,:,:,:,:,YR,F,Y1,AL
1,:,:,79.1,79.2,78.5,78.0,77.9,:,:,:,...,:,:,:,:,:,:,YR,F,Y1,AM
2,:,82.9,83.5,83.3,83.2,83.4,83.0,83.3,83.0,82.8,...,:,:,:,:,:,:,YR,F,Y1,AT
3,:,:,78.6,78.1,77.7,:,77.5,77.0,76.9,76.5,...,:,:,:,:,:,:,YR,F,Y1,AZ
4,:,82.3,83.6,83.2,83.2,83.2,82.6,83.1,82.5,82.4,...,74.3,74.6,73.9,74.0,74.5,73.7,YR,F,Y1,BE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14443,:,:,:,:,:,:,:,:,:,85.6,...,:,:,:,:,:,:,YR,T,Y_LT1,SM
14444,:,:,79.1,78.9,78.5,78.1,78.2,78.1,78.2,77.6,...,:,:,:,:,:,:,YR,T,Y_LT1,TR
14445,:,:,73.4,73.2,73.3,72.9,72.5,71.8,:,71.2,...,:,:,:,:,:,:,YR,T,Y_LT1,UA
14446,:,:,:,81.3,81.3,81.2,81.0,81.4,81.1,81.0,...,:,:,:,:,:,:,YR,T,Y_LT1,UK


In [11]:
# Reshape from wide to long: every column other than the four identifier
# columns becomes a (year, value) pair on its own row.
df = pd.melt(
    df,
    id_vars=["unit", "sex", "age", "region"],
    var_name="year",
    value_name="value",
)

In [12]:
# Display the long-format frame produced by the melt.
df

Unnamed: 0,unit,sex,age,region,year,value
0,YR,F,Y1,AL,2021,:
1,YR,F,Y1,AM,2021,:
2,YR,F,Y1,AT,2021,:
3,YR,F,Y1,AZ,2021,:
4,YR,F,Y1,BE,2021,:
...,...,...,...,...,...,...
895771,YR,T,Y_LT1,SM,1960,:
895772,YR,T,Y_LT1,TR,1960,:
895773,YR,T,Y_LT1,UA,1960,:
895774,YR,T,Y_LT1,UK,1960,:


In [13]:
# Persist the un-cleaned long frame as the input fixture for the test.
# The row index is written as well (to_csv default); numeric_expected.csv is
# written the same way, so both fixtures keep the same column layout.
df.to_csv(os.path.join(FIXTURE_dir, "numeric.csv"))

In [15]:
# Convert `value` to numbers in three steps:
#   1. strip every run of characters that is neither a digit nor a dot,
#   2. turn the resulting empty strings into NaN,
#   3. parse what is left, coercing anything unparseable to NaN.
df["value"] = pd.to_numeric(
    df["value"]
    .replace(to_replace=r'[^\d\.]+', value='', regex=True)
    .replace(to_replace='', value=np.nan),
    errors='coerce',
)


In [16]:
# Display the frame after `value` was converted to numeric.
df

Unnamed: 0,unit,sex,age,region,year,value
0,YR,F,Y1,AL,2021,
1,YR,F,Y1,AM,2021,
2,YR,F,Y1,AT,2021,
3,YR,F,Y1,AZ,2021,
4,YR,F,Y1,BE,2021,
...,...,...,...,...,...,...
895771,YR,T,Y_LT1,SM,1960,
895772,YR,T,Y_LT1,TR,1960,
895773,YR,T,Y_LT1,UA,1960,
895774,YR,T,Y_LT1,UK,1960,


In [17]:
# Persist the cleaned frame as the expected-output fixture for the test.
# The row index is written as well (to_csv default), matching numeric.csv.
df.to_csv(os.path.join(FIXTURE_dir, "numeric_expected.csv"))

In [34]:
def test_extract_numeric_values_from_column() -> None:
    """Check `extract_numeric_values_from_column` against the stored fixtures.

    Reads ``numeric.csv`` from the fixtures directory, runs
    `extract_numeric_values_from_column` on its ``value`` column and compares
    the result with ``numeric_expected.csv``.

    Raises:
        AssertionError: if the actual and expected frames differ.
    """
    numeric = pd.read_csv(os.path.join(FIXTURE_dir, "numeric.csv"))
    expected = pd.read_csv(os.path.join(FIXTURE_dir, "numeric_expected.csv"))

    actual = extract_numeric_values_from_column(numeric, "value")

    # assert_frame_equal returns None and raises on mismatch, so there is
    # nothing worth capturing or printing.
    pd.testing.assert_frame_equal(actual, expected)

In [35]:
# Run the fixture comparison; a mismatch surfaces as an exception raised by
# assert_frame_equal inside the function.
test_extract_numeric_values_from_column()

None
