In [12]:
from pathlib import Path

import pandas as pd
import numpy as np

In [2]:
# Refactor to config.yaml
datadir = Path('../data')

# Life expectancy

In [3]:
life_exp = pd.read_excel(Path(datadir, 'life_expectancy_at_birth.xlsx'))
life_exp.head()

Unnamed: 0,Year,WHO Region,World Bank Income Group,Country ISO Code,Country,Sex,Global,Value
0,1950,Europe,High income,NLD,Netherlands,Both sexes,,71.411
1,1950,Europe,High income,NLD,Netherlands,Female,,72.615
2,1951,Europe,High income,NLD,Netherlands,Both sexes,,71.55
3,1950,Europe,High income,NLD,Netherlands,Male,,70.236
4,1951,Europe,High income,NLD,Netherlands,Female,,72.775


In [4]:
# columns lower
life_exp.columns = life_exp.columns.str.lower()
life_exp.head(1)

Unnamed: 0,year,who region,world bank income group,country iso code,country,sex,global,value
0,1950,Europe,High income,NLD,Netherlands,Both sexes,,71.411


In [25]:
# sort values on year
life_exp = life_exp.sort_values('year')

In [9]:
# columns drop
to_keep = ['year', 'country', 'sex', 'value']
life_exp = life_exp[to_keep]
life_exp.head(1)

Unnamed: 0,year,country,sex,value
0,1950,Netherlands,Both sexes,71.411


In [11]:
# rename
mapping = {
    'value': 'life expectancy (age)'
}
life_exp = life_exp.rename(columns=mapping)
life_exp.head(1)

Unnamed: 0,year,country,sex,life expectancy (age)
0,1950,Netherlands,Both sexes,71.411


In [17]:
# dtypes
mapping_dtype = {
    'year': 'int', 
    'country': 'str', 
    'sex': 'str', 
    'life expectancy (age)': 'float'
}
life_exp = life_exp.astype(mapping_dtype)
life_exp.dtypes

year                       int64
country                   object
sex                       object
life expectancy (age)    float64
dtype: object

In [19]:
# missing values
life_exp.isna().sum() == 0

year                     True
country                  True
sex                      True
life expectancy (age)    True
dtype: bool

In [21]:
# sex mapping
mapping_sex = {
    'Male': 1,
    'Female': 2,
    'Both sexes': 3
}

def convert_sex(x):
    return mapping_sex[x]

life_exp['sex'] = life_exp['sex'].apply(convert_sex)

In [26]:
life_exp.head(5)

Unnamed: 0,year,country,sex,life expectancy (age)
0,1950,Netherlands,3,71.411
1,1950,Netherlands,2,72.615
3,1950,Netherlands,1,70.236
2,1951,Netherlands,3,71.55
4,1951,Netherlands,2,72.775


# Country codes

In [27]:
country_codes = pd.read_csv(Path(datadir, 'country_codes'))
country_codes.head(1)

Unnamed: 0,country,name
0,1010,Algeria


In [28]:
# to lowercase column

In [29]:
country_codes.isna().sum()

country    0
name       0
dtype: int64

In [30]:
country_codes.dtypes

country     int64
name       object
dtype: object

# Population

In [45]:
df = pd.read_csv(Path(datadir, 'pop'))
df.head()

Unnamed: 0,Country,Admin1,SubDiv,Year,Sex,Frmat,Pop1,Pop2,Pop3,Pop4,...,Pop18,Pop19,Pop20,Pop21,Pop22,Pop23,Pop24,Pop25,Pop26,Lb
0,1060,,,1980,1,7,137100.0,3400.0,15800.0,,...,,5300.0,,2900.0,,,,,6500.0,5000.0
1,1060,,,1980,2,7,159000.0,4000.0,18400.0,,...,,6200.0,,3400.0,,,,,7500.0,6000.0
2,1125,,,1955,1,2,5051500.0,150300.0,543400.0,,...,110200.0,51100.0,41600.0,14300.0,11800.0,25300.0,,,0.0,253329.0
3,1125,,,1955,2,2,5049400.0,145200.0,551000.0,,...,122100.0,51100.0,50700.0,15800.0,18000.0,28500.0,,,0.0,237901.0
4,1125,,,1956,1,2,5353700.0,158700.0,576600.0,,...,116900.0,54100.0,44000.0,14900.0,12400.0,26600.0,,,0.0,250022.0


# Mortality

In [1]:
MORTALITY_FILE = 'Morticd10_part'  # append 1 till 5 and concatenate the 5 files together
DATA_DIR = '../data'

In [4]:
mortality_datasets = [Path(DATA_DIR, f"{MORTALITY_FILE}{str(i + 1)}") for i in range(0, 5)]
mortality_datasets = [pd.read_csv(data_path) for data_path in mortality_datasets]

  mortality_datasets = [pd.read_csv(data_path) for data_path in mortality_datasets]
  mortality_datasets = [pd.read_csv(data_path) for data_path in mortality_datasets]


In [5]:
df_mortality = pd.concat(mortality_datasets)

In [10]:
df_mortality['Sex'].isna().sum()

1

In [17]:
df_mortality[df_mortality['Deaths1'].isna()]

Unnamed: 0,Country,Admin1,SubDiv,Year,List,Cause,Sex,Frmat,IM_Frmat,Deaths1,...,Deaths21,Deaths22,Deaths23,Deaths24,Deaths25,Deaths26,IM_Deaths1,IM_Deaths2,IM_Deaths3,IM_Deaths4


In [16]:
df_mortality.dropna(subset=['Sex'], how='all')

Country             0
Admin1        4125577
SubDiv        4203112
Year                0
List                0
Cause               0
Sex                 0
Frmat               0
IM_Frmat            0
Deaths1             0
Deaths2          2242
Deaths3          2242
Deaths4        195052
Deaths5        195052
Deaths6        195052
Deaths7          2242
Deaths8          5824
Deaths9          2242
Deaths10         5824
Deaths11         2242
Deaths12         5824
Deaths13         2242
Deaths14         5824
Deaths15         2242
Deaths16         5824
Deaths17         2242
Deaths18         5824
Deaths19         2242
Deaths20         6148
Deaths21         2566
Deaths22        21274
Deaths23        21274
Deaths24       559380
Deaths25       559380
Deaths26         2242
IM_Deaths1       2243
IM_Deaths2     908028
IM_Deaths3     835895
IM_Deaths4     835895
dtype: int64