In [1]:
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List
from functools import partial, reduce

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
DATA_CLEANED_DIR = Path('../data/cleaned/')

In [3]:
# import files
df_country_codes = pd.read_csv(Path(DATA_CLEANED_DIR, 'country_codes.csv'))
df_life_exp = pd.read_csv(Path(DATA_CLEANED_DIR, 'life_expectancy.csv'))
df_mortality = pd.read_csv(Path(DATA_CLEANED_DIR, 'mortality.csv'))
df_population = pd.read_csv(Path(DATA_CLEANED_DIR, 'population.csv'))

  exec(code_obj, self.user_global_ns, self.user_ns)


# Netherlands
We first want to visualize the data of The Netherlands

In [4]:
df_country_codes.head(1)
df_country_codes[df_country_codes['country'] == 'Netherlands']

Unnamed: 0,country code,country
186,4210,Netherlands


As we can see, the country code of The Netherlands is `4210`. Therefore we will filter the *life expectancy*, *mortality* and *population datasets on this country code.

In [5]:
NL_CODE = 4210

At the moment the **Life expectancy** dataset contains only data for the Netherlands. The data is obtained from https://www.who.int/data/maternal-newborn-child-adolescent-ageing/indicator-explorer-new/mca/life-expectancy-at-birth. Due to the fact that the *Export* button did not work at that time, I copied it manually in the dataset. 

In [6]:
nl_life_exp = df_life_exp[['year', 'sex', 'life expectancy [age]']]
nl_mortality = df_mortality[df_mortality['country code'] == NL_CODE]
nl_population = df_population[df_population['country code'] == NL_CODE]

So it is best to merge them into one single file. In this way we can standardize the values. You want to merge them on `year` and `sex`.

Coding: 
`{1: male, 2: female, 3: both sexes}`

In [7]:
nl_life_exp.sex.unique()

array([3, 2, 1], dtype=int64)

In [8]:
nl_mortality.sex.unique()

array([1, 2], dtype=int64)

In [9]:
nl_population.sex.unique()

array([1, 2], dtype=int64)

#### It is first important to create an aggregate of sexes for the *mortality* and *population* dataset.

In [10]:
def generate_ICD_codes(lower, upper, symbol):
    codes = []
    for i in range(lower, upper + 1, 1):
        if i < 10:
            codes.append(f'{symbol}0{i}')
        else:
            codes.append(f'{symbol}{i}')

    return np.array(codes)


def convert_format(series, n=3):
    """Only keep the n first characters of the column"""
    return series.apply(lambda x: x[:n])


def filter_column(df: pd.DataFrame, column: str, elements):
    """


    :param df:
    :param column:
    :param elements:
    :return:
    """
    target = df[column].unique()
    found_elements = find_elements(target, elements)
    dataset = df[df[column].isin(found_elements)]

    return dataset


def groupby_sum(df, by, on):
    """Groups the dataframe by the index columns, and sums the target column and
    returns the result as a dataframe."""

    grouped = df.groupby(by, as_index=False)[on].sum()

    return grouped


def find_elements(target, elements):
    """
    Checks if the targets can be found in an arbitrary list of elements.

    :param elements:
    :param target:
    :return:
    """
    mask = np.isin(target, elements)
    found = np.where(mask, target, '')
    valid = [c for c in found if c != '']

    return valid

In [11]:
class Selector(ABC):

    def __init__(self, file):
        self.file = file

    @abstractmethod
    def get_selection(self):
        pass


class DataFrameSelector(Selector):

    def __init__(self, df):
        if not isinstance(df, pd.DataFrame):
            raise ValueError('Expects a pandas DataFrame.')
        super().__init__(df)

        self.selection = {}

    def filter_column(self, column: str, elements):
        target = self.file[column].unique()
        found_elements = find_elements(target, elements)
        dataset = self.file[self.file[column].isin(found_elements)]

        return dataset

    def split_dataframe(self, column: str, labels: List[str], selection: List[np.ndarray]):
        """Expects labels as keys and the selection to be the string to select the dataframe on.

        if unique elements are given then it searches for elements found in the selection and the unique list
        """

        for label, selector in zip(labels, selection):
            dataset = self.filter_column(column, selector)
            self.selection[label] = dataset
            
    def rename_selection(self, column, mapping):
        for k, df in self.selection.items():
            self.selection[k] = df.rename(columns={column: mapping[k]})

    def get_selection(self):
        return self.selection

In [12]:
class Aggregator:

    def __init__(self, df):
        self.df = df
        self.aggregation = None

    def handler(self, df):
        if self.aggregation is None:
            self.aggregation = df
        else:
            self.aggregation = self.aggregation.append(df)

    def calc_aggr(self, by, on, column='', value=None):
        # calculate aggregate and append it to the original
        if not column and not value:
            self.handler(groupby_sum(self.df, by, on))
            return

        aggr = groupby_sum(self.df, by, on)
        aggr[column] = value

        self.handler(aggr)

    def get_aggregation(self, sort_by):
        return self.aggregation.sort_values(sort_by).reset_index(drop=True)

In [34]:
C_codes = generate_ICD_codes(0, 97, 'C')
I_codes = generate_ICD_codes(5, 99, 'I')
E_codes = generate_ICD_codes(10, 13, 'E')
J_codes = generate_ICD_codes(40, 47, 'J')
K_codes = generate_ICD_codes(0, 93, 'K')

code_to_name_map = {
    'C': 'cancer [deaths]',
    'I': 'cardiovascular disease [deaths]',
    'E': 'diabetes mellitus [deaths]',
    'J': 'chronic respiratory diseases [deaths]',
    'K': 'diseases of digestive system [deaths]'
}

name_to_code_map = {
    'cancer [deaths]': 'C',
    'cardiovascular disease [deaths]': 'I',
    'diabetes mellitus [deaths]': 'E',
    'chronic respiratory diseases [deaths]': 'J',
    'diseases of digestive system [deaths]': 'K',
}

code_map = {
    'C': 'C',
    'I': 'I',
    'E': 'E',
    'J': 'J',
    'K': 'K'
}

codes = [C_codes, I_codes, E_codes, J_codes, K_codes]
labels = ['C', 'I', 'E', 'J', 'K']

In [14]:
nl_causes = convert_format(nl_mortality['cause'], 3)
nl_mortality.loc[:, 'cause'] = nl_causes

nl_unique_causes = nl_causes.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [15]:
# Select correct data
mortality_selector = DataFrameSelector(nl_mortality)
mortality_selector.split_dataframe('cause', labels, codes)
mortality_selector.rename_selection('deaths', code_to_name_map)
mortality_sets = mortality_selector.get_selection()

# create aggregates
for k, df in mortality_sets.items():
    aggr = Aggregator(df)
    aggr.calc_aggr(by=['year', 'sex'], on=code_to_name_map[k])
    aggr.calc_aggr(by=['year'], on=code_to_name_map[k], column='sex', value=3)
    
    mortality_sets[k] = aggr.get_aggregation(sort_by='year')

In [24]:
mortality_sets

{'C':     year  sex  cancer [deaths]
 0   1996    1            20754
 1   1996    2            16466
 2   1996    3            37220
 3   1997    1            20420
 4   1997    2            16713
 ..   ...  ...              ...
 64  2017    1            24547
 65  2017    3            44908
 66  2018    1            24363
 67  2018    2            20407
 68  2018    3            44770
 
 [69 rows x 3 columns],
 'I':     year  sex  cardiovascular disease [deaths]
 0   1996    1                            25205
 1   1996    2                            26102
 2   1996    3                            51307
 3   1997    1                            24309
 4   1997    2                            25448
 ..   ...  ...                              ...
 64  2017    1                            18093
 65  2017    3                            38145
 66  2018    1                            18264
 67  2018    2                            19520
 68  2018    3                            37784
 
 [

### Population 
Population is already properly formatted, thus no need to make a selection.


In [16]:
aggr = Aggregator(nl_population)
aggr.calc_aggr(by=['year', 'sex'], on='population')
aggr.calc_aggr(by=['year'], on='population', column='sex', value=3)
nl_pop_agg = aggr.get_aggregation(sort_by='year')
nl_pop_agg

Unnamed: 0,year,sex,population
0,1950,1,5041000.0
1,1950,2,5072500.0
2,1950,3,10113500.0
3,1951,1,5114800.0
4,1951,2,5149500.0
...,...,...,...
202,2017,2,8632105.0
203,2017,3,17133498.0
204,2018,1,8597564.0
205,2018,2,8718843.0


## Combine all 

In [28]:
nl_pop_agg = nl_pop_agg[nl_pop_agg['year'] > 1995]
nl_life_exp = nl_life_exp[(nl_life_exp['year'] > 1995) & (nl_life_exp['year'] < 2019)]

In [29]:
nl_pop_agg.head()

Unnamed: 0,year,sex,population
138,1996,2,7851000.0
139,1996,1,7679500.0
140,1996,3,15530500.0
141,1997,1,7718400.0
142,1997,3,15610600.0


In [30]:
nl_life_exp.head()

Unnamed: 0,year,sex,life expectancy [age]
137,1996,3,77.642
139,1996,2,80.378
140,1996,1,74.783
141,1997,1,74.963
142,1998,3,77.889


In [31]:
dfs = [df for df in mortality_sets.values()]
dfs.append(nl_pop_agg)
dfs.append(nl_life_exp)

In [32]:
def multi_merge(dfs, on):
    merge = partial(pd.merge, on=on, how='outer')
    dataset = reduce(merge, dfs)
    
    return dataset

In [33]:
dataset = multi_merge(dfs, on=['year', 'sex']).sort_values('year')
dataset

Unnamed: 0,year,sex,cancer [deaths],cardiovascular disease [deaths],diabetes mellitus [deaths],chronic respiratory diseases [deaths],diseases of digestive system [deaths],population,life expectancy [age]
0,1996,1,20754,25205,382,4322,2206,7679500.0,74.783
1,1996,2,16466,26102,514,2166,2951,7851000.0,80.378
2,1996,3,37220,51307,896,6488,5157,15530500.0,77.642
3,1997,1,20420,24309,339,4096,2128,7718400.0,74.963
4,1997,2,16713,25448,561,2260,2883,7892200.0,80.435
...,...,...,...,...,...,...,...,...,...
64,2017,1,24547,18093,454,3521,2090,8501393.0,80.244
65,2017,3,44908,38145,928,7016,4475,17133498.0,82.004
67,2018,2,20407,19520,463,3572,2465,8718843.0,83.838
66,2018,1,24363,18264,475,3513,2258,8597564.0,80.419


In [35]:
dataset['non-communicable chronic disease [deaths]'] = 0
for disease in name_to_code_map.keys():
    dataset['non-communicable chronic disease [deaths]'] += dataset[disease]

In [36]:
dataset

Unnamed: 0,year,sex,cancer [deaths],cardiovascular disease [deaths],diabetes mellitus [deaths],chronic respiratory diseases [deaths],diseases of digestive system [deaths],population,life expectancy [age],non-communicable chronic disease [deaths]
0,1996,1,20754,25205,382,4322,2206,7679500.0,74.783,52869
1,1996,2,16466,26102,514,2166,2951,7851000.0,80.378,48199
2,1996,3,37220,51307,896,6488,5157,15530500.0,77.642,101068
3,1997,1,20420,24309,339,4096,2128,7718400.0,74.963,51292
4,1997,2,16713,25448,561,2260,2883,7892200.0,80.435,47865
...,...,...,...,...,...,...,...,...,...,...
64,2017,1,24547,18093,454,3521,2090,8501393.0,80.244,48705
65,2017,3,44908,38145,928,7016,4475,17133498.0,82.004,95472
67,2018,2,20407,19520,463,3572,2465,8718843.0,83.838,46427
66,2018,1,24363,18264,475,3513,2258,8597564.0,80.419,48873


In [None]:
dataset.to_csv