# Распределение причин смерти среди детей в возрасте до 5 лет (%)

In [1]:
import sys
# Добавим папку с корнем проекта в список системных директорий, чтобы Python видел путь к папке utils
sys.path.append('..')

In [2]:
import pandas as pd

from utils.constants import F

from warnings import simplefilter
simplefilter('ignore')

Эти данные можно найти на сайте ВОЗ по ссылке \
https://www.who.int/data/gho/data/indicators/indicator-details/GHO/distribution-of-causes-of-death-among-children-aged-5-years-(-)

Данные на сайте представлены только за 2000-2017 гг. \
Поэтому не будем их объединять с основной таблицей в файле [01_main.ipynb](./01_main.ipynb), так как в ней собираются данные за 2000-2020 гг.\
Предикторами эти данные все равно не будут, так как из факторов смертности складывается значение целевой переменной ожидаемой продолжительности жизни. Если их использовать как предикторы, это будет утечка данных.\
Используем их позже для исследований.

In [3]:
# Загрузим таблицу
data = pd.read_csv('../data/who_child_0_5_mortality.csv')

data.head(2)

Unnamed: 0,IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,...,FactValueUoM,FactValueNumericLowPrefix,FactValueNumericLow,FactValueNumericHighPrefix,FactValueNumericHigh,Value,FactValueTranslationID,FactComments,Language,DateModified
0,MORT_300,Distribution of causes of death among children...,numeric,EMR,Eastern Mediterranean,Country,AFG,Afghanistan,Year,2017,...,,,,,,0.0,,,EN,2018-11-26T21:00:00.000Z
1,MORT_300,Distribution of causes of death among children...,numeric,EMR,Eastern Mediterranean,Country,AFG,Afghanistan,Year,2017,...,,,,,,0.0,,,EN,2018-11-26T21:00:00.000Z


Описание нужных нам столбцов.

- `ParentLocationCode` - код региона
- `ParentLocation` - название региона
- `SpatialDimValueCode` - код страны
- `Location` - название страны
- `Period` - год
- `Dim2` - описание причины  
- `Dim2ValueCode` - код причины
- `FactValueNumeric` - распределение причин смерти среди детей в возрасте до 5 лет (%)

Оставим только их.

In [4]:
data = data[[
    'ParentLocationCode',
    'ParentLocation',
    'SpatialDimValueCode',
    'Location',
    'Period',
    'Dim2',
    'Dim2ValueCode',
    'FactValueNumeric',
]]

data.head(3)

Unnamed: 0,ParentLocationCode,ParentLocation,SpatialDimValueCode,Location,Period,Dim2,Dim2ValueCode,FactValueNumeric
0,EMR,Eastern Mediterranean,AFG,Afghanistan,2017,HIV/AIDS,CHILDCAUSE_CH2,0.0002
1,EMR,Eastern Mediterranean,AFG,Afghanistan,2017,Tetanus,CHILDCAUSE_CH5,0.016
2,EMR,Eastern Mediterranean,AFG,Afghanistan,2017,Measles,CHILDCAUSE_CH6,0.018


Посмотрим на типы столбцов.

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48888 entries, 0 to 48887
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ParentLocationCode   48888 non-null  object 
 1   ParentLocation       48888 non-null  object 
 2   SpatialDimValueCode  48888 non-null  object 
 3   Location             48888 non-null  object 
 4   Period               48888 non-null  int64  
 5   Dim2                 48888 non-null  object 
 6   Dim2ValueCode        48888 non-null  object 
 7   FactValueNumeric     48888 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 3.0+ MB


Типы столбцов соответствуют своему содержимому.

Посмотрим на наличие пропусков.

In [6]:
data.isna().sum()

ParentLocationCode     0
ParentLocation         0
SpatialDimValueCode    0
Location               0
Period                 0
Dim2                   0
Dim2ValueCode          0
FactValueNumeric       0
dtype: int64

Пропусков нет.

In [7]:
# Посмотрим за какие года представлены данные и запомним эти значения
year_min = data['Period'].min()
year_max = data['Period'].max()

print(f"Данные представлены за {year_min} - {year_max} гг.")

Данные представлены за 2000 - 2017 гг.


In [8]:
# Составим список с кодами причин смертности
cause_codes = list(data['Dim2ValueCode'].unique())
print('Итоговое количество причин', len(cause_codes))

# Составим список кодов стран
location_codes = list(data['SpatialDimValueCode'].unique())
print('Итоговое количество стран', len(location_codes))

Итоговое количество причин 14
Итоговое количество стран 194


Создадим таблицу, в которой значения смертности сделаем столбцами.

In [9]:
cause_table_columns = ['SpatialDimValueCode', 'Period'] + cause_codes

cause_data = pd.DataFrame(columns=cause_table_columns)
cause_data

Unnamed: 0,SpatialDimValueCode,Period,CHILDCAUSE_CH2,CHILDCAUSE_CH5,CHILDCAUSE_CH6,CHILDCAUSE_CH7,CHILDCAUSE_CH8,CHILDCAUSE_CH12,CHILDCAUSE_CH3,CHILDCAUSE_CH11,CHILDCAUSE_CH13,CHILDCAUSE_CH17,CHILDCAUSE_CH9,CHILDCAUSE_CH16,CHILDCAUSE_CH15,CHILDCAUSE_CH10


In [10]:
# Заполним таблицу
def get_cause_value(SpatialDimValueCode: str, Period: int, Dim2ValueCode: str) -> float:
    """Возвращает процент смертей для переданных параметров

    Args:
        SpatialDimValueCode (str): код страны
        Period (int): год
        Dim2ValueCode (str): код причины

    Returns:
        float: процент смертей для переданных параметров
    """
    # Создаем маску для переданных параметров
    mask = (data['SpatialDimValueCode'] == SpatialDimValueCode) & \
        (data['Period'] == Period) & \
        (data['Dim2ValueCode'] == Dim2ValueCode)
        
    # Вернем значение для переданных параметров
    return float(data[mask]['FactValueNumeric'].values[0])


# Цикл по годам 
for year in range(year_min, year_max+1):
    # Создадим пустую таблицу для данного года
    year_data = pd.DataFrame(columns=cause_table_columns)
    # Заполним столбец с кодами стран
    year_data['SpatialDimValueCode'] = location_codes
    # Заполним столбец с годом
    year_data['Period'] = year
    
    # Теперь заполним столбцы с причинами    
    for cause_field in cause_codes:
        year_data[cause_field] = year_data.apply(
            lambda x: get_cause_value(
                x['SpatialDimValueCode'],
                year,
                cause_field,
            ), axis=1
        )
    
    # Присоединим полученную таблицу к основной
    cause_data = pd.concat(
        [cause_data, year_data], 
        ignore_index=True
    )

# Переведем столбцы с причинами в числовые
cause_data[cause_codes] = cause_data[cause_codes].apply(pd.to_numeric)

# Посмотрим на результат
cause_data.head()

Unnamed: 0,SpatialDimValueCode,Period,CHILDCAUSE_CH2,CHILDCAUSE_CH5,CHILDCAUSE_CH6,CHILDCAUSE_CH7,CHILDCAUSE_CH8,CHILDCAUSE_CH12,CHILDCAUSE_CH3,CHILDCAUSE_CH11,CHILDCAUSE_CH13,CHILDCAUSE_CH17,CHILDCAUSE_CH9,CHILDCAUSE_CH16,CHILDCAUSE_CH15,CHILDCAUSE_CH10
0,AFG,2000,0.0001,0.086,0.084,0.044,0.002,0.038,0.12,0.13,0.061,0.039,0.18,0.036,0.036,0.14
1,ALB,2000,0.0,0.003,0.001,0.039,0.0,0.02,0.029,0.059,0.056,0.1,0.2,0.087,0.2,0.2
2,DZA,2000,0.0002,0.004,0.045,0.017,0.0,0.062,0.082,0.12,0.049,0.053,0.16,0.077,0.11,0.23
3,AND,2000,0.0,0.0,0.0,0.002,0.0,0.0,0.0003,0.0,0.003,0.001,0.005,0.16,0.16,0.66
4,AGO,2000,0.007,0.016,0.011,0.052,0.098,0.029,0.2,0.095,0.091,0.036,0.21,0.027,0.038,0.087


In [11]:
# Посмотрим на типы столбцов
cause_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3492 entries, 0 to 3491
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   SpatialDimValueCode  3492 non-null   object 
 1   Period               3492 non-null   object 
 2   CHILDCAUSE_CH2       3492 non-null   float64
 3   CHILDCAUSE_CH5       3492 non-null   float64
 4   CHILDCAUSE_CH6       3492 non-null   float64
 5   CHILDCAUSE_CH7       3492 non-null   float64
 6   CHILDCAUSE_CH8       3492 non-null   float64
 7   CHILDCAUSE_CH12      3492 non-null   float64
 8   CHILDCAUSE_CH3       3492 non-null   float64
 9   CHILDCAUSE_CH11      3492 non-null   float64
 10  CHILDCAUSE_CH13      3492 non-null   float64
 11  CHILDCAUSE_CH17      3492 non-null   float64
 12  CHILDCAUSE_CH9       3492 non-null   float64
 13  CHILDCAUSE_CH16      3492 non-null   float64
 14  CHILDCAUSE_CH15      3492 non-null   float64
 15  CHILDCAUSE_CH10      3492 non-null   f

Типы столбцов соответствуют своему содержимому.

Создадим таблицу с описанием кодов причин смертности.

In [12]:
cause_codes_data = data.groupby(['Dim2ValueCode']).first().reset_index()
cause_codes_data = cause_codes_data[['Dim2ValueCode', 'Dim2']]
cause_codes_data.head(3)

Unnamed: 0,Dim2ValueCode,Dim2
0,CHILDCAUSE_CH10,Prematurity
1,CHILDCAUSE_CH11,Birth asphyxia and birth trauma
2,CHILDCAUSE_CH12,Sepsis and other infectious conditions of the ...


Переименуем столбцы.

In [13]:
cause_codes_data.rename(
    columns={ 'Dim2ValueCode': 'WhoCauseCode', 'Dim2': 'Description' },
    inplace=True,
)
cause_codes_data.head(3)

Unnamed: 0,WhoCauseCode,Description
0,CHILDCAUSE_CH10,Prematurity
1,CHILDCAUSE_CH11,Birth asphyxia and birth trauma
2,CHILDCAUSE_CH12,Sepsis and other infectious conditions of the ...


In [14]:
# Соберем словарь с соответствием кода причины и имен новых столбцов
rename_dict = {
    'CHILDCAUSE_CH2': 'ChildUnder5Mortality2',
    'CHILDCAUSE_CH3': 'ChildUnder5Mortality3',
    'CHILDCAUSE_CH5': 'ChildUnder5Mortality5',
    'CHILDCAUSE_CH6': 'ChildUnder5Mortality6',
    'CHILDCAUSE_CH7': 'ChildUnder5Mortality7',
    'CHILDCAUSE_CH8': 'ChildUnder5Mortality8',
    'CHILDCAUSE_CH9': 'ChildUnder5Mortality9',
    'CHILDCAUSE_CH10': 'ChildUnder5Mortality10',
    'CHILDCAUSE_CH11': 'ChildUnder5Mortality11',
    'CHILDCAUSE_CH12': 'ChildUnder5Mortality12',
    'CHILDCAUSE_CH13': 'ChildUnder5Mortality13',
    'CHILDCAUSE_CH15': 'ChildUnder5Mortality15',
    'CHILDCAUSE_CH16': 'ChildUnder5Mortality16',
    'CHILDCAUSE_CH17': 'ChildUnder5Mortality17',
}

# Добавим имя нового столбца в таблицу
cause_codes_data['CauseCode'] = cause_codes_data['WhoCauseCode'].apply(
    lambda who_code: rename_dict[who_code]
)
cause_codes_data.head(3)

Unnamed: 0,WhoCauseCode,Description,CauseCode
0,CHILDCAUSE_CH10,Prematurity,ChildUnder5Mortality10
1,CHILDCAUSE_CH11,Birth asphyxia and birth trauma,ChildUnder5Mortality11
2,CHILDCAUSE_CH12,Sepsis and other infectious conditions of the ...,ChildUnder5Mortality12


In [15]:
# Сохраним полученную таблицу
cause_codes_data.to_csv('../data/child_0_5_mortality_codes.csv', index=False)

Переименуем столбцы в таблице с данными о смертности.

In [16]:
# В нашей таблице переименуем столбцы 
cause_data.rename(
    columns=rename_dict,
    inplace=True,
)
cause_data.head(3)

Unnamed: 0,SpatialDimValueCode,Period,ChildUnder5Mortality2,ChildUnder5Mortality5,ChildUnder5Mortality6,ChildUnder5Mortality7,ChildUnder5Mortality8,ChildUnder5Mortality12,ChildUnder5Mortality3,ChildUnder5Mortality11,ChildUnder5Mortality13,ChildUnder5Mortality17,ChildUnder5Mortality9,ChildUnder5Mortality16,ChildUnder5Mortality15,ChildUnder5Mortality10
0,AFG,2000,0.0001,0.086,0.084,0.044,0.002,0.038,0.12,0.13,0.061,0.039,0.18,0.036,0.036,0.14
1,ALB,2000,0.0,0.003,0.001,0.039,0.0,0.02,0.029,0.059,0.056,0.1,0.2,0.087,0.2,0.2
2,DZA,2000,0.0002,0.004,0.045,0.017,0.0,0.062,0.082,0.12,0.049,0.053,0.16,0.077,0.11,0.23


In [17]:
# Сохраним полученную таблицу
cause_data.to_csv('../data/who_child_0_5_mortality_prepared.csv', index=False)