In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
# Load the cardiovascular dataset: the file is ';'-separated and its 'id'
# column is used as the DataFrame index.
cardiovascular_diseases = pd.read_csv('Data/cardio_train.csv', sep=';', index_col='id')

In [3]:
# Preview the first rows of the freshly loaded frame.
cardiovascular_diseases.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
# 'age' is divided by 365 to obtain completed years. Vectorized integer floor
# division gives the same values as flooring each quotient in a Python lambda,
# without a per-row function call.
cardiovascular_diseases['age_in_year'] = cardiovascular_diseases['age'] // 365
cardiovascular_diseases.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_in_year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50
1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55
2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51
3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48
4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47


In [5]:
# Binary flag: 1 where the 'gender' code equals 1, 0 for every other value.
gender_is_one = cardiovascular_diseases['gender'] == 1
cardiovascular_diseases['gender_f'] = gender_is_one.astype('int64')
cardiovascular_diseases.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_in_year,gender_f
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,0
4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47,1


In [6]:
# Work on an independent (deep) copy so the derived columns added below do not
# touch the loaded frame.
card = cardiovascular_diseases.copy(deep=True)

# 1. Getting the data ready for the "selection"

We need binary aggravating factors to allow the user to simply select each condition as a "yes" or "no".
In the current state, the BMI, cholesterol and the two blood pressures are categorical or numerical, hence we need to convert them to binary values. 
To do so, we will use the usual medical "categories" used to know if a certain BMI or blood pressure is normal or endangering.


## 1.1_ Blood Pressures

https://www.heart.org/en/health-topics/high-blood-pressure/understanding-blood-pressure-readings

Using this source, we know that blood pressures can be separated into the following categories:
   - NORMAL
       - < 120 mm Hg for Systolic blood pressure AND
       - < 80 mm Hg for Diastolic blood pressure
   - ELEVATED 
       - 120 – 129 mm Hg for Systolic blood pressure AND
       - < 80 mm Hg for Diastolic blood pressure
   - HIGH BLOOD PRESSURE – (HYPERTENSION) STAGE 1
       - 130 – 139 mm Hg for Systolic blood pressure OR
       - 80 – 89 mm Hg for Diastolic blood pressure
   - HIGH BLOOD PRESSURE – (HYPERTENSION) STAGE 2
       - ≥ 140 mm Hg for Systolic blood pressure OR
       - ≥ 90 mm Hg for Diastolic blood pressure
   - HYPERTENSIVE CRISIS – (consult your doctor immediately)
       - ≥ 180 mm Hg for Systolic blood pressure AND/OR
       - ≥ 120 mm Hg for Diastolic blood pressure
       
From this, we can derive 2 categories for the column __"blood_pressure"__: 
   - Normal/Acceptable Blood pressure – encoded as __0__ (meaning "not dangerous blood pressure")
       - Corresponding to categories NORMAL and ELEVATED (at risk of developing hypertension, but not yet hypertensive)
   - Hypertension, Stage 1, 2 and Crisis – encoded as __1__ (meaning "dangerous blood pressure")
       - Categories HIGH BLOOD PRESSURE and HYPERTENSIVE CRISIS

In [7]:
def categorize_blood_pressure(systolic, diastolic):
    """Encode a blood-pressure reading as a binary flag.

    Returns 0 only when ``diastolic < 80`` holds and ``systolic >= 130`` does
    not; returns 1 in every other case.
    """
    # Any reading whose diastolic value is not below 80 is flagged outright.
    if not (diastolic < 80):
        return 1
    # Diastolic is below 80: the systolic value alone decides.
    if systolic >= 130:
        return 1
    return 0

In [8]:
# Flag each row from its (systolic, diastolic) pair; the two columns are walked
# in lockstep so the result lines up with the frame's row order.
card['blood_pressure'] = [
    categorize_blood_pressure(systolic, diastolic)
    for systolic, diastolic in zip(card['ap_hi'], card['ap_lo'])
]
card.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_in_year,gender_f,blood_pressure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,0,1
1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51,1,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,0,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47,1,0


## 1.2_ BMI 

https://www.cdc.gov/healthyweight/assessing/bmi/adult_bmi/index.html

Using this source, we can divide BMI values into the following categories:
 - Underweight
    - < 18.5
 - Normal or Healthy Weight
    - 18.5 – 24.9
 - Overweight
    - 25.0 – 29.9	
 - Obese
    - ≥ 30.0
    
From this we will derive 2 columns: __"under_weight"__ and __"over_weight"__
 - under_weight
     - __1__ if BMI < 18.5 
     - __0__ otherwise
 - over_weight
     - __1__ if BMI ≥ 25.0
     - __0__ otherwise

In [9]:
def compute_BMI(weight, height):
    """Return ``weight`` divided by the square of ``height / 100``.

    The division by 100 converts ``height`` before squaring (centimetres to
    metres, presumably - confirm against the dataset description).
    """
    scaled_height = height / 100
    return weight / (scaled_height ** 2)

In [10]:
def is_underweight(weight, height):
    """Return 1 when the BMI from ``compute_BMI`` is strictly below 18.5, else 0."""
    bmi = compute_BMI(weight, height)
    if bmi < 18.5:
        return 1
    return 0

def is_overweight(weight, height):
    """Return 1 when the BMI from ``compute_BMI`` is at least 25.0, else 0."""
    bmi = compute_BMI(weight, height)
    if bmi >= 25.0:
        return 1
    return 0

In [11]:
# compute_BMI is plain arithmetic, so it can be applied to the two columns as
# whole Series in a single vectorized call instead of once per row.
card['BMI'] = compute_BMI(card['weight'], card['height'])
card.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_in_year,gender_f,blood_pressure,BMI
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,0,1,21.96712
1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,1,1,34.927679
2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51,1,1,23.507805
3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,0,1,28.710479
4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47,1,0,23.011177


In [12]:
# Derive the two binary weight flags from each row's (weight, height) pair.
card['under_weight'] = [
    is_underweight(weight, height)
    for weight, height in zip(card['weight'], card['height'])
]
card['over_weight'] = [
    is_overweight(weight, height)
    for weight, height in zip(card['weight'], card['height'])
]
card.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_in_year,gender_f,blood_pressure,BMI,under_weight,over_weight
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,0,1,21.96712,0,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,1,1,34.927679,0,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51,1,1,23.507805,0,0
3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,0,1,28.710479,0,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47,1,0,23.011177,0,0


In [13]:
# Number of rows for each value of the under_weight flag.
card['under_weight'].value_counts()

0    69350
1      650
Name: under_weight, dtype: int64

In [14]:
# Number of rows for each value of the over_weight flag.
card['over_weight'].value_counts()

1    43560
0    26440
Name: over_weight, dtype: int64

## 1.3_ Cholesterol and Glucose

For the "cholesterol" and "gluc" columns, we have categorical values:
 - 1 : normal
 - 2: above normal
 - 3: well above normal
 
We will simply "group" categories 2 & 3 to have for the __"high_chol"__ or __"high_gluc"__ column:
 - __1__ if the value was 2 or 3
 - __0__ if the value was 1

In [15]:
# Collapse the categorical levels to binary flags: 0 when the level equals 1,
# 1 for any other level.
card['high_chol'] = (card['cholesterol'] != 1).astype('int64')
card['high_gluc'] = (card['gluc'] != 1).astype('int64')
card.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_in_year,gender_f,blood_pressure,BMI,under_weight,over_weight,high_chol,high_gluc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,0,1,21.96712,0,0,0,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,1,1,34.927679,0,1,1,0
2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51,1,1,23.507805,0,0,1,0
3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,0,1,28.710479,0,1,0,0
4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47,1,0,23.011177,0,0,0,0


In [16]:
# Keep the age in years, the binary factor flags and the 'cardio' target.
# NOTE(review): the 'blood_pressure' flag derived earlier is not part of this
# selection - confirm that leaving it out is intentional.
viz_columns = [
    'age_in_year',
    'gender_f',
    'under_weight',
    'over_weight',
    'high_chol',
    'high_gluc',
    'smoke',
    'alco',
    'active',
    'cardio',
]
card_diseases = card[viz_columns].copy()
card_diseases.head()

Unnamed: 0_level_0,age_in_year,gender_f,under_weight,over_weight,high_chol,high_gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,50,0,0,0,0,0,0,0,1,0
1,55,1,0,1,1,0,0,0,1,1
2,51,1,0,0,1,0,0,0,0,1
3,48,0,0,1,0,0,0,0,1,1
4,47,1,0,0,0,0,0,0,0,0


In [17]:
# Export the selection; index=False leaves the 'id' index out of the file.
card_diseases.to_csv("Data/card_diseases_viz_1.csv", index=False)

## 1.4_ Age

We have ages ranging from 29 to 64 years. Hence we will divide people into 4 age categories:
   - age ≤ 39 years old
   - 40 ≤ age ≤ 49 years old
   - 50 ≤ age ≤ 59 years old
   - age ≥ 60 years old

In [18]:
# Number of rows for each age in years.
card_diseases['age_in_year'].value_counts()

55    3927
53    3868
57    3686
56    3607
54    3605
59    3576
49    3417
58    3409
51    3368
52    3279
50    3216
60    3200
63    2736
61    2728
62    2199
47    2197
64    2187
45    2087
43    2031
41    1903
48    1811
39    1780
46    1625
40    1622
44    1514
42    1418
29       3
30       1
Name: age_in_year, dtype: int64

In [19]:
# Largest age in years, using the Series method (consistent with the .min()
# call in the next cell) rather than iterating the Series with builtin max().
card_diseases['age_in_year'].max()

64

In [20]:
# Smallest age in years.
card_diseases['age_in_year'].min()

29

In [21]:
def categorize_age(age):
    """Map an age in years to one of four labelled age bands.

    Returns "≤ 39" below 40, "40-49" below 50, "50-59" below 60 and "≥ 60"
    for everything else.
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    bands = ((40, "≤ 39"), (50, "40-49"), (60, "50-59"))
    for upper_bound, label in bands:
        if age < upper_bound:
            return label
    return "≥ 60"

In [22]:
# Label every row with its age band; the function is passed directly, without
# a wrapping lambda.
card_diseases['age_cat'] = card_diseases['age_in_year'].map(categorize_age)
card_diseases.head()

Unnamed: 0_level_0,age_in_year,gender_f,under_weight,over_weight,high_chol,high_gluc,smoke,alco,active,cardio,age_cat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,50,0,0,0,0,0,0,0,1,0,50-59
1,55,1,0,1,1,0,0,0,1,1,50-59
2,51,1,0,0,1,0,0,0,0,1,50-59
3,48,0,0,1,0,0,0,0,1,1,40-49
4,47,1,0,0,0,0,0,0,0,0,40-49


# 2. Counting the occurrences

To build the chart we want for the first visualization, we need to know the count and/or proportion of each sub-group.

In [23]:
# Total number of rows in the selection.
total_count = card_diseases.shape[0]
total_count

70000

In [50]:
def create_occ_df(card_diseases, factors):
    """Count (factor, age category, CVD) sub-groups and export them as CSV files.

    For every factor column, the rows of ``card_diseases`` are counted per
    (factor value, ``age_cat``, ``cardio``) combination. Each per-factor table
    is printed and written to ``Data/<factor>_counts.csv``. The "from -> to"
    links produced by ``prep_viz`` for all factors are then combined, summed
    per (from, to, category) and written to ``Data/source_target.csv``.

    ``prep_viz`` must be defined before this function is called.

    Parameters
    ----------
    card_diseases : pandas.DataFrame
        Must contain ``age_cat``, ``cardio`` and one column per key of
        ``factors``.
    factors : dict
        Maps a factor column name to a ``(label_if_truthy, label_if_falsy)``
        pair that replaces the factor's values.

    Returns
    -------
    None
        Results are printed and written to disk.
    """
    # Proportions are relative to the frame that was actually passed in, not to
    # a notebook-level global, so the function stays correct on a subset.
    total_count = len(card_diseases)
    # Collect the per-factor link frames and concatenate once at the end
    # instead of growing a DataFrame inside the loop.
    link_frames = []
    for (factor, pair) in factors.items():
        print('-'*10 + factor + '-'*10)
        # One row per (factor value, age category, cardio) with its row count.
        new_df_counts = (
            card_diseases.groupby([factor, 'age_cat', 'cardio'])
            .size()
            .reset_index(name='count')
        )
        new_df_counts['prop'] = new_df_counts['count'].apply(lambda x: round((x*100.0)/total_count, 2))
        new_df_counts[factor] = new_df_counts[factor].apply(lambda x: pair[0] if(x) else pair[1])
        new_df_counts['cardio'] = new_df_counts['cardio'].apply(lambda x: "CVD" if(x) else 'no CVD')
        # Path label "<factor label>><age band>><CVD label>", built column-wise
        # so it also works when the table has no rows.
        new_df_counts['category'] = (
            new_df_counts[factor].astype(str)
            + ">" + new_df_counts['age_cat'].astype(str)
            + ">" + new_df_counts['cardio'].astype(str)
        )
        link_frames.append(prep_viz(new_df_counts, factor, pair))
        # The '%' suffix is only for the printed / exported per-factor table;
        # prep_viz above needs the numeric proportions.
        new_df_counts['prop'] = new_df_counts['prop'].apply(lambda x: str(x)+'%')
        print(new_df_counts)
        new_df_counts.to_csv(r"Data/"+factor+"_counts.csv", index = False)
        print('.')
    if link_frames:
        source_target = pd.concat(link_frames)
    else:
        # No factors given: keep the expected columns so the steps below still run.
        source_target = pd.DataFrame(columns=['from', 'to', 'category', 'value', 'prop'])
    print(source_target.head(30))
    source_target = source_target.groupby(['from', 'to', 'category']).agg({'value':'sum', 'prop':'sum'}).reset_index()
    source_target['prop'] = source_target['prop'].apply(lambda x: round(x, 2))
    print(source_target.head(30))
    source_target.to_csv(r"Data/source_target.csv", index = False)

In [51]:
def prep_viz(df_counts, factor, pair):
    """Turn a per-factor count table into stacked "from -> to" link rows.

    Two sets of links are built from ``df_counts`` and concatenated:
    factor value -> age category, then age category -> cardio label. Within
    each set, ``count`` and ``prop`` are summed per (from, to, category) and
    ``count`` is renamed to ``value``.

    ``pair`` is accepted for signature compatibility; it is not used here.
    """
    def _links(source_col, target_col):
        # Sum count/prop for every (source, target, category) combination and
        # expose the result under the generic from/to/value column names.
        subset = df_counts[[source_col, target_col, 'count', 'prop', 'category']].copy()
        summed = (
            subset
            .groupby([source_col, target_col, 'category'])
            .agg({'count':'sum', 'prop':'sum'})
            .reset_index()
        )
        return summed.rename(columns = {source_col: 'from', target_col: 'to', 'count': 'value'})

    factor_to_age = _links(factor, 'age_cat')
    age_to_cvd = _links('age_cat', 'cardio')
    return pd.concat([factor_to_age, age_to_cvd])

In [52]:
# Factor column -> (label used when the flag is truthy, label used otherwise).
factors = dict(
    gender_f=('female', 'male'),
    under_weight=('under weight', 'not under weight'),
    over_weight=('over weight', 'not over weight'),
    high_chol=('high cholesterol', 'normal cholesterol'),
    high_gluc=('high glucose', 'normal glucose'),
    smoke=('smoker', 'non smoker'),
    alco=('alcohol consumer', 'no alcohol'),
    active=('active', 'not active'),
)
# Build and export the per-factor count tables and the combined link table.
create_occ_df(card_diseases, factors)

----------gender_f----------
   gender_f age_cat  cardio  count    prop             category
0      male   40-49  no CVD   4415   6.31%    male>40-49>no CVD
1      male   40-49     CVD   3014   4.31%       male>40-49>CVD
2      male   50-59  no CVD   5549   7.93%    male>50-59>no CVD
3      male   50-59     CVD   6144   8.78%       male>50-59>CVD
4      male    ≤ 39  no CVD    498   0.71%     male>≤ 39>no CVD
5      male    ≤ 39     CVD    177   0.25%        male>≤ 39>CVD
6      male    ≥ 60  no CVD   1645   2.35%     male>≥ 60>no CVD
7      male    ≥ 60     CVD   3028   4.33%        male>≥ 60>CVD
8    female   40-49  no CVD   7762  11.09%  female>40-49>no CVD
9    female   40-49     CVD   4434   6.33%     female>40-49>CVD
10   female   50-59  no CVD  11637  16.62%  female>50-59>no CVD
11   female   50-59     CVD  12211  17.44%     female>50-59>CVD
12   female    ≤ 39  no CVD    854   1.22%   female>≤ 39>no CVD
13   female    ≤ 39     CVD    255   0.36%      female>≤ 39>CVD
14   female

                alco age_cat  cardio  count    prop  \
0         no alcohol   40-49  no CVD  11439  16.34%   
1         no alcohol   40-49     CVD   6980   9.97%   
2         no alcohol   50-59  no CVD  16276  23.25%   
3         no alcohol   50-59     CVD  17398  24.85%   
4         no alcohol    ≤ 39  no CVD   1247   1.78%   
5         no alcohol    ≤ 39     CVD    405   0.58%   
6         no alcohol    ≥ 60  no CVD   4118   5.88%   
7         no alcohol    ≥ 60     CVD   8373  11.96%   
8   alcohol consumer   40-49  no CVD    738   1.05%   
9   alcohol consumer   40-49     CVD    468   0.67%   
10  alcohol consumer   50-59  no CVD    910    1.3%   
11  alcohol consumer   50-59     CVD    957   1.37%   
12  alcohol consumer    ≤ 39  no CVD    105   0.15%   
13  alcohol consumer    ≤ 39     CVD     27   0.04%   
14  alcohol consumer    ≥ 60  no CVD    188   0.27%   
15  alcohol consumer    ≥ 60     CVD    371   0.53%   

                         category  
0         no alcohol>40-49>n