# 2.3

In [1]:
import pandas as pd

In [2]:
cardio_raw = pd.get_dummies(pd.read_csv("./data/cardio_train.csv", sep=";", dtype={'gender':'category'}).drop(columns=['id']), drop_first=True)
cardio_raw.head(1)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,gender_2
0,18393,168,62.0,110,80,1,1,0,0,1,0,1


In [3]:
height_in_m = cardio_raw.query('height > 147 & height < 250')['height'].apply(lambda height: height * 0.01)

height_weight_cleaned = cardio_raw.drop(columns=['height']).join(height_in_m).dropna()

bmi_raw = cardio_raw.join(height_weight_cleaned.apply(lambda row: 1.3 * row.weight / pow(row.height, 2.5), axis=1).rename('BMI')).dropna()

bmi_cleaned = bmi_raw.drop(bmi_raw.query('BMI < 16 | BMI > 60').index)
cardio_raw_with_bmi = bmi_cleaned.join(pd.cut(
    bmi_cleaned["BMI"],
    [0, 18.4, 24.9, 29.9, 34.9, 39.9, 100],
    labels=[
        1,
        2,
        3,
        4,
        5,
        6,
    ],
).rename('BMI Category'))
cardio_raw_with_bmi.head(1)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,gender_2,BMI,BMI Category
0,18393,168,62.0,110,80,1,1,0,0,1,0,1,22.032402,2


In [4]:
blood_pressure_cleaned = cardio_raw_with_bmi.drop(
    pd.concat(
        [
            cardio_raw_with_bmi.query("ap_lo < 50"),
            cardio_raw_with_bmi.query("ap_lo > 200"),
            cardio_raw_with_bmi.query("ap_hi < 60"),
            cardio_raw_with_bmi.query("ap_hi > 240"),
        ]
    ).index
)
def blood_pressure_category(row):
    if (row['ap_hi'] < 120) and (row['ap_lo'] < 80):
        return 1
    if (row['ap_hi'] <= 129) and (row['ap_lo'] < 80):
        return 2
    if (row['ap_hi'] <= 139) or (row['ap_lo'] <= 89):
        return 3
    if (row['ap_lo'] <= 179) or (row['ap_lo'] <= 119):
        return 4
    if (row['ap_hi'] >= 180) or (row['ap_lo'] >= 120):
        return 5

cardio_cleaned_with_new_categories = blood_pressure_cleaned.join(pd.DataFrame(blood_pressure_cleaned.apply(blood_pressure_category, axis=1).rename('Blood Pressure Category').astype('category')))

cardio_cleaned_with_new_categories.head(1)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,gender_2,BMI,BMI Category,Blood Pressure Category
0,18393,168,62.0,110,80,1,1,0,0,1,0,1,22.032402,2,3


In [5]:
categorial_dataset = pd.get_dummies(cardio_cleaned_with_new_categories.drop(columns=['ap_hi', 'ap_lo', 'height', 'weight', 'BMI']), drop_first=True)
categorial_dataset.head()

Unnamed: 0,age,cholesterol,gluc,smoke,alco,active,cardio,gender_2,BMI Category_2,BMI Category_3,BMI Category_4,BMI Category_5,BMI Category_6,Blood Pressure Category_2,Blood Pressure Category_3,Blood Pressure Category_4,Blood Pressure Category_5
0,18393,1,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0
1,20228,3,1,0,0,1,1,0,0,0,0,1,0,0,0,1,0
2,18857,3,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0
3,17623,1,1,0,0,1,1,1,0,1,0,0,0,0,0,1,0
4,17474,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [6]:
continuous_dataset = cardio_cleaned_with_new_categories.drop(columns=['Blood Pressure Category', 'BMI Category', 'height', 'weight'])
continuous_dataset.head()

Unnamed: 0,age,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,gender_2,BMI
0,18393,110,80,1,1,0,0,1,0,1,22.032402
1,20228,140,90,3,1,0,0,1,1,0,36.353881
2,18857,130,70,3,1,0,0,0,1,0,23.791042
3,17623,150,100,1,1,0,0,1,1,1,28.710479
4,17474,100,60,1,1,0,0,0,0,0,23.950792


In [7]:
full_dataset = cardio_cleaned_with_new_categories

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,gender_2,BMI,BMI Category,Blood Pressure Category
0,18393,168,62.0,110,80,1,1,0,0,1,0,1,22.032402,2,3
1,20228,156,85.0,140,90,3,1,0,0,1,1,0,36.353881,5,4
2,18857,165,64.0,130,70,3,1,0,0,0,1,0,23.791042,2,3
3,17623,169,82.0,150,100,1,1,0,0,1,1,1,28.710479,3,4
4,17474,156,56.0,100,60,1,1,0,0,0,0,0,23.950792,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,168,76.0,120,80,1,1,1,0,1,0,1,27.007460,3,3
69996,22601,158,126.0,140,90,2,2,0,0,1,1,0,52.200081,6,4
69997,19066,183,105.0,180,90,3,1,0,1,0,1,1,30.130402,4,4
69998,22431,163,72.0,135,80,1,2,0,0,0,1,0,27.593503,3,3


In [8]:
reduced_dataset = cardio_cleaned_with_new_categories.drop(columns=['Blood Pressure Category', 'BMI Category', 'height', 'weight', 'active', 'alco', 'smoke', 'gender_2']).head()

Unnamed: 0,age,ap_hi,ap_lo,cholesterol,gluc,cardio,BMI
0,18393,110,80,1,1,0,22.032402
1,20228,140,90,3,1,1,36.353881
2,18857,130,70,3,1,1,23.791042
3,17623,150,100,1,1,1,28.710479
4,17474,100,60,1,1,0,23.950792
