# 2.3

In [61]:
import pandas as pd

In [62]:
# Load the semicolon-separated training data. 'gluc' and 'cholesterol' are
# parsed as categoricals; the id, alco, active, smoke and gender columns are
# dropped right after loading.
cardio_raw = pd.read_csv(
    "./data/cardio_train.csv",
    sep=";",
    dtype={"gluc": "category", "cholesterol": "category"},
).drop(columns=["id", "alco", "active", "smoke", "gender"])

In [63]:
# Keep only rows with 147 < height < 250 and scale by 0.01 (centimetres to
# metres, going by the variable name). Rows outside that range are simply
# absent from this Series, so they become NaN in the join below and are
# removed by dropna().
height_in_m = cardio_raw.query('height > 147 & height < 250')['height'] * 0.01

height_weight_cleaned = cardio_raw.drop(columns=['height']).join(height_in_m).dropna()

# BMI variant used by this notebook: 1.3 * weight / height ** 2.5.
# Computed column-wise rather than with a row-wise apply(axis=1): the
# arithmetic is the same, but it avoids building one Python-level row object
# per record and still yields a Series when the frame is empty.
bmi_values = (
    1.3 * height_weight_cleaned['weight'] / height_weight_cleaned['height'] ** 2.5
).rename('BMI')

# Joining back onto cardio_raw keeps the original 'height' column; rows that
# got no BMI (filtered-out heights) are NaN here and are dropped.
bmi_raw = cardio_raw.join(bmi_values).dropna()

# Discard rows whose BMI is below 16 or above 60.
bmi_cleaned = bmi_raw.drop(bmi_raw.query('BMI < 16 | BMI > 60').index)

# pd.cut uses right-closed intervals by default, so the categories are
# (0, 18.4] -> 1, (18.4, 24.9] -> 2, (24.9, 29.9] -> 3, (29.9, 34.9] -> 4,
# (34.9, 39.9] -> 5 and (39.9, 100] -> 6.
cardio_raw_with_bmi = bmi_cleaned.join(
    pd.cut(
        bmi_cleaned["BMI"],
        [0, 18.4, 24.9, 29.9, 34.9, 39.9, 100],
        labels=[1, 2, 3, 4, 5, 6],
    ).rename('BMI Category')
)
cardio_raw_with_bmi.head(1)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cholesterol,gluc,cardio,BMI,BMI Category
0,18393,168,62.0,110,80,1,1,0,22.032402,2


In [64]:
# Flag rows whose blood-pressure readings fall outside the accepted ranges
# (ap_lo below 50 or above 200, ap_hi below 60 or above 240) and drop them
# by index label.
implausible_pressure = (
    (cardio_raw_with_bmi["ap_lo"] < 50)
    | (cardio_raw_with_bmi["ap_lo"] > 200)
    | (cardio_raw_with_bmi["ap_hi"] < 60)
    | (cardio_raw_with_bmi["ap_hi"] > 240)
)
blood_pressure_cleaned = cardio_raw_with_bmi.drop(
    cardio_raw_with_bmi.index[implausible_pressure]
)
def blood_pressure_category(row):
    """Map one record's blood-pressure readings to an ordinal category 1-5.

    The category is decided by the worse of the two readings: a row only
    stays in a category when BOTH readings are below that category's upper
    bounds, otherwise it moves on to the next, more severe one.

    NOTE(review): the thresholds look like the AHA blood-pressure chart with
    ap_hi as systolic and ap_lo as diastolic -- confirm against the course
    material.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing the numeric
            entries 'ap_hi' and 'ap_lo'.

    Returns:
        1 if ap_hi < 120 and ap_lo < 80,
        2 if ap_hi < 130 and ap_lo < 80,
        3 if ap_hi < 140 and ap_lo < 90,
        4 if ap_hi < 180 and ap_lo < 120,
        5 if ap_hi >= 180 or ap_lo >= 120,
        None if no comparison succeeds (e.g. a reading is NaN).
    """
    hi = row['ap_hi']
    lo = row['ap_lo']
    if hi < 120 and lo < 80:
        return 1
    if hi < 130 and lo < 80:
        return 2
    # Both readings must be under the bound; joining the upper bounds with
    # "or" would let a single low reading hide a high one.
    if hi < 140 and lo < 90:
        return 3
    # The upper reading (ap_hi) is compared against 180 here, not ap_lo.
    if hi < 180 and lo < 120:
        return 4
    if hi >= 180 or lo >= 120:
        return 5
    # Only reachable when the comparisons above all fail, e.g. NaN readings.
    return None

# Classify every cleaned row with blood_pressure_category, store the result
# as a categorical column named 'Blood Pressure Category', and attach it to
# the cleaned frame by index.
blood_pressure_categories = pd.DataFrame(
    blood_pressure_cleaned
    .apply(blood_pressure_category, axis=1)
    .rename('Blood Pressure Category')
    .astype('category')
)
cardio_cleaned_with_new_categories = blood_pressure_cleaned.join(blood_pressure_categories)

cardio_cleaned_with_new_categories.head(1)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cholesterol,gluc,cardio,BMI,BMI Category,Blood Pressure Category
0,18393,168,62.0,110,80,1,1,0,22.032402,2,3


In [79]:
# Build the categorical-only dataset: remove the numeric measurement columns,
# then one-hot encode what is left. drop_first=True omits the first level of
# each categorical column.
categorical_only = cardio_cleaned_with_new_categories.drop(
    columns=['ap_hi', 'ap_lo', 'height', 'weight', 'BMI', 'age']
)
categorial_data_set = pd.get_dummies(categorical_only, drop_first=True)
categorial_data_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67798 entries, 0 to 69999
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   cardio                     67798 non-null  int64
 1   cholesterol_2              67798 non-null  uint8
 2   cholesterol_3              67798 non-null  uint8
 3   gluc_2                     67798 non-null  uint8
 4   gluc_3                     67798 non-null  uint8
 5   BMI Category_2             67798 non-null  uint8
 6   BMI Category_3             67798 non-null  uint8
 7   BMI Category_4             67798 non-null  uint8
 8   BMI Category_5             67798 non-null  uint8
 9   BMI Category_6             67798 non-null  uint8
 10  Blood Pressure Category_2  67798 non-null  uint8
 11  Blood Pressure Category_3  67798 non-null  uint8
 12  Blood Pressure Category_4  67798 non-null  uint8
 13  Blood Pressure Category_5  67798 non-null  uint8
dtypes: int64(1), uint8(13)

In [80]:
# Build the dataset without the two derived category columns, keeping the
# underlying measurements (and BMI) in their original form.
derived_category_columns = ['Blood Pressure Category', 'BMI Category']
continuous_dataset = cardio_cleaned_with_new_categories.drop(
    columns=derived_category_columns
)
continuous_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67798 entries, 0 to 69999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   age          67798 non-null  int64   
 1   height       67798 non-null  int64   
 2   weight       67798 non-null  float64 
 3   ap_hi        67798 non-null  int64   
 4   ap_lo        67798 non-null  int64   
 5   cholesterol  67798 non-null  category
 6   gluc         67798 non-null  category
 7   cardio       67798 non-null  int64   
 8   BMI          67798 non-null  float64 
dtypes: category(2), float64(2), int64(5)
memory usage: 6.3 MB


In [81]:
# Read the CSV a second time and keep only the columns that are not dropped
# below (all six listed dtypes are parsed as categoricals).
other_features_raw = pd.read_csv(
    "./data/cardio_train.csv",
    sep=";",
    dtype={
        "gluc": "category",
        "cholesterol": "category",
        "gender": "category",
        "smoke": "category",
        "alco": "category",
        "active": "category",
    },
).drop(
    columns=[
        "id",
        "age",
        "height",
        "weight",
        "ap_hi",
        "ap_lo",
        "cardio",
        "cholesterol",
        "gluc",
    ]
)

# One-hot encode both frames, then join on the index. DataFrame.join is a
# left join by default, so only the rows of the cleaned frame are kept.
cleaned_dummies = pd.get_dummies(cardio_cleaned_with_new_categories, drop_first=True)
other_feature_dummies = pd.get_dummies(other_features_raw, drop_first=True)
all_features_dataset = cleaned_dummies.join(other_feature_dummies)
all_features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67798 entries, 0 to 69999
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        67798 non-null  int64  
 1   height                     67798 non-null  int64  
 2   weight                     67798 non-null  float64
 3   ap_hi                      67798 non-null  int64  
 4   ap_lo                      67798 non-null  int64  
 5   cardio                     67798 non-null  int64  
 6   BMI                        67798 non-null  float64
 7   cholesterol_2              67798 non-null  uint8  
 8   cholesterol_3              67798 non-null  uint8  
 9   gluc_2                     67798 non-null  uint8  
 10  gluc_3                     67798 non-null  uint8  
 11  BMI Category_2             67798 non-null  uint8  
 12  BMI Category_3             67798 non-null  uint8  
 13  BMI Category_4             67798 non-null  uin