# 2.1 Feature engineering BMI

_"BMI divides the weight by too large a number for short people and too small a number for tall people. So short people are misled into thinking that they are thinner than they are, and tall people are misled into thinking they are fatter."_

_A new formula for computing Body Mass Index that accounts for the distortions of the traditional BMI formula for shorter and taller individuals has been proposed by Nick Trefethen_

source: https://en.wikipedia.org/wiki/Body_mass_index

$$ BMI_{new} = 1.3 \frac{mass_{kg}}{height^{2.5}_{m}} $$

In [36]:
import pandas as pd

In [37]:
height_weight_raw = pd.read_csv("./data/cardio_train.csv", sep=";", usecols=['height', 'weight'])
height_weight_raw.head()

Unnamed: 0,height,weight
0,168,62.0
1,156,85.0
2,165,64.0
3,169,82.0
4,156,56.0


## 2.0

### a)

In [38]:
height_weight_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   height  70000 non-null  int64  
 1   weight  70000 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.1 MB


In [39]:
height_weight_raw['height'].unique()

array([168, 156, 165, 169, 151, 157, 178, 158, 164, 173, 181, 172, 170,
       154, 162, 163, 153, 159, 166, 155, 160, 175, 171, 152, 187, 148,
       179, 180, 188, 185, 167, 183, 174, 176, 161, 184, 177, 182,  76,
       149, 142, 150, 144, 147, 186, 146, 141, 195, 140, 198, 145, 143,
       196, 138, 194, 190, 134, 136, 100, 120, 189, 137, 192, 122, 250,
       191, 117,  70,  97, 119, 130, 110, 193,  75, 132,  71, 135,  67,
       125, 139, 133,  74,  98, 112, 207,  68,  55,  81,  80,  64,  91,
        60, 109,  72, 197,  65, 128, 105, 108, 200, 104, 111, 113,  96,
       131,  59,  66,  99,  57], dtype=int64)

_In humans, it is sometimes defined as an adult height of less than 147 centimetres_
source: https://en.wikipedia.org/wiki/Dwarfism

In [40]:
height_weight_raw.query('height < 147 & height > 100').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 639 entries, 249 to 69934
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   height  639 non-null    int64  
 1   weight  639 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 15.0 KB


In [41]:
height_in_m = height_weight_raw.query('height > 147')['height'].apply(lambda height: height * 0.01)
height_in_m.head()

0    1.68
1    1.56
2    1.65
3    1.69
4    1.56
Name: height, dtype: float64

In [42]:
height_weight_cleaned = height_weight_raw.drop(columns=['height']).join(height_in_m).dropna()
height_weight_cleaned

Unnamed: 0,weight,height
0,62.0,1.68
1,85.0,1.56
2,64.0,1.65
3,82.0,1.69
4,56.0,1.56
...,...,...
69995,76.0,1.68
69996,126.0,1.58
69997,105.0,1.83
69998,72.0,1.63


$$ BMI_{new} = 1.3 \frac{mass_{kg}}{height^{2.5}_{m}} $$

In [43]:
bmi_raw = height_weight_cleaned.join(height_weight_cleaned.apply(lambda row: 1.3 * row.weight / pow(row.height, 2.5), axis=1).rename('BMI'))
bmi_raw.head()

Unnamed: 0,weight,height,BMI
0,62.0,1.68,22.032402
1,85.0,1.56,36.353881
2,64.0,1.65,23.791042
3,82.0,1.69,28.710479
4,56.0,1.56,23.950792


In [44]:
bmi_raw['BMI'].max()

71.55773158363344

In [45]:
bmi_raw.query('BMI > 50')

Unnamed: 0,weight,height,BMI
83,110.0,1.52,50.202699
338,142.0,1.57,59.769905
389,169.0,1.65,62.823221
435,200.0,1.86,55.104998
496,117.0,1.51,54.285875
...,...,...,...
66751,126.0,1.57,53.035268
66997,123.0,1.50,58.025690
67157,131.0,1.60,52.591395
68835,125.0,1.54,55.214299


In [46]:
bmi_raw.query('BMI < 16 | BMI > 60').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84 entries, 389 to 67492
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   weight  84 non-null     float64
 1   height  84 non-null     float64
 2   BMI     84 non-null     float64
dtypes: float64(3)
memory usage: 2.6 KB


In [47]:
bmi_cleaned = bmi_raw.drop(bmi_raw.query('BMI < 16 | BMI > 60').index)
bmi_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69020 entries, 0 to 69999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   weight  69020 non-null  float64
 1   height  69020 non-null  float64
 2   BMI     69020 non-null  float64
dtypes: float64(3)
memory usage: 2.1 MB


### b)

In [48]:
bmi_with_category = bmi_cleaned.join(pd.cut(
    bmi_cleaned["BMI"],
    [0, 16, 16.9, 18.4, 24.9, 29.9, 34.9, 39.9, 100],
    labels=[
        "Severe thinness",
        "Moderate thinness",
        "Mild thinness",
        "Normal range",
        "Pre-obese",
        "Class I",
        "Class II",
        "Class III",
    ],
).rename('BMI Category'))
bmi_with_category.head()

Unnamed: 0,weight,height,BMI,BMI Category
0,62.0,1.68,22.032402,Normal range
1,85.0,1.56,36.353881,Class II
2,64.0,1.65,23.791042,Normal range
3,82.0,1.69,28.710479,Pre-obese
4,56.0,1.56,23.950792,Normal range


## 1

In [49]:
blood_pressure_raw = pd.read_csv("./data/cardio_train.csv", sep=";", usecols=['ap_hi', 'ap_lo'])
blood_pressure_raw.head()

Unnamed: 0,ap_hi,ap_lo
0,110,80
1,140,90
2,130,70
3,150,100
4,100,60


In [54]:
blood_pressure_raw.describe().T.drop(columns=['count'])

Unnamed: 0,mean,std,min,25%,50%,75%,max
ap_hi,128.817286,154.011419,-150.0,120.0,120.0,140.0,16020.0
ap_lo,96.630414,188.47253,-70.0,80.0,80.0,90.0,11000.0


In [56]:
blood_pressure_raw['ap_hi'].sort_values().unique()

array([ -150,  -140,  -120,  -115,  -100,     1,     7,    10,    11,
          12,    13,    14,    15,    16,    17,    20,    24,    60,
          70,    80,    85,    90,    93,    95,    96,    97,    99,
         100,   101,   102,   103,   104,   105,   106,   107,   108,
         109,   110,   111,   112,   113,   114,   115,   116,   117,
         118,   119,   120,   121,   122,   123,   124,   125,   126,
         127,   128,   129,   130,   131,   132,   133,   134,   135,
         136,   137,   138,   139,   140,   141,   142,   143,   144,
         145,   146,   147,   148,   149,   150,   151,   152,   153,
         154,   155,   156,   157,   158,   159,   160,   161,   162,
         163,   164,   165,   166,   167,   168,   169,   170,   171,
         172,   173,   174,   175,   176,   177,   178,   179,   180,
         181,   184,   185,   187,   188,   190,   191,   193,   195,
         196,   197,   199,   200,   202,   207,   210,   215,   220,
         230,   240,

In [76]:
blood_pressure_raw.query('ap_hi < 60')['ap_hi'].value_counts().sort_index()

-150     1
-140     1
-120     2
-115     1
-100     2
 1       2
 7       1
 10      7
 11     28
 12     76
 13     15
 14     29
 15     12
 16      3
 17      3
 20      4
 24      1
Name: ap_hi, dtype: int64

In [80]:
blood_pressure_raw.query('ap_hi > 240')['ap_hi'].value_counts().sort_index()

309      1
401      1
701      1
806      1
902      1
906      6
907      3
909      1
960      1
1110     1
1130     1
1202     1
1205     1
1300     2
1400     3
1409     1
1420     2
1500     1
1620     1
2000     1
11020    1
11500    1
13010    2
14020    4
16020    1
Name: ap_hi, dtype: int64

In [65]:
blood_pressure_raw['ap_lo'].sort_values().unique()

array([  -70,     0,     1,     6,     7,     8,     9,    10,    15,
          20,    30,    40,    45,    49,    50,    52,    53,    54,
          55,    56,    57,    58,    59,    60,    61,    62,    63,
          64,    65,    66,    67,    68,    69,    70,    71,    72,
          73,    74,    75,    76,    77,    78,    79,    80,    81,
          82,    83,    84,    85,    86,    87,    88,    89,    90,
          91,    92,    93,    94,    95,    96,    97,    98,    99,
         100,   101,   102,   103,   104,   105,   106,   107,   108,
         109,   110,   111,   112,   113,   114,   115,   118,   119,
         120,   121,   122,   125,   126,   130,   135,   140,   150,
         160,   170,   180,   182,   190,   585,   602,   700,   708,
         709,   710,   800,   801,   802,   809,   810,   820,   850,
         870,   880,   900,   901,   902,   910,  1000,  1001,  1002,
        1003,  1007,  1008,  1011,  1022,  1033,  1044,  1077,  1088,
        1100,  1101,

In [119]:
blood_pressure_raw.query('ap_lo > 200')['ap_lo'].value_counts().sort_index()

585      1
602      1
700      1
708      2
709      2
        ..
9011     2
9100     1
9800     1
10000    3
11000    1
Name: ap_lo, Length: 62, dtype: int64

In [96]:
blood_pressure_raw.query('ap_lo < 61')['ap_lo'].value_counts().sort_index()

-70       1
 0       21
 1        1
 6        2
 7        2
 8        2
 9        1
 10       7
 15       1
 20      15
 30       6
 40      17
 45       2
 49       2
 50      56
 52       2
 53       3
 54       1
 55       4
 56       1
 57       4
 58       4
 59      20
 60    2727
Name: ap_lo, dtype: int64

In [127]:
blood_pressure_cleaned = blood_pressure_raw.drop(
    pd.concat(
        [
            blood_pressure_raw.query("ap_lo < 50"),
            blood_pressure_raw.query("ap_lo > 200"),
            blood_pressure_raw.query("ap_hi < 60"),
            blood_pressure_raw.query("ap_hi > 240"),
        ]
    ).index
)
blood_pressure_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68755 entries, 0 to 69999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   ap_hi   68755 non-null  int64
 1   ap_lo   68755 non-null  int64
dtypes: int64(2)
memory usage: 1.6 MB


https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.combine.html

In [142]:
pd.cut(
    blood_pressure_cleaned["ap_lo"],
    [0, 80, 90, 120, 1000],
    labels=[
        "Healthy",
        "Stage 1 hypertension",
        "Stage 2 hypertension",
        "Hypertension crisis",
    ],
)

0                     Healthy
1        Stage 1 hypertension
2                     Healthy
3        Stage 2 hypertension
4                     Healthy
                 ...         
69995                 Healthy
69996    Stage 1 hypertension
69997    Stage 1 hypertension
69998                 Healthy
69999                 Healthy
Name: ap_lo, Length: 68755, dtype: category
Categories (4, object): ['Healthy' < 'Stage 1 hypertension' < 'Stage 2 hypertension' < 'Hypertension crisis']

In [143]:
pd.cut(
    blood_pressure_cleaned["ap_hi"],
    [0, 120, 130, 140, 180, 1000],
    labels=[
        "Healthy",
        "Elevated",
        "Stage 1 hypertension",
        "Stage 2 hypertension",
        "Hypertension crisis",
    ],
)

0                     Healthy
1        Stage 1 hypertension
2                    Elevated
3        Stage 2 hypertension
4                     Healthy
                 ...         
69995                 Healthy
69996    Stage 1 hypertension
69997    Stage 2 hypertension
69998    Stage 1 hypertension
69999                 Healthy
Name: ap_hi, Length: 68755, dtype: category
Categories (5, object): ['Healthy' < 'Elevated' < 'Stage 1 hypertension' < 'Stage 2 hypertension' < 'Hypertension crisis']

In [139]:
healthy_pressure = blood_pressure_cleaned.query('ap_lo < 80 & ap_hi < 120')
elevated_pressure = blood_pressure_cleaned.query('ap_lo > 80  & ap_hi > 120 & ap_hi < 130')
stage_1_pressure = blood_pressure_cleaned.query('ap_lo > 80 & ap_lo < 89  & ap_hi > 130 & ap_hi < 139')
stage_2_pressure = blood_pressure_cleaned.query('ap_lo > 90 & ap_lo < 180 & ap_hi > 140 & ap_hi < 120')
hypertension_pressure = blood_pressure_cleaned.query('ap_lo > 180 & ap_hi > 120')
hypertension_pressure

Unnamed: 0,ap_hi,ap_lo
38022,196,182
49303,170,190
63924,130,190


In [None]:
# blood_pressure_with_category = blood_pressure_cleaned.join(pd.cut(
#     blood_pressure_cleaned["changeme"],
#     [0, 16, 16.9, 18.4, 24.9, 29.9, 34.9, 39.9, 100],
#     labels=[
#         "Severe thinness",
#         "Moderate thinness",
#         "Mild thinness",
#         "Normal range",
#         "Pre-obese",
#         "Class I",
#         "Class II",
#         "Class III",
#     ],
# ).rename('BMI Category'))
# bmi_with_category.head()