# 2.1 Feature engineering BMI

_"BMI divides the weight by too large a number for short people and too small a number for tall people. So short people are misled into thinking that they are thinner than they are, and tall people are misled into thinking they are fatter."_

_A new formula for computing Body Mass Index that accounts for the distortions of the traditional BMI formula for shorter and taller individuals has been proposed by Nick Trefethen_

source: https://en.wikipedia.org/wiki/Body_mass_index

$$ BMI_{new} = 1.3 \frac{mass_{kg}}{height^{2.5}_{m}} $$

In [141]:
import pandas as pd

In [142]:
cardio_raw_raw = pd.read_csv("./data/cardio_train.csv", sep=";")
cardio_raw = pd.read_csv("./data/cardio_train.csv", sep=";", usecols=['height', 'weight'])
cardio_raw.head()

Unnamed: 0,height,weight
0,168,62.0
1,156,85.0
2,165,64.0
3,169,82.0
4,156,56.0


## a)

In [143]:
cardio_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   height  70000 non-null  int64  
 1   weight  70000 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.1 MB


In [144]:
cardio_raw['height'].unique()

array([168, 156, 165, 169, 151, 157, 178, 158, 164, 173, 181, 172, 170,
       154, 162, 163, 153, 159, 166, 155, 160, 175, 171, 152, 187, 148,
       179, 180, 188, 185, 167, 183, 174, 176, 161, 184, 177, 182,  76,
       149, 142, 150, 144, 147, 186, 146, 141, 195, 140, 198, 145, 143,
       196, 138, 194, 190, 134, 136, 100, 120, 189, 137, 192, 122, 250,
       191, 117,  70,  97, 119, 130, 110, 193,  75, 132,  71, 135,  67,
       125, 139, 133,  74,  98, 112, 207,  68,  55,  81,  80,  64,  91,
        60, 109,  72, 197,  65, 128, 105, 108, 200, 104, 111, 113,  96,
       131,  59,  66,  99,  57], dtype=int64)

_In humans, it is sometimes defined as an adult height of less than 147 centimetres_
source: https://en.wikipedia.org/wiki/Dwarfism

In [145]:
cardio_raw.query('height < 147 & height > 100').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 639 entries, 249 to 69934
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   height  639 non-null    int64  
 1   weight  639 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 15.0 KB


In [146]:
height_for_bmi = cardio_raw.query('height > 147')['height'].apply(lambda height: height * 0.01)
height_for_bmi.head()

0    1.68
1    1.56
2    1.65
3    1.69
4    1.56
Name: height, dtype: float64

In [147]:
cardio_cleaned = cardio_raw.drop(columns=['height']).join(height_for_bmi).dropna()
cardio_cleaned

Unnamed: 0,weight,height
0,62.0,1.68
1,85.0,1.56
2,64.0,1.65
3,82.0,1.69
4,56.0,1.56
...,...,...
69995,76.0,1.68
69996,126.0,1.58
69997,105.0,1.83
69998,72.0,1.63


$$ BMI_{new} = 1.3 \frac{mass_{kg}}{height^{2.5}_{m}} $$

In [148]:
cardio_with_bmi_raw = cardio_cleaned.join(cardio_cleaned.apply(lambda row: 1.3 * row.weight / pow(row.height, 2.5), axis=1).rename('BMI'))
cardio_with_bmi_raw.head()

Unnamed: 0,weight,height,BMI
0,62.0,1.68,22.032402
1,85.0,1.56,36.353881
2,64.0,1.65,23.791042
3,82.0,1.69,28.710479
4,56.0,1.56,23.950792


In [149]:
cardio_with_bmi_raw['BMI'].max()

71.55773158363344

In [150]:
cardio_with_bmi_raw.query('BMI > 50')

Unnamed: 0,weight,height,BMI
83,110.0,1.52,50.202699
338,142.0,1.57,59.769905
389,169.0,1.65,62.823221
435,200.0,1.86,55.104998
496,117.0,1.51,54.285875
...,...,...,...
66751,126.0,1.57,53.035268
66997,123.0,1.50,58.025690
67157,131.0,1.60,52.591395
68835,125.0,1.54,55.214299


In [151]:
cardio_with_bmi_raw.query('BMI < 16 | BMI > 60').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84 entries, 389 to 67492
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   weight  84 non-null     float64
 1   height  84 non-null     float64
 2   BMI     84 non-null     float64
dtypes: float64(3)
memory usage: 2.6 KB


In [152]:
cardio_with_bmi_cleaned = cardio_with_bmi_raw.drop(cardio_with_bmi_raw.query('BMI < 16 | BMI > 60').index)
cardio_with_bmi_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69020 entries, 0 to 69999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   weight  69020 non-null  float64
 1   height  69020 non-null  float64
 2   BMI     69020 non-null  float64
dtypes: float64(3)
memory usage: 2.1 MB


## b)

In [180]:
cardio_with_bmi_with_category = cardio_with_bmi_cleaned.join(pd.cut(
    cardio_with_bmi_cleaned["BMI"],
    [0, 16, 16.9, 18.4, 24.9, 29.9, 34.9, 39.9, 100],
    labels=[
        "Severe thinness",
        "Moderate thinness",
        "Mild thinness",
        "Normal range",
        "Pre-obese",
        "Class I",
        "Class II",
        "Class III",
    ],
).rename('BMI Category'))
cardio_with_bmi_with_category.head()

Unnamed: 0,weight,height,BMI,BMI Category
0,62.0,1.68,22.032402,Normal range
1,85.0,1.56,36.353881,Class II
2,64.0,1.65,23.791042,Normal range
3,82.0,1.69,28.710479,Pre-obese
4,56.0,1.56,23.950792,Normal range
