In [59]:
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [33]:
df = pd.read_csv("cardio_train.csv", sep=";")

In [34]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [35]:
df.tail()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1
69999,99999,20540,1,170,72.0,120,80,2,1,0,0,1,0


## Details about the dataset:

### id: id number
### age: in days
### gender: 1-women 2-men
### height: cm
### weight: kg
### ap_hi: systolic blood pressure
### ap_lo: diastolic blood pressure
### cholestrol: 1-normal, 2-above normal, 3-well above normal
### gluc: 1-normal, 2-above normal, 3-well above normal
### smoke: 0-no, 1-yes
### alco: 0-no, 1-yes
### active: 0-no, 1-yes
### cardio(target variable): 0-no, 1-yes

In [36]:
df.drop(columns=['id'], inplace=True)

In [37]:
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [38]:
df.count()

age            70000
gender         70000
height         70000
weight         70000
ap_hi          70000
ap_lo          70000
cholesterol    70000
gluc           70000
smoke          70000
alco           70000
active         70000
cardio         70000
dtype: int64

In [39]:
df.shape

(70000, 12)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
 11  cardio       70000 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 6.4 MB


In [41]:
df.describe()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [42]:
df["cardio"].value_counts()

cardio
0    35021
1    34979
Name: count, dtype: int64

In [43]:
# check whether the dataset contains duplicate tuples
df.duplicated().sum()

np.int64(24)

In [44]:
if df.duplicated().sum() > 0:
    df.drop_duplicates(inplace=True)

In [46]:
df.shape

(69976, 12)

In [47]:
# Converting age from days to years
df['age_years'] = (df['age'] / 365.25).round(1)
df.drop(columns=['age'], inplace=True)

In [48]:
df.head()

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years
0,2,168,62.0,110,80,1,1,0,0,1,0,50.4
1,1,156,85.0,140,90,3,1,0,0,1,1,55.4
2,1,165,64.0,130,70,3,1,0,0,0,1,51.6
3,2,169,82.0,150,100,1,1,0,0,1,1,48.2
4,1,156,56.0,100,60,1,1,0,0,0,0,47.8


In [49]:
# Removing imposiible blood pressure values
MIN_AP_HI = 60
MAX_AP_HI = 240
MIN_AP_LO = 40
MAX_AP_LO = 160

df_clean = df[(df['ap_hi'] >= MIN_AP_HI) & (df['ap_hi'] <= MAX_AP_HI) & (df['ap_lo'] >= MIN_AP_LO) & (df['ap_lo'] <= MAX_AP_LO)]

In [51]:
# Systolic blood pressure must be always greater than diastolic
df_clean = df_clean[df_clean['ap_hi'] > df_clean['ap_lo']]

In [54]:
df_clean.shape

(68645, 12)

In [55]:
df_clean.head()

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years
0,2,168,62.0,110,80,1,1,0,0,1,0,50.4
1,1,156,85.0,140,90,3,1,0,0,1,1,55.4
2,1,165,64.0,130,70,3,1,0,0,0,1,51.6
3,2,169,82.0,150,100,1,1,0,0,1,1,48.2
4,1,156,56.0,100,60,1,1,0,0,0,0,47.8


In [56]:
print(f'Number of rows lost = {len(df) - len(df_clean)}')

Number of rows lost = 1331


In [57]:
df_clean['gender'] = df_clean['gender'].replace({1: 0, 2: 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['gender'] = df_clean['gender'].replace({1: 0, 2: 1})


In [58]:
df_clean.head()

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years
0,1,168,62.0,110,80,1,1,0,0,1,0,50.4
1,0,156,85.0,140,90,3,1,0,0,1,1,55.4
2,0,165,64.0,130,70,3,1,0,0,0,1,51.6
3,1,169,82.0,150,100,1,1,0,0,1,1,48.2
4,0,156,56.0,100,60,1,1,0,0,0,0,47.8


In [None]:
scaler = StandardScaler()
