## Import Required Libraries

In [6]:
import pandas as pd

## Load the Dataset

In [7]:
df = pd.read_csv("cardio_train.csv",sep=";")

## Initial Data Exploration

In [8]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [9]:
df.shape

(70000, 13)

In [10]:
df.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [11]:
df.dtypes

id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object

In [12]:
df.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [13]:
df.isnull().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [14]:
df['cardio'].value_counts()

cardio
0    35021
1    34979
Name: count, dtype: int64

In [15]:
df['cardio'].value_counts(normalize=True) * 100

cardio
0    50.03
1    49.97
Name: proportion, dtype: float64

## Drop Unnecessary Columns (Remove id)

In [16]:
df = df.drop(columns=['id'])

In [17]:
df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,22431,1,163,72.0,135,80,1,2,0,0,0,1


## Convert Age from Days to Years

In [18]:
df['age'] = (df['age'] / 365).astype(int)

In [19]:
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,51,1,165,64.0,130,70,3,1,0,0,0,1
3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,47,1,156,56.0,100,60,1,1,0,0,0,0


## Rename Columns for Better Readability

In [20]:
df = df.rename(columns={
    'ap_hi':'high_bp',
    'ap_lo':'low_bp',
    'cholesterol':'cholesterol_level',
    'gluc':'glucose_level'
})

In [21]:
df.head()

Unnamed: 0,age,gender,height,weight,high_bp,low_bp,cholesterol_level,glucose_level,smoke,alco,active,cardio
0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,51,1,165,64.0,130,70,3,1,0,0,0,1
3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,47,1,156,56.0,100,60,1,1,0,0,0,0


## Create New Feature: BMI (Body Mass Index)

In [22]:
df['height_m'] = df['height'] / 100
df['BMI'] = df['weight'] / (df['height_m']**2)
df.drop(columns=['height_m'],inplace=True)

In [23]:
df[['height', 'weight', 'BMI']].head()

Unnamed: 0,height,weight,BMI
0,168,62.0,21.96712
1,156,85.0,34.927679
2,165,64.0,23.507805
3,169,82.0,28.710479
4,156,56.0,23.011177


## Create New Feature: Pulse Pressure

In [24]:
df['pulse_pressure'] = df['high_bp'] - df['low_bp']

In [25]:
df[['high_bp', 'low_bp', 'pulse_pressure']].head()

Unnamed: 0,high_bp,low_bp,pulse_pressure
0,110,80,30
1,140,90,50
2,130,70,60
3,150,100,50
4,100,60,40


## Remove Invalid and Impossible Blood Pressure Values

In [26]:
# high_bp (systolic BP) >= low_bp (diastolic BP)
df = df[df['high_bp'] >= df['low_bp']]

In [27]:
# valid systolic range
df = df[(df['high_bp'] >= 80) & (df['high_bp'] <= 250)]

In [28]:
# valid diastolic range
df = df[(df['low_bp'] >= 40) & (df['low_bp'] <= 200)]

In [29]:
df.shape

(68669, 14)

In [30]:
df[['high_bp','low_bp']].head()

Unnamed: 0,high_bp,low_bp
0,110,80
1,140,90
2,130,70
3,150,100
4,100,60


## Remove Height Outliers (130–210 cm)

In [31]:
# Remove height outliers
df = df[(df['height'] >= 130) & (df['height'] <= 210)]

## Remove Weight Outliers (40–180 kg)

In [32]:
# Remove weight outliers
df = df[(df['weight'] >= 40) & (df['weight'] <= 180)]

In [33]:
df.shape

(68530, 14)

In [34]:
df[['height', 'weight']].describe()

Unnamed: 0,height,weight
count,68530.0,68530.0
mean,164.445907,74.131613
std,7.838671,14.241453
min,130.0,40.0
25%,159.0,65.0
50%,165.0,72.0
75%,170.0,82.0
max,207.0,180.0


## Remove BMI Outliers (10–60)

In [35]:
df = df[(df['BMI'] >= 12) & (df['BMI'] <= 60)]

In [36]:
df['BMI'].describe()

count    68512.000000
mean        27.438444
std          5.160134
min         13.520822
25%         23.875115
50%         26.346494
75%         30.116213
max         59.523810
Name: BMI, dtype: float64

## Remove Duplicate Records

In [37]:
df.drop_duplicates(inplace=True)

In [38]:
df.duplicated().sum()

np.int64(0)

In [39]:
df.shape

(65306, 14)

## Encode Binary Column (Gender: 0 = Female, 1 = Male)

In [40]:
df['gender'] = df['gender'].map({1:0,2:1})

## One-Hot Encode Cholesterol and Glucose Levels

In [41]:
df = pd.get_dummies(
        df,
        columns=['cholesterol_level', 'glucose_level'],
        drop_first=True
     )

In [42]:
df = df.astype({ 
    'cholesterol_level_2': 'int',
    'cholesterol_level_3': 'int',
    'glucose_level_2': 'int',
    'glucose_level_3': 'int'
})

In [43]:
df.head()

Unnamed: 0,age,gender,height,weight,high_bp,low_bp,smoke,alco,active,cardio,BMI,pulse_pressure,cholesterol_level_2,cholesterol_level_3,glucose_level_2,glucose_level_3
0,50,1,168,62.0,110,80,0,0,1,0,21.96712,30,0,0,0,0
1,55,0,156,85.0,140,90,0,0,1,1,34.927679,50,0,1,0,0
2,51,0,165,64.0,130,70,0,0,0,1,23.507805,60,0,1,0,0
3,48,1,169,82.0,150,100,0,0,1,1,28.710479,50,0,0,0,0
4,47,0,156,56.0,100,60,0,0,0,0,23.011177,40,0,0,0,0


## Final Dataset Verification

In [44]:
df['smoke'].unique()
df['alco'].unique()
df['active'].unique()
df['cardio'].unique()

array([0, 1])

In [45]:
df.shape

(65306, 16)

In [46]:
df.head()

Unnamed: 0,age,gender,height,weight,high_bp,low_bp,smoke,alco,active,cardio,BMI,pulse_pressure,cholesterol_level_2,cholesterol_level_3,glucose_level_2,glucose_level_3
0,50,1,168,62.0,110,80,0,0,1,0,21.96712,30,0,0,0,0
1,55,0,156,85.0,140,90,0,0,1,1,34.927679,50,0,1,0,0
2,51,0,165,64.0,130,70,0,0,0,1,23.507805,60,0,1,0,0
3,48,1,169,82.0,150,100,0,0,1,1,28.710479,50,0,0,0,0
4,47,0,156,56.0,100,60,0,0,0,0,23.011177,40,0,0,0,0


In [47]:
df.tail()

Unnamed: 0,age,gender,height,weight,high_bp,low_bp,smoke,alco,active,cardio,BMI,pulse_pressure,cholesterol_level_2,cholesterol_level_3,glucose_level_2,glucose_level_3
69994,57,0,165,80.0,150,80,0,0,1,1,29.384757,70,0,0,0,0
69996,61,0,158,126.0,140,90,0,0,1,1,50.472681,50,1,0,1,0
69997,52,1,183,105.0,180,90,0,1,0,1,31.353579,90,0,1,0,0
69998,61,0,163,72.0,135,80,0,0,0,1,27.099251,55,0,0,1,0
69999,56,0,170,72.0,120,80,0,0,1,0,24.913495,40,1,0,0,0


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 65306 entries, 0 to 69999
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  65306 non-null  int64  
 1   gender               65306 non-null  int64  
 2   height               65306 non-null  int64  
 3   weight               65306 non-null  float64
 4   high_bp              65306 non-null  int64  
 5   low_bp               65306 non-null  int64  
 6   smoke                65306 non-null  int64  
 7   alco                 65306 non-null  int64  
 8   active               65306 non-null  int64  
 9   cardio               65306 non-null  int64  
 10  BMI                  65306 non-null  float64
 11  pulse_pressure       65306 non-null  int64  
 12  cholesterol_level_2  65306 non-null  int64  
 13  cholesterol_level_3  65306 non-null  int64  
 14  glucose_level_2      65306 non-null  int64  
 15  glucose_level_3      65306 non-null  int6

## Export the Preprocessed Dataset to CSV

In [51]:
df.to_csv("cardio_cleaned.csv",index=False)