In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv("/kaggle/input/heat-disease-predication-dataset/synthetic_heart_disease_dataset.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    50000 non-null  int64  
 1   Gender                 50000 non-null  object 
 2   Weight                 50000 non-null  int64  
 3   Height                 50000 non-null  int64  
 4   BMI                    50000 non-null  float64
 5   Smoking                50000 non-null  object 
 6   Alcohol_Intake         29891 non-null  object 
 7   Physical_Activity      50000 non-null  object 
 8   Diet                   50000 non-null  object 
 9   Stress_Level           50000 non-null  object 
 10  Hypertension           50000 non-null  int64  
 11  Diabetes               50000 non-null  int64  
 12  Hyperlipidemia         50000 non-null  int64  
 13  Family_History         50000 non-null  int64  
 14  Previous_Heart_Attack  50000 non-null  int64  
 15  Sy

In [4]:
df.isnull().sum()

Age                          0
Gender                       0
Weight                       0
Height                       0
BMI                          0
Smoking                      0
Alcohol_Intake           20109
Physical_Activity            0
Diet                         0
Stress_Level                 0
Hypertension                 0
Diabetes                     0
Hyperlipidemia               0
Family_History               0
Previous_Heart_Attack        0
Systolic_BP                  0
Diastolic_BP                 0
Heart_Rate                   0
Blood_Sugar_Fasting          0
Cholesterol_Total            0
Heart_Disease                0
dtype: int64

**Alcohol_Iintake has around 40% of NULL values, so it's better to delete the column.**

In [5]:
df.drop(columns = "Alcohol_Intake", inplace = True)

In [6]:
df.head()

Unnamed: 0,Age,Gender,Weight,Height,BMI,Smoking,Physical_Activity,Diet,Stress_Level,Hypertension,Diabetes,Hyperlipidemia,Family_History,Previous_Heart_Attack,Systolic_BP,Diastolic_BP,Heart_Rate,Blood_Sugar_Fasting,Cholesterol_Total,Heart_Disease
0,48,Male,78,157,26.4,Never,Sedentary,Healthy,Medium,0,0,1,1,0,104,99,71,165,200,0
1,35,Female,73,163,33.0,Never,Active,Average,High,1,0,1,1,0,111,72,60,145,206,0
2,79,Female,88,152,32.3,Never,Moderate,Average,Medium,0,0,0,1,0,116,102,78,148,208,0
3,75,Male,106,171,37.4,Never,Moderate,Average,Low,0,0,1,0,0,171,92,109,105,290,1
4,34,Female,65,191,18.5,Current,Sedentary,Healthy,Low,1,1,0,0,0,164,67,108,116,220,1


**Using OrdinalEncoder as most of the column has ordinal data**

In [7]:
ordinal_encoder1 = OrdinalEncoder(categories = [["Male", "Female"]])
df["Gender"] = ordinal_encoder1.fit_transform(df[["Gender"]])

ordinal_encoder2 = OrdinalEncoder(categories = [["Never", "Current", "Former"]])
df["Smoking"] = ordinal_encoder2.fit_transform(df[["Smoking"]])


ordinal_encoder3 = OrdinalEncoder(categories = [["Moderate", "Sedentary", "Active"]])
df["Physical_Activity"] = ordinal_encoder3.fit_transform(df[["Physical_Activity"]])

ordinal_encoder4 = OrdinalEncoder(categories = [["Unhealthy", "Average", "Healthy"]])
df["Diet"] = ordinal_encoder4.fit_transform(df[["Diet"]])

ordinal_encoder5 = OrdinalEncoder(categories = [["Low", "Medium", "High"]])
df["Stress_Level"] = ordinal_encoder5.fit_transform(df[["Stress_Level"]])

In [8]:
df.sample(5)

Unnamed: 0,Age,Gender,Weight,Height,BMI,Smoking,Physical_Activity,Diet,Stress_Level,Hypertension,Diabetes,Hyperlipidemia,Family_History,Previous_Heart_Attack,Systolic_BP,Diastolic_BP,Heart_Rate,Blood_Sugar_Fasting,Cholesterol_Total,Heart_Disease
34477,78,0.0,77,167,32.8,0.0,1.0,1.0,1.0,1,1,0,0,0,175,95,92,135,272,1
16244,74,1.0,52,166,35.7,0.0,0.0,2.0,0.0,1,0,1,1,0,100,61,84,157,205,1
7293,37,0.0,107,171,19.9,0.0,0.0,1.0,2.0,0,0,0,1,0,115,96,99,96,169,0
47291,56,1.0,118,199,36.5,0.0,1.0,2.0,2.0,0,0,0,0,1,165,111,83,129,282,1
20017,76,0.0,62,180,19.9,2.0,0.0,1.0,1.0,0,0,0,0,0,163,78,91,120,197,0


**Checking For Outliers**

In [9]:
print(pd.DataFrame({"max": df.max(), "min": df.min()}))

                         max    min
Age                     79.0   30.0
Gender                   1.0    0.0
Weight                 119.0   50.0
Height                 199.0  150.0
BMI                     40.0   18.0
Smoking                  2.0    0.0
Physical_Activity        2.0    0.0
Diet                     2.0    0.0
Stress_Level             2.0    0.0
Hypertension             1.0    0.0
Diabetes                 1.0    0.0
Hyperlipidemia           1.0    0.0
Family_History           1.0    0.0
Previous_Heart_Attack    1.0    0.0
Systolic_BP            179.0  100.0
Diastolic_BP           119.0   60.0
Heart_Rate             109.0   60.0
Blood_Sugar_Fasting    179.0   70.0
Cholesterol_Total      299.0  150.0
Heart_Disease            1.0    0.0


**Downcasting columns to lower data types to save memory space**

In [10]:
int_cols = df.select_dtypes(int).columns.tolist()

for col in int_cols:
    df[col] = df[col].astype(np.int8)

float_cols = df.select_dtypes(float).columns.tolist()

for col in float_cols:
    df[col] = df[col].astype(np.float32)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    50000 non-null  int8   
 1   Gender                 50000 non-null  float32
 2   Weight                 50000 non-null  int8   
 3   Height                 50000 non-null  int8   
 4   BMI                    50000 non-null  float32
 5   Smoking                50000 non-null  float32
 6   Physical_Activity      50000 non-null  float32
 7   Diet                   50000 non-null  float32
 8   Stress_Level           50000 non-null  float32
 9   Hypertension           50000 non-null  int8   
 10  Diabetes               50000 non-null  int8   
 11  Hyperlipidemia         50000 non-null  int8   
 12  Family_History         50000 non-null  int8   
 13  Previous_Heart_Attack  50000 non-null  int8   
 14  Systolic_BP            50000 non-null  int8   
 15  Di

**Memory Space reducved to 1.8MB from 8MB (77.5% drop in memory space)**

---

In [12]:
X = df.drop(columns = "Heart_Disease")
Y = df["Heart_Disease"]

**Target Column is quite balanced but still using StratifiedKFold for splitting data just for cautious**

In [13]:
n_splits = [5, 10, 15, 20, 25, 30]

for split in n_splits:
    cv = StratifiedKFold(n_splits = split, shuffle = True, random_state = 42)
    
    accuries = []
    
    for train_idx, test_idx in cv.split(X, Y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
    
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    
        model = XGBClassifier(
            n_estimators=500,
            learning_rate=0.1,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric='logloss',
            random_state=42
        )
        
        model.fit(X_train, Y_train)
        
        Y_pred = model.predict(X_test)
        score = accuracy_score(Y_test, Y_pred)
    
        accuries.append(score)
    
    # print(accuries)
    print(np.mean(accuries))
    accuries.clear()

1.0
1.0
1.0
1.0
1.0
1.0


**The model perfectly fits the Data achieving 100% accuracy in every single fold**