# Step 1: Import Libraries

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, precision_score, recall_score, accuracy_score, f1_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from category_encoders import BinaryEncoder
from tqdm import tqdm

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


# Step 2: Load and Explore Dataset

In [15]:
# Load and Explore Dataset
data = pd.read_csv('CVD_dataset.csv')
print(data.head())
print(data.info())
print(data.isnull().sum())
# sns.pairplot(data)

  General_Health Exercise Skin_Cancer Other_Cancer Depression Diabetes  \
0           Poor       No          No           No         No       No   
1      Very Good       No          No           No         No      Yes   
2      Very Good      Yes          No           No         No      Yes   
3           Poor      Yes          No           No         No      Yes   
4           Good       No          No           No         No       No   

  Arthritis     Sex Age_Category  Height_(cm)  Weight_(kg)    BMI  \
0       Yes  Female        70-74          150        32.66  14.54   
1        No  Female        70-74          165        77.11  28.29   
2        No  Female        60-64          163        88.45  33.47   
3        No    Male        75-79          180        93.44  28.73   
4        No    Male          80+          191        88.45  24.37   

  Smoking_History  Alcohol_Consumption  Fruit_Consumption  \
0             Yes                    0                 30   
1              No 

# Step 3: Preprocess Data

In [16]:
# Preprocess Data
import pandas as pd
data.fillna(data.select_dtypes(include=np.number).mean(), inplace=True)
if 'Cardio_Disease' not in data.columns:
    raise KeyError("The dataset does not contain the 'Cardio_Disease' column. Please verify the dataset.")

# Encode Target Variable
y = data['Cardio_Disease'].map({'No': 0, 'Yes': 1})

# Encode Categorical Features
categorical_features = data.select_dtypes(include=['object']).columns.drop('Cardio_Disease', errors='ignore')

for col in categorical_features:
    if data[col].nunique() == 2:  # Binary Encoding for Two Unique Values
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
    elif data[col].nunique() > 10:  # Binary Encoding for High Cardinality Features
        be = BinaryEncoder(cols=[col])
        data = be.fit_transform(data)
    else:  # One-Hot Encoding for Nominal Features
        ohe = OneHotEncoder(drop='first', sparse_output=False)
        encoded = pd.DataFrame(ohe.fit_transform(data[[col]]), columns=ohe.get_feature_names_out([col]))
        data = pd.concat([data.drop(columns=[col]), encoded], axis=1)

# Select Numerical Features and Processed Categorical Features
X = data.drop('Cardio_Disease', axis=1, errors='ignore')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Preprocess Data

# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# from category_encoders import BinaryEncoder
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split

# # Preprocess Data
# data.fillna(data.select_dtypes(include=np.number).mean(), inplace=True)
# if 'Cardio_Disease' not in data.columns:
#     raise KeyError("The dataset does not contain the 'Cardio_Disease' column. Please verify the dataset.")

# # Encode Target Variable
# y = data['Cardio_Disease'].map({'No': 0, 'Yes': 1})

# # Encode Categorical Features
# categorical_features = data.select_dtypes(include=['object']).columns.drop('Cardio_Disease', errors='ignore')

# for col in categorical_features:
#     if data[col].nunique() == 2:  # Binary Encoding for Two Unique Values (e.g., Yes/No)
#         le = LabelEncoder()
#         data[col] = le.fit_transform(data[col])
#     elif data[col].nunique() > 10:  # Binary Encoding for High Cardinality Features
#         be = BinaryEncoder(cols=[col])
#         data = be.fit_transform(data)
#     else:  # One-Hot Encoding for Nominal Features (few categories)
#         ohe = OneHotEncoder(drop='first', sparse_output=False)
#         encoded = pd.DataFrame(ohe.fit_transform(data[[col]]), columns=ohe.get_feature_names_out([col]))
#         data = pd.concat([data.drop(columns=[col]), encoded], axis=1)

# # Select Numerical Features and Processed Categorical Features
# X = data.drop('Cardio_Disease', axis=1, errors='ignore')

# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)





AttributeError: module 'pandas.api.types' has no attribute 'is_categorical'

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from category_encoders import BinaryEncoder

# Preprocess Data
data.fillna(data.select_dtypes(include=np.number).mean(), inplace=True)
if 'Cardio_Disease' not in data.columns:
    raise KeyError("The dataset does not contain the 'Cardio_Disease' column. Please verify the dataset.")

# Encode Target Variable
y = data['Cardio_Disease'].map({'No': 0, 'Yes': 1})

# Encode Categorical Features
categorical_features = data.select_dtypes(include=['object']).columns.drop('Cardio_Disease', errors='ignore')

for col in categorical_features:
    if data[col].nunique() == 2:  # Binary Encoding for Two Unique Values
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
    elif data[col].nunique() > 10:  # Binary Encoding for High Cardinality Features
        be = BinaryEncoder(cols=[col])
        data = be.fit_transform(data)
    else:  # One-Hot Encoding for Nominal Features
        ohe = OneHotEncoder(drop='first', sparse_output=False)
        encoded = pd.DataFrame(ohe.fit_transform(data[[col]]), columns=ohe.get_feature_names_out([col]))
        data = pd.concat([data.drop(columns=[col]), encoded], axis=1)

# Select Numerical Features and Processed Categorical Features
X = data.drop('Cardio_Disease', axis=1, errors='ignore')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


AttributeError: module 'pandas.api.types' has no attribute 'is_categorical'

# Step 4: Define Models

In [None]:
# Define Base Models
base_models = [
    ('Random Forest', RandomForestClassifier()),
    ('SVM', SVC(probability=True)),
    ('Decision Tree', DecisionTreeClassifier()),
    ('KNN', KNeighborsClassifier()),
    ('XGBoost', XGBClassifier(tree_method='gpu_hist' if device.type == 'cuda' else 'auto')),
    ('Gradient Boosting', GradientBoostingClassifier())
]

# Create an Ensemble Model using Stacking
stacking_model = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression())

# Step 5: Train Models with Epoch-Like Iteration and Progress Bar

In [None]:
# Train Stacking Model
print("Training Stacking Model...")
stacking_model.fit(X_train, y_train)
train_score = stacking_model.score(X_train, y_train)
val_score = stacking_model.score(X_test, y_test)
print(f"Stacking Model Train Accuracy: {train_score:.4f}, Validation Accuracy: {val_score:.4f}")

Training Stacking Model...


# Train Stacking Model with Epochs

In [None]:
# Train Stacking Model with Epochs
epochs = 10
history = {'epoch': [], 'train_accuracy': [], 'val_accuracy': []}

print("Training Stacking Model...")
for epoch in range(1, epochs + 1):
    stacking_model.fit(X_train, y_train)
    train_score = stacking_model.score(X_train, y_train)
    val_score = stacking_model.score(X_test, y_test)
    history['epoch'].append(epoch)
    history['train_accuracy'].append(train_score)
    history['val_accuracy'].append(val_score)
    print(f"Epoch {epoch}: Train Accuracy = {train_score:.4f}, Validation Accuracy = {val_score:.4f}")
