In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score, 
                             recall_score, f1_score, matthews_corrcoef)

import matplotlib.pyplot as plt

# Split data.csv into train and test CSV files
df_original = pd.read_csv('heart.csv')
#https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset

#df_original = df_original.drop_duplicates().reset_index(drop=True)
df_original = df_original.fillna(df_original.median(numeric_only=True))

# Separate features and target
X_full = df_original.iloc[:, :-1]
y_full = df_original.iloc[:, -1]

# Perform 80-20 train-test split
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42
)

# Combine features and target for both datasets
train_df = pd.concat([X_train_split, y_train_split], axis=1)
test_df = pd.concat([X_test_split, y_test_split], axis=1)

# Save to CSV files
train_df.to_csv('train_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

print(f"Original data shape: {df_original.shape}")
print(f"Training data saved to 'train_data.csv': {train_df.shape[0]} rows, {train_df.shape[1]} columns")
print(f"Test data saved to 'test_data.csv': {test_df.shape[0]} rows, {test_df.shape[1]} columns")

age
sex
chest pain type (4 values)
resting blood pressure
serum cholestoral in mg/dl
fasting blood sugar > 120 mg/dl
resting electrocardiographic results (values 0,1,2)
maximum heart rate achieved
exercise induced angina
oldpeak = ST depression induced by exercise relative to rest
the slope of the peak exercise ST segment
number of major vessels (0-3) colored by flourosopy
thal: 0 = normal; 1 = fixed defect; 2 = reversable defect The names and social security numbers of the patients were recently removed 

In [14]:
# Load the training dataset
df = pd.read_csv('train_data.csv')

X_train = df.iloc[:, :-1]
y_train = df.iloc[:, -1]

print(f"Training set size: {X_train.shape[0]}")
print(f"Original features: {X_train.shape[1]}")

# Define feature types based on data dictionary
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Filter to only include columns that exist in the dataset
numerical_cols = [col for col in numerical_cols if col in X_train.columns]
categorical_cols = [col for col in categorical_cols if col in X_train.columns]

print(f"\nNumerical columns ({len(numerical_cols)}): {numerical_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")

# Create preprocessing pipeline
if len(categorical_cols) > 0:
    # One-hot encoding for categorical features and scaling for numerical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_cols)
        ])
else:
    # Only scaling if no categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols)
        ])

# Fit and transform training data
X_train_processed = preprocessor.fit_transform(X_train)

# Save preprocessor and models in trained_models directory
models_dir = 'trained_models'
os.makedirs(models_dir, exist_ok=True)

# Save the preprocessor for later use on test data
preprocessor_path = os.path.join(models_dir, 'preprocessor.pkl')
with open(preprocessor_path, 'wb') as f:
    pickle.dump(preprocessor, f)

print(f"\nPreprocessor fitted and saved to '{preprocessor_path}'")
print(f"Processed features: {X_train_processed.shape[1]}")

Training set size: 820
Original features: 13

Numerical columns (5): ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
Categorical columns (8): ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

Preprocessor fitted and saved to 'trained_models\preprocessor.pkl'
Processed features: 22


In [15]:
# Calculate imbalance ratio for XGBoost
imbalance_ratio = (y_train == 0).sum() / (y_train == 1).sum()
print(f"Class imbalance ratio: {imbalance_ratio:.2f}")

# Create instances of all 6 models
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=2000, class_weight='balanced'),
    'Decision Tree': DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    'K-Nearest Neighbors': KNeighborsClassifier(),  # KNN doesn't support class_weight
    'Naive Bayes': GaussianNB(),  # Naive Bayes doesn't support class_weight
    'Random Forest': RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=200),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss', scale_pos_weight=imbalance_ratio)
}

# Train all models on processed data
trained_models = {}
for name, model in models.items():
    model.fit(X_train_processed, y_train)
    trained_models[name] = model
    print(f"{name} trained successfully")

Class imbalance ratio: 0.94
Logistic Regression trained successfully
Decision Tree trained successfully
K-Nearest Neighbors trained successfully
Naive Bayes trained successfully
Random Forest trained successfully
XGBoost trained successfully


In [16]:
# Load test data for evaluation
test_df = pd.read_csv('test_data.csv')

X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]

print(f"Test set size: {X_test.shape[0]}")

# Apply the same preprocessing to test data
X_test_processed = preprocessor.transform(X_test)

print(f"Processed test features: {X_test_processed.shape[1]}")

# Evaluate all models
results = {}

for name, model in trained_models.items():
    # Make predictions on processed test data
    y_pred = model.predict(X_test_processed)
    y_pred_proba = model.predict_proba(X_test_processed)[:, 1] if hasattr(model, 'predict_proba') else None
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None
    mcc = matthews_corrcoef(y_test, y_pred)
    
    # Store results
    results[name] = {
        'Accuracy': accuracy,
        'AUC Score': auc,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'MCC Score': mcc
    }

# Create a summary dataframe
results_df = pd.DataFrame(results).T
print("\n" + "="*50)
print("Model Performance Summary:")
print(results_df)
print("="*50)

Test set size: 205
Processed test features: 22

Model Performance Summary:
                     Accuracy  AUC Score  Precision    Recall  F1 Score  \
Logistic Regression  0.819512   0.908148   0.779661  0.893204  0.832579   
Decision Tree        0.985366   0.985437   1.000000  0.970874  0.985222   
K-Nearest Neighbors  0.790244   0.929516   0.763158  0.844660  0.801843   
Naive Bayes          0.804878   0.842328   0.769231  0.873786  0.818182   
Random Forest        1.000000   1.000000   1.000000  1.000000  1.000000   
XGBoost              0.985366   1.000000   1.000000  0.970874  0.985222   

                     MCC Score  
Logistic Regression   0.645720  
Decision Tree         0.971151  
K-Nearest Neighbors   0.583632  
Naive Bayes           0.615261  
Random Forest         1.000000  
XGBoost               0.971151  


In [18]:
# Save the models with feature engineering
print("Saving models to trained_models directory...\n")

for name, model in trained_models.items():
    # Create safe filename
    filename = name.replace(' ', '_').lower() + '.pkl'
    filepath = os.path.join(models_dir, filename)
    
    # Save model
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)
    
    print(f"Saved {name} to {filepath}")

Saving models to trained_models directory...

Saved Logistic Regression to trained_models\logistic_regression.pkl
Saved Decision Tree to trained_models\decision_tree.pkl
Saved K-Nearest Neighbors to trained_models\k-nearest_neighbors.pkl
Saved Naive Bayes to trained_models\naive_bayes.pkl
Saved Random Forest to trained_models\random_forest.pkl
Saved XGBoost to trained_models\xgboost.pkl
