In [38]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import xgboost as xgb
import joblib
import tensorflow as tf # Line of code generated by ChatGPT 5.1 on 12/10/25
import random # Line of code generated by ChatGPT 5.1 on 12/10/25
import os

In [39]:
# Set random seeds.
# The following 4 lines of code were generated by ChatGPT 5.1 on 12/10/25
os.environ["PYTHONHASHSEED"] = "42"
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [40]:
# Load X and y.
def load_data(file_path):
    df = pd.read_csv("heart.csv")
    y = df["target"]
    X = df.drop(columns=["target"])
    return X, y, df

# Print whether there are any missing values in the passed in pandas DataFrame.
def are_there_missing_vals(df):
    no_missing_vals = True

    # Report counts of missing values per column.
    print("-------------------------------------------------------------------")
    print("Number of Missing Values per Column:")
    for col in df.columns:
        if df[col].isna().sum() != 0:
            no_missing_vals = False
        print(col + ": " + str(df[col].isna().sum()))

    if no_missing_vals:
        print("Verdict: There are no missing values in the dataset. No need to worry about addressing any missing values later on in data preprocessing.")
    else:
        print("Verdict: There are missing values in the dataset. They will need to be addressed later on in data preprocessing.")

In [41]:
# Rubric Item Covered: Implemented proper train/validation/test split with documented split ratios (3 pts)
# Perform proper train/validation/test split.
def train_val_test_split(X, y, val_size, test_size):
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=(val_size/(1-test_size)), random_state=42)
    return X_train, y_train, X_val, y_val, X_test, y_test

# Rubric Item Covered: Implemented proper train/validation/test split with documented split ratios (3 pts)
# Document split ratios.
def document_split_ratios(val_size, test_size, y_train, y_val, y_test):
    print("-------------------------------------------------------------------")
    print("Documented Split Ratios:")
    print("Train Split Ratio:", (1 - (val_size + test_size)))
    print("Train Split Size:", len(y_train))
    print("Validation Split Ratio:", val_size)
    print("Validation Split Size:", len(y_val))
    print("Test Split Ratio:", test_size)
    print("Test Split Size:", len(y_test))

In [42]:
# Engineer new features, append them to X, and return the new X.
def feature_engineer_train_data(X):
    # Rubric Item Covered: Applied feature engineering (created polynomial features, embeddings, or other derived features) (5 pts)
    age_squared = X["age"] * X["age"]
    X["age_squared"] = age_squared
    cholesterol_blood_pressure_interaction = X["chol"] * X["trestbps"]
    X["chol_times_trestbps"] = cholesterol_blood_pressure_interaction
    age_heart_rate_interaction = X["age"] * X["thalach"]
    X["age_times_thalach"] =  age_heart_rate_interaction
    oldpeak_squared = X["oldpeak"] * X["oldpeak"]
    X["oldpeak_squared"] = oldpeak_squared
    return X

In [43]:
# Preprocess the training data.
def preprocess_train_data(X_train, y_train):
    # Rubric Item Covered: Properly normalized or standardized input features/data appropriate to your modality (3 pts)
    stand_scaler = StandardScaler()
    X_train_standardized = stand_scaler.fit_transform(X_train, y_train)

    # Rubric Item Covered: Implemented preprocessing pipeline handling data quality issues (addresses class imbalance, missing data, outliers, text tokenization, image resizing, with evidence of impact) (5 pts)
    smote_sampler = SMOTE(sampling_strategy='not majority', random_state=42)
    X_train_resampled, y_train_resampled = smote_sampler.fit_resample(X_train_standardized, y_train)
    print("-------------------------------------------------------------------")
    print("Evidence of Impact (Class Imbalance):")
    print("Number of target = 0 Samples in y_train (pre-SMOTE):", y_train[y_train == 0].sum())
    print("Number of target = 1 Samples in y_train (pre-SMOTE):", y_train[y_train == 1].sum())
    print("Number of target = 0 Samples in y_train_resampled (post-SMOTE):", y_train_resampled[y_train_resampled == 0].sum())
    print("Number of target = 1 Samples in y_train_resampled (post-SMOTE):", y_train_resampled[y_train_resampled == 1].sum())

    rob_scaler = RobustScaler()
    X_train_transformed = rob_scaler.fit_transform(X_train_resampled, y_train_resampled)
    print("-------------------------------------------------------------------")
    print("Evidence of Impact (Outliers):")
    after_median = np.median(X_train_transformed, axis=0) # Generated by ChatGPT 5.1 On 12/9/25
    after_iqr = np.percentile(X_train_transformed, 75, axis=0) - np.percentile(X_train_transformed, 25, axis=0) # Generated by ChatGPT 5.1 On 12/9/25
    before_median = np.median(X_train_resampled, axis=0) # Based on after_median line
    before_iqr = np.percentile(X_train_resampled, 75, axis=0) # Based on after_iqr line
    print("Median values BEFORE RobustScaler:", before_median) # Generated by ChatGPT 5.1 On 12/9/25 but then modified to show more than 5 features
    print("Median values AFTER RobustScaler:", after_median) # Generated by ChatGPT 5.1 On 12/9/25 but then modified to show more than 5 features
    print("IQR values BEFORE RobustScaler:", before_iqr) # Generated by ChatGPT 5.1 On 12/9/25 but then modified to show more than 5 features
    print("IQR values AFTER RobustScaler:", after_iqr) # Generated by ChatGPT 5.1 On 12/9/25 but then modified to show more than 5 features
    print("-------------------------------------------------------------------")

    return stand_scaler, rob_scaler, X_train_transformed, y_train_resampled

In [44]:
# Preprocess the evaluation data (whether that be validation data or test data).
def preprocess_eval_data(X, fitted_stand_scaler, fitted_rob_scaler):
    X_post_stand_transform = fitted_stand_scaler.transform(X)
    X_post_rob_transform = fitted_rob_scaler.transform(X_post_stand_transform)
    return X_post_rob_transform

In [45]:
# Rubric Item Covered: Used a significant software framework for applied ML not
# covered in the class (e.g., instead of PyTorch, used Tensorflow; or used JAX,
# LangChain, etc. not covered in the class) (5 pts)
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import L2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
import math

# Rubric Item Covered: Used a significant software framework for applied ML not
# covered in the class (e.g., instead of PyTorch, used Tensorflow; or used JAX,
# LangChain, etc. not covered in the class) (5 pts)

# Create a custom TensorFlow binary classification MLP model.
def create_tf_model(num_input_features, alpha):
    # Rubric Item Covered: Defined and trained a custom (substantially designed by you, not a pretrained model) neural network architecture using PyTorch or similar framework (5 pts)
    model = Sequential()
    model.add(Input(shape=(num_input_features,)))
    # Rubric Item Covered: Applied regularization techniques to prevent overfitting (at least two of: L1/L2 penalty, dropout, early stopping) (5 pts)) - Technique 1: L2 penalty
    model.add(Dense(units=num_input_features*2, kernel_regularizer=L2(alpha), activation='relu'))
    model.add(Dense(units=num_input_features, kernel_regularizer=L2(alpha), activation='relu'))
    model.add(Dense(units=math.ceil(num_input_features/2), kernel_regularizer=L2(alpha), activation='relu'))
    model.add(Dense(units=math.ceil(num_input_features/4), kernel_regularizer=L2(alpha), activation='relu'))
    model.add(Dense(units=1, activation='sigmoid'))
    return model

# Train the TensorFlow model.
def train_tf_model(model, X_train, y_train, X_val, y_val):
    # Rubric Item Covered: Defined and trained a custom (substantially designed by you, not a pretrained model) neural network architecture using PyTorch or similar framework (5 pts)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])
    # Rubric Item Covered: Applied regularization techniques to prevent overfitting (at least two of: L1/L2 penalty, dropout, early stopping) (5 pts)) - Technique 2: Early Stopping
    early_stop_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True) # restore_best_weights=True portion of line of code generated by ChatGPT 5.1 on 12/9/25
    train_history = model.fit(x=X_train, y=y_train, epochs=100, validation_data=(X_val, y_val), callbacks=[early_stop_callback], shuffle=False)
    return model, train_history

In [46]:
# Extrapolate validation metrics from the passed in history.
def extrapolate_validation_metrics(train_history):
    validation_accuracy = train_history.history['val_accuracy'][-1] # Line of code generated by ChatGPT 5.1 on 12/9/25
    validation_precision = train_history.history['val_Precision'][-1]
    validation_recall = train_history.history['val_Recall'][-1]
    validation_auc = train_history.history['val_AUC'][-1]
    return [validation_accuracy, validation_precision, validation_recall, validation_auc]

# Compare the validation metrics of the official model against the latest model.
def compare_validation_metrics(official_validation_metrics, validation_metrics):
    num_better_metrics_official = 0
    num_better_metrics_other = 0
    for x in range(4):
        if official_validation_metrics[x] >= validation_metrics[x]:
            num_better_metrics_official += 1
        elif official_validation_metrics[x] < validation_metrics[x]:
            num_better_metrics_other += 1
    # Rubric Item Covered: Conducted systematic hyperparameter tuning using validation data or cross-validation (evidence: comparison of multiple configurations) (5 pts)
    if num_better_metrics_official >= num_better_metrics_other:
        return "The official model is better"
    else:
        return "The latest model is better"

# Run the training pipeline for the custom TensorFlow model.
def run_tf_training_pipeline(num_input_features, X_train, y_train, X_val, y_val):
    # Rubric Item Covered: Conducted systematic hyperparameter tuning using validation data or cross-validation (evidence: comparison of multiple configurations) (5 pts)

    # Perform hyperparameter tuning to train the most optimal model for predicting y given X.
    model_to_use_for_evaluation = None
    official_validation_metrics = None
    chosen_c = None
    c_values = [0.001, 0.01, 0.1, 1, 10, 100]
    for c in c_values:
        print("-------------------------------------------------------------------")
        print("Training with C =", c)
        # Rubric Item Covered: Defined and trained a custom (substantially designed by you, not a pretrained model) neural network architecture using PyTorch or similar framework (5 pts)
        model = create_tf_model(num_input_features, c)
        model, train_history = train_tf_model(model, X_train, y_train, X_val, y_val)
        validation_metrics = extrapolate_validation_metrics(train_history)
        if (model_to_use_for_evaluation == None) and (official_validation_metrics == None):
            model_to_use_for_evaluation = model
            official_validation_metrics = validation_metrics
            chosen_c = c
        else:
            if compare_validation_metrics(official_validation_metrics, validation_metrics) == "The latest model is better":
                model_to_use_for_evaluation = model
                official_validation_metrics = validation_metrics
                chosen_c = c
    print("Chosen Value of C:", chosen_c)
    return model_to_use_for_evaluation

# Evaluate the passed in model on the test data.
def evaluate_tf_model(model, X_test, y_test):
    test_loss, test_accuracy, test_precision, test_recall, test_auc = model.evaluate(X_test, y_test)
    # Rubric Item Covered: Used at least three distinct and appropriate evaluation metrics for your task (3 pts)
    print("-------------------------------------------------------------------")
    print("Test Accuracy:", test_accuracy)
    print("Test Precision:", test_precision)
    print("Test Recall:", test_recall)
    print("Test AUC:", test_auc)
    print("-------------------------------------------------------------------")
    return test_accuracy, test_precision, test_recall, test_auc

In [47]:
# Run the training pipeline for the Logistic Regression CV model.
def run_log_reg_cv_training_pipeline(X_train_preprocessed, y_train_preprocessed):
    log_reg_cv = LogisticRegressionCV(cv=5, penalty='l2', random_state=42)
    log_reg_cv.fit(X_train_preprocessed, y_train_preprocessed)
    return log_reg_cv

# Evaluate the Logistic Regression CV model's performance on the test data.
def evaluate_log_reg_cv_model(log_reg_cv, X_test_transformed, y_test):
    y_test_predictions = log_reg_cv.predict(X_test_transformed)
    log_reg_cv_test_accuracy = accuracy_score(y_test, y_test_predictions)
    log_reg_cv_test_precision = precision_score(y_test, y_test_predictions)
    log_reg_cv_test_recall = recall_score(y_test, y_test_predictions)
    log_reg_cv_test_auc = roc_auc_score(y_test, y_test_predictions)
    return log_reg_cv_test_accuracy, log_reg_cv_test_precision, log_reg_cv_test_recall, log_reg_cv_test_auc

# Generated by ChatGPT 5.1 on 12/10/25
def run_xgb_model_training_pipeline(X_train_preprocessed, y_train_preprocessed,
                                    X_val_transformed, y_val):

    # Convert numpy arrays into DMatrix objects
    dtrain = xgb.DMatrix(X_train_preprocessed, label=y_train_preprocessed)
    dval = xgb.DMatrix(X_val_transformed, label=y_val)

    # Set model parameters
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "learning_rate": 0.05,
        "max_depth": 3,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "seed": 42,
        "deterministic_histogram": True,
    }

    # Train model with early stopping
    evals = [(dtrain, "train"), (dval, "validation")]

    xgb_model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=500,
        evals=evals,
        early_stopping_rounds=20,
        verbose_eval=False,
    )

    return xgb_model

# Method generated by ChatGPT 5.1 on 12/10/25
def evaluate_xgb_model(xgb_model, X_test_transformed, y_test):
    dtest = xgb.DMatrix(X_test_transformed)
    y_test_proba = xgb_model.predict(dtest)

    # Convert probabilities → class predictions
    y_test_predictions = (y_test_proba >= 0.5).astype(int)

    xgb_test_accuracy = accuracy_score(y_test, y_test_predictions)
    xgb_test_precision = precision_score(y_test, y_test_predictions)
    xgb_test_recall = recall_score(y_test, y_test_predictions)
    xgb_test_auc = roc_auc_score(y_test, y_test_proba)

    return xgb_test_accuracy, xgb_test_precision, xgb_test_recall, xgb_test_auc

# Compare the custom TensorFlow model's test metrics with those of the Logistic Regression model and the Random Forest model.
def compare_test_metrics(tf_test_metrics, log_cv_test_metrics, xgb_model_test_metrics):
    # Rubric Item Covered: Compared multiple model architectures or approaches quantitatively (5 pts)
    metric_types = ["Test Accuracy", "Test Precision", "Test Recall", "Test AUC"]
    for x in range(4):
        print("-------------------------------------------------------------------")
        print("TensorFlow Custom Model " + metric_types[x] + ": " + str(tf_test_metrics[x]))
        print("LogisticRegressionCV Model " + metric_types[x] + ": " + str(log_cv_test_metrics[x]))
        print("XGBoost Model " + metric_types[x] + ": " + str(xgb_model_test_metrics[x]))

        best_x_metric = max(tf_test_metrics[x], log_cv_test_metrics[x], xgb_model_test_metrics[x])
        if best_x_metric == tf_test_metrics[x]:
            print("The TensorFlow Custom Model has the best " + metric_types[x])
        elif best_x_metric == log_cv_test_metrics[x]:
            print("The LogisticRegressionCV Model has the best " + metric_types[x])
        else:
            print("The XGBoost Model has the best " + metric_types[x])

        if not(best_x_metric == tf_test_metrics[x]):
            print("Difference Between Best Model and TensorFlow Custom Model for " + metric_types[x] + ": " + str(best_x_metric - tf_test_metrics[x]) + " -> " + str(100*(best_x_metric - tf_test_metrics[x])) + "% difference")

In [48]:
# Dump the fitted scalers.
def dump_fitted_scalers(fitted_stand_scaler, fitted_rob_scaler):
    # Following two lines of code generated by ChatGPT 5.1 On 12/10/25
    joblib.dump(fitted_stand_scaler, "standard_scaler.pkl")
    joblib.dump(fitted_rob_scaler, "robust_scaler.pkl")

In [49]:
def main():
    # Load X and y
    X, y, df = load_data("heart.csv")

    # Print whether there are any missing values in the dataset
    are_there_missing_vals(df)

    # Engineer new features, append them to X, and return the new X.
    engineered_X = feature_engineer_train_data(X)

    # Perform proper train/validation/test split
    X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split(engineered_X, y, val_size=0.15, test_size=0.15)

    # Document split ratios.
    document_split_ratios(val_size=0.15, test_size=0.15, y_train=y_train, y_val=y_val, y_test=y_test)

    # Preprocess the train data and get the fitted pipeline.
    fitted_stand_scaler, fitted_rob_scaler, X_train_preprocessed, y_train_preprocessed = preprocess_train_data(X_train, y_train)

    # Transform the validation data.
    X_val_transformed = preprocess_eval_data(X_val, fitted_stand_scaler, fitted_rob_scaler)

    # Transform the test data.
    X_test_transformed = preprocess_eval_data(X_test, fitted_stand_scaler, fitted_rob_scaler)

    """
    Train the custom TensorFlow binary classification MLP (the model that will be deployed) and evaluate its performance on the test data
    """
    # Run the training pipeline and get a custom TensorFlow model that is adequately trained on the training data and that generalizes well to unseen data.
    tf_mlp = run_tf_training_pipeline(17, X_train_preprocessed, y_train_preprocessed, X_val_transformed, y_val)

    # Evaluate the custom TensorFlow model's performance on the test data.
    tf_test_accuracy, tf_test_precision, tf_test_recall, tf_test_auc = evaluate_tf_model(tf_mlp, X_test_transformed, y_test)

    # Export state dictionary of custom TensorFlow model.
    tf_mlp.save("tf_mlp.keras") # Generated by ChatGPT 5.1 On 12/9/25

    # Dump the fitted scalers.
    dump_fitted_scalers(fitted_stand_scaler, fitted_rob_scaler)

    # Rubric Item Covered: Compared multiple model architectures or approaches quantitatively (5 pts)

    """
    Train a Logistic Regression CV model (with L2 penalty) on the training data and evaluate its performance on the test data
    """
    log_reg_cv = run_log_reg_cv_training_pipeline(X_train_preprocessed, y_train_preprocessed)
    log_reg_cv_test_accuracy, log_reg_cv_test_precision, log_reg_cv_test_recall, log_reg_cv_test_auc = evaluate_log_reg_cv_model(log_reg_cv, X_test_transformed, y_test)

    """
    Train a XGBoost model on the training data and evaluate its performance on the test data
    """
    xgb_model = run_xgb_model_training_pipeline(X_train_preprocessed, y_train_preprocessed, X_val_transformed, y_val)
    xgb_model_test_accuracy, xgb_model_test_precision, xgb_model_test_recall, xgb_model_test_auc = evaluate_xgb_model(xgb_model, X_test_transformed, y_test)

    # Compare the custom TensorFlow model's test metrics with those of the Logistic Regression model and the XGBoost model.
    tf_test_metrics = [tf_test_accuracy, tf_test_precision, tf_test_recall, tf_test_auc]
    log_cv_test_metrics = [log_reg_cv_test_accuracy, log_reg_cv_test_precision, log_reg_cv_test_recall, log_reg_cv_test_auc]
    xgb_model_test_metrics = [xgb_model_test_accuracy, xgb_model_test_precision, xgb_model_test_recall, xgb_model_test_auc]
    compare_test_metrics(tf_test_metrics, log_cv_test_metrics, xgb_model_test_metrics)

if __name__ == '__main__':
    main()

-------------------------------------------------------------------
Number of Missing Values per Column:
age: 0
sex: 0
cp: 0
trestbps: 0
chol: 0
fbs: 0
restecg: 0
thalach: 0
exang: 0
oldpeak: 0
slope: 0
ca: 0
thal: 0
target: 0
Verdict: There are no missing values in the dataset. No need to worry about addressing any missing values later on in data preprocessing.
-------------------------------------------------------------------
Documented Split Ratios:
Train Split Ratio: 0.7
Train Split Size: 717
Validation Split Ratio: 0.15
Validation Split Size: 154
Test Split Ratio: 0.15
Test Split Size: 154
-------------------------------------------------------------------
Evidence of Impact (Class Imbalance):
Number of target = 0 Samples in y_train (pre-SMOTE): 0
Number of target = 1 Samples in y_train (pre-SMOTE): 370
Number of target = 0 Samples in y_train_resampled (post-SMOTE): 0
Number of target = 1 Samples in y_train_resampled (post-SMOTE): 370
---------------------------------------------

Parameters: { "deterministic_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()


-------------------------------------------------------------------
TensorFlow Custom Model Test Accuracy: 0.9610389471054077
LogisticRegressionCV Model Test Accuracy: 0.7857142857142857
XGBoost Model Test Accuracy: 0.9675324675324676
The XGBoost Model has the best Test Accuracy
Difference Between Best Model and TensorFlow Custom Model for Test Accuracy: 0.006493520427059862 -> 0.6493520427059862% difference
-------------------------------------------------------------------
TensorFlow Custom Model Test Precision: 0.9784075617790222
LogisticRegressionCV Model Test Precision: 0.7415730337078652
XGBoost Model Test Precision: 0.9733333333333334
The TensorFlow Custom Model has the best Test Precision
-------------------------------------------------------------------
TensorFlow Custom Model Test Recall: 0.9487179517745972
LogisticRegressionCV Model Test Recall: 0.868421052631579
XGBoost Model Test Recall: 0.9605263157894737
The XGBoost Model has the best Test Recall
Difference Between Best