# <ins>Model Building File</ins>

In [44]:
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

# Read the processed CSV into a DataFrame and preview its first rows
engineered_csv = '../data/processed/01_heart_engineered.csv'
df = pd.read_csv(engineered_csv)
df.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up,Age_x_Oldpeak,Age_x_Cholesterol,Oldpeak_sq
0,40,0,140.0,289.0,0,172,0,0.0,0,1,0,0,1,0,0,1,0.0,11560.0,0.0
1,49,1,160.0,180.0,0,156,0,1.0,1,0,1,0,1,0,1,0,49.0,8820.0,1.0
2,37,0,130.0,283.0,0,98,0,0.0,0,1,0,0,0,1,0,1,0.0,10471.0,0.0
3,48,1,138.0,214.0,0,108,1,1.5,1,0,0,0,1,0,1,0,72.0,10272.0,2.25
4,54,0,150.0,195.0,0,122,0,0.0,0,0,1,0,1,0,0,1,0.0,10530.0,0.0


In [45]:
print("Separate features and target variable")
# The label column is pulled out as y; every remaining column is a feature.
target_col = 'HeartDisease'
y = df[target_col]
X = df.drop(columns=[target_col])

Separate features and target variable


In [46]:
print("Feature Scaling")
# Standardise every feature column with a StandardScaler fitted on X.
# NOTE(review): the scaler is fitted on ALL rows here, i.e. before any
# train/test split, so held-out rows influence the scaling statistics —
# confirm this is intended.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

Feature Scaling


In [47]:
print("Split the data into training and testing sets")
# Split the *unscaled* features first. Fitting the scaler on the full dataset
# (as X_scaled was) lets test-set means/variances leak into training, so the
# scaler is re-fitted below on the training rows only. The same random_state
# on the same number of rows yields the same row partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training split, then apply that same
# transform to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Split the data into training and testing sets
Training set shape: (734, 18)
Testing set shape: (184, 18)


In [48]:
# Accumulator for one metrics dict per trained model (filled in by later cells)
experiment_results = list()

In [49]:
print("------ Build Baseline Model ------")
# Fit a logistic regression (default settings, fixed seed) on the training split
log_reg = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = log_reg.predict(X_test)

# Compute the evaluation artefacts once, then report them
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_confusion = confusion_matrix(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)

print(f"Accuracy: {baseline_accuracy:.2f}\n")
print("Confusion Matrix:")
print(baseline_confusion)
print("\nClassification Report:")
print(baseline_report)

------ Build Baseline Model ------
Accuracy: 0.87

Confusion Matrix:
[[67 10]
 [14 93]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85        77
           1       0.90      0.87      0.89       107

    accuracy                           0.87       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.87      0.87       184



In [50]:
# Record the logistic regression's test-set metrics (precision/recall/F1 are
# computed for the positive class, label 1) for the final comparison table
log_reg_results = dict(
    model_name='Logistic Regression - 01_heart_engineered',
    accuracy=accuracy_score(y_test, y_pred),
    precision_1=precision_score(y_test, y_pred, pos_label=1),
    recall_1=recall_score(y_test, y_pred, pos_label=1),
    f1_score_1=f1_score(y_test, y_pred, pos_label=1),
    notes='Baseline model with all features',
)
experiment_results.append(log_reg_results)

# <ins>Observations</ins>
Using '01_heart_engineered' data:
- Model has an accuracy of 87%
- Model predicted: 67 True Negatives, 14 False Negatives, 10 False Positives, 93 True Positives

In [51]:
print("------ Random Forest Model ------")
# Train an untuned random forest (library defaults, fixed seed) and predict
# labels for the held-out rows
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("--- Random Forest Results ---")
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print(f"Accuracy: {rf_accuracy:.2f}\n")
print(rf_report)

------ Random Forest Model ------


--- Random Forest Results ---
Accuracy: 0.86

              precision    recall  f1-score   support

           0       0.82      0.84      0.83        77
           1       0.89      0.87      0.88       107

    accuracy                           0.86       184
   macro avg       0.85      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184



In [52]:
# Record the untuned random forest's test-set metrics (positive class = 1)
rf_results = dict(
    model_name='Random Forest - 01_heart_engineered',
    accuracy=accuracy_score(y_test, y_pred_rf),
    precision_1=precision_score(y_test, y_pred_rf, pos_label=1),
    recall_1=recall_score(y_test, y_pred_rf, pos_label=1),
    f1_score_1=f1_score(y_test, y_pred_rf, pos_label=1),
    notes='Untuned RF with all features',
)
experiment_results.append(rf_results)

# <ins>Observations</ins>
Using '01_heart_engineered' data:

- The random forest model performed similarly to the logistic regression model, so I may have reached the limit of prediction accuracy with the current features. I will now optimise the hyperparameters of the random forest model to try to improve its performance.

In [53]:
# Candidate hyperparameter values; every combination (2 * 3 * 2 * 2 = 24) is
# evaluated with 5-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,   # use all available cores
                           verbose=2)

grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
# Mean cross-validated score of the selected configuration; reported so the
# tuning result can be judged separately from the held-out test set.
print(f"Best mean CV score: {grid_search.best_score_:.2f}")

# best_estimator_ is the winning configuration refitted on the whole training split
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)

print("\n--- Tuned Random Forest Results ---")
# Print accuracy as the other model cells do, so all three are directly comparable
print(f"Accuracy: {accuracy_score(y_test, y_pred_best):.2f}\n")
print(classification_report(y_test, y_pred_best))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}

--- Tuned Random Forest Results ---
              precision    recall  f1-score   support

           0       0.81      0.83      0.82        77
           1       0.88      0.86      0.87       107

    accuracy                           0.85       184
   macro avg       0.84      0.85      0.84       184
weighted avg       0.85      0.85      0.85       184



In [54]:
# Record the grid-searched random forest's test-set metrics (positive class = 1)
tuned_rf_results = dict(
    model_name='Tuned Random Forest - 01_heart_engineered',
    accuracy=accuracy_score(y_test, y_pred_best),
    precision_1=precision_score(y_test, y_pred_best, pos_label=1),
    recall_1=recall_score(y_test, y_pred_best, pos_label=1),
    f1_score_1=f1_score(y_test, y_pred_best, pos_label=1),
    notes='GridSearchCV tuned RF',
)
experiment_results.append(tuned_rf_results)

In [55]:
# ----- Compare all experiment results -----
# One row per model, built from the metric dicts collected above
results_df = pd.DataFrame(experiment_results)

# Display the comparison table
print("\n--- Model Comparison ---")
print(results_df)

# Append this run's rows to the results log. The header row is written only
# when the file does not exist yet (or is empty), so a newly created log has
# column names while an existing one does not receive a second header line.
# NOTE: a log created by earlier runs of this cell has no header row; that
# file is appended to unchanged.
results_path = '../model_results.csv'
results_file = Path(results_path)
write_header = (not results_file.exists()) or results_file.stat().st_size == 0
results_df.to_csv(results_path, mode='a', header=write_header, index=False)
print(f"Results saved to {results_path}")


--- Model Comparison ---
                                  model_name  accuracy  precision_1  recall_1  \
0  Logistic Regression - 01_heart_engineered  0.869565     0.902913  0.869159   
1        Random Forest - 01_heart_engineered  0.858696     0.885714  0.869159   
2  Tuned Random Forest - 01_heart_engineered  0.847826     0.876190  0.859813   

   f1_score_1                             notes  
0    0.885714  Baseline model with all features  
1    0.877358      Untuned RF with all features  
2    0.867925             GridSearchCV tuned RF  
Results saved to ../model_results.csv


# <ins>Observations</ins>

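Hyperparameter tuning gave no improvement in accuracy (tuned random forest 0.85 vs. 0.86 untuned and 0.87 for logistic regression); therefore, more feature engineering is required to create new, more informative features from the current set.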