# Assignment 4 - Part 1: Predicting Heart Disease Using a Classification Tree

This notebook implements a classification tree model to predict whether a person is likely to have heart disease.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

# One seed for the whole notebook: NumPy's global RNG is seeded here, and the
# same constant is handed to the scikit-learn calls further down
RANDOM_STATE = 123
np.random.seed(seed=RANDOM_STATE)

## 1.1 Data Cleaning (2 points)

In [None]:
# Read the Cleveland data file. It is parsed as header-less, so the 14 column
# labels are supplied explicitly, and '?' entries are treated as missing.
column_names = [
    'age', 'sex', 'cp', 'restbp', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'hd',
]

df = pd.read_csv(
    '../input/processed.cleveland.data',
    header=None,
    names=column_names,
    na_values='?',
)

print("Original dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

In [None]:
# Report how many entries are missing in each column
print("Missing values per column:")
print(df.isna().sum())

# Keep only the rows that are complete in every column
df = df.dropna(axis=0, how='any')
print("\nDataset shape after removing missing values:", df.shape)

In [None]:
# Binary target: 1 when hd is greater than 0 (heart disease), 0 otherwise
y = df['hd'].gt(0).astype(int)
print("Distribution of target variable:")
print(y.value_counts())

# Feature matrix: every column except the original hd label
X = df.drop(columns='hd')

In [None]:
# Categorical predictors to convert to dummy variables; the remaining
# feature columns are left as they are
categorical_vars = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Encode from the cleaned frame (minus the hd label) instead of from X itself.
# The previous form, X = get_dummies(X, ...), could only be run once: on a
# second run the categorical columns were already replaced by dummies and the
# call failed. Building from df makes this cell safe to re-run.
# drop_first=True drops the first level of each categorical variable.
X = pd.get_dummies(df.drop(columns='hd'), columns=categorical_vars, drop_first=True)

print("Features after creating dummy variables:")
print(X.columns.tolist())
print("\nDataset shape:", X.shape)

## 1.2 Data Analysis (8 points)

### (1 point) Split data and plot classification tree

In [None]:
# Hold out 30% of the rows as the test set; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

In [None]:
# Train a classification tree without pruning
clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

# Plot the classification tree.
# feature_names is passed as a plain list of strings: some scikit-learn
# releases validate this argument strictly and reject a pandas Index such as
# X.columns, while a list is accepted everywhere.
plt.figure(figsize=(20, 10))
plot_tree(clf,
          feature_names=X.columns.tolist(),
          class_names=['No HD', 'Has HD'],  # order matches classes 0 and 1 of y
          filled=True,
          rounded=True,
          fontsize=10)
plt.title('Classification Tree (Before Pruning)', fontsize=16)
plt.savefig('../output/classification_tree_before_pruning.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Tree depth: {clf.get_depth()}")
print(f"Number of leaves: {clf.get_n_leaves()}")

### (2 points) Plot confusion matrix and interpret results

In [None]:
# Make predictions on test set
y_pred = clf.predict(X_test)

# Calculate confusion matrix.
# The label order is pinned to [0, 1] so the matrix is always 2x2 and its
# rows/columns line up with the tick labels and the cm[i, j] lookups below,
# even if one class happens to be absent from y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Plot confusion matrix (rows: true label, columns: predicted label)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=["Does not have HD", "Has HD"],
            yticklabels=["Does not have HD", "Has HD"])
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix (Before Pruning)')
plt.savefig('../output/confusion_matrix_before_pruning.png', dpi=300, bbox_inches='tight')
plt.show()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(f"True Negatives: {cm[0, 0]}")
print(f"False Positives: {cm[0, 1]}")
print(f"False Negatives: {cm[1, 0]}")
print(f"True Positives: {cm[1, 1]}")

**Interpretation:**
- The confusion matrix compares the true labels (rows) with the model's predicted labels (columns) on the held-out test set.
- True Negatives: correctly predicted individuals without heart disease
- True Positives: correctly predicted individuals with heart disease
- False Positives: incorrectly predicted as having heart disease
- False Negatives: incorrectly predicted as not having heart disease (more concerning in medical diagnosis)

### (1.5 points) Fix overfitting using cross-validation

In [None]:
# 50 candidate alphas, evenly spaced in natural-log space from e^-10 up to 0.05
log_alpha_min = np.log(np.exp(-10))
log_alpha_max = np.log(0.05)
alpha_values = np.logspace(log_alpha_min, log_alpha_max, num=50, base=np.e)

print(f"Number of alpha values: {len(alpha_values)}")
print(f"Alpha range: {alpha_values.min():.10f} to {alpha_values.max():.4f}")

In [None]:
# Perform 4-fold cross-validation for each alpha.
# cross_val_score is already imported in the imports cell at the top of the
# notebook, so it is not imported again here.

mean_accuracies = []
std_accuracies = []

for alpha in alpha_values:
    # Fresh tree per candidate; ccp_alpha is the cost-complexity pruning strength
    clf_temp = DecisionTreeClassifier(ccp_alpha=alpha, random_state=RANDOM_STATE)
    # Only the training split is used here, so the test set stays untouched
    scores = cross_val_score(clf_temp, X_train, y_train, cv=4, scoring='accuracy')
    mean_accuracies.append(scores.mean())
    std_accuracies.append(scores.std())

# Find optimal alpha (maximum mean accuracy).
# np.argmax returns the first maximum, so when several alphas tie, the
# smallest of them (the least pruning) is selected.
optimal_idx = np.argmax(mean_accuracies)
optimal_alpha = alpha_values[optimal_idx]
optimal_accuracy = mean_accuracies[optimal_idx]

print(f"Optimal alpha: {optimal_alpha:.10f}")
print(f"Optimal cross-validation accuracy: {optimal_accuracy:.4f}")

### (1.5 points) Plot Inaccuracy Rate vs Alpha

In [None]:
# Inaccuracy rate is the complement of the mean cross-validation accuracy
inaccuracy_rates = [1 - mean_acc for mean_acc in mean_accuracies]

# Draw the curve on an explicit figure/axes pair, with the chosen alpha marked
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(alpha_values, inaccuracy_rates, marker='o', markersize=3)
ax.axvline(x=optimal_alpha, color='r', linestyle='--',
           label=f'Optimal α = {optimal_alpha:.6f}')
ax.set_xscale('log')
ax.set_xlabel('Alpha (log scale)', fontsize=12)
ax.set_ylabel('Inaccuracy Rate (1 - Accuracy)', fontsize=12)
ax.set_title('Inaccuracy Rate vs Alpha', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)
fig.savefig('../output/inaccuracy_vs_alpha.png', dpi=300, bbox_inches='tight')
plt.show()

### (2 points) Plot pruned tree and confusion matrix with optimal alpha

In [None]:
# Train a classification tree with optimal alpha
clf_pruned = DecisionTreeClassifier(ccp_alpha=optimal_alpha, random_state=RANDOM_STATE)
clf_pruned.fit(X_train, y_train)

# Plot the pruned classification tree.
# feature_names is passed as a plain list of strings: some scikit-learn
# releases validate this argument strictly and reject a pandas Index such as
# X.columns, while a list is accepted everywhere.
plt.figure(figsize=(20, 10))
plot_tree(clf_pruned,
          feature_names=X.columns.tolist(),
          class_names=['No HD', 'Has HD'],  # order matches classes 0 and 1 of y
          filled=True,
          rounded=True,
          fontsize=10)
plt.title(f'Classification Tree (After Pruning with α = {optimal_alpha:.6f})', fontsize=16)
plt.savefig('../output/classification_tree_after_pruning.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Pruned tree depth: {clf_pruned.get_depth()}")
print(f"Pruned tree number of leaves: {clf_pruned.get_n_leaves()}")

In [None]:
# Make predictions with pruned tree
y_pred_pruned = clf_pruned.predict(X_test)

# Calculate confusion matrix for pruned tree.
# The label order is pinned to [0, 1] so the matrix is always 2x2 and its
# rows/columns line up with the tick labels and the cm_pruned[i, j] lookups
# below, even if one class happens to be absent from y_test and y_pred_pruned.
cm_pruned = confusion_matrix(y_test, y_pred_pruned, labels=[0, 1])

# Plot confusion matrix (rows: true label, columns: predicted label)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_pruned, annot=True, fmt='d', cmap='Blues',
            xticklabels=["Does not have HD", "Has HD"],
            yticklabels=["Does not have HD", "Has HD"])
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix (After Pruning)')
plt.savefig('../output/confusion_matrix_after_pruning.png', dpi=300, bbox_inches='tight')
plt.show()

# Calculate accuracy
accuracy_pruned = accuracy_score(y_test, y_pred_pruned)
print(f"Accuracy on test set (pruned): {accuracy_pruned:.4f}")
print("\nConfusion Matrix (Pruned):")
print(f"True Negatives: {cm_pruned[0, 0]}")
print(f"False Positives: {cm_pruned[0, 1]}")
print(f"False Negatives: {cm_pruned[1, 0]}")
print(f"True Positives: {cm_pruned[1, 1]}")

**Discussion:**

After pruning the decision tree using the optimal alpha value obtained through 4-fold cross-validation:

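1. **Tree Complexity:** Pruning removes branches whose contribution does not justify their added complexity, so the pruned tree should have fewer leaves and a lower depth than the unpruned tree (compare the depth and leaf counts printed for each tree), which reduces overfitting.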

2. **Model Performance:** Test-set accuracy after pruning may be similar to, slightly higher than, or slightly lower than that of the unpruned tree (compare the two test accuracies printed above). The key benefit is better generalization.

3. **Interpretability:** The simpler pruned tree is easier to interpret and explain, which is crucial in medical applications.

4. **Cross-validation:** The use of cross-validation helps ensure that the selected alpha parameter leads to a model that generalizes well to unseen data.

5. **Trade-off:** There's a trade-off between model complexity and performance. The optimal alpha balances this trade-off by preventing overfitting while maintaining good predictive accuracy.