# Assignment 4 - Part 1: Predicting Heart Disease Using a Classification Tree (R)

This notebook implements a classification tree model to predict whether a person is likely to have heart disease using R.

In [None]:
# Load necessary libraries
library(rpart)
library(rpart.plot)
library(caret)
library(ggplot2)
library(dplyr)

# Fix the random number generator state so that the random steps later in
# this notebook give the same results on every run
set.seed(seed = 123)

## 1.1 Data Cleaning (2 points)

In [None]:
# Names for the 14 columns of the data file, which has no header row.
# The last column, `hd`, is the heart-disease field used to build the target.
column_names <- c(
  "age", "sex", "cp", "restbp", "chol", "fbs", "restecg",
  "thalach", "exang", "oldpeak", "slope", "ca", "thal", "hd"
)

# Read the file; fields containing "?" are read in as NA
df <- read.csv(
  file = "../input/processed.cleveland.data",
  header = FALSE,
  col.names = column_names,
  na.strings = "?"
)

# Report the dimensions (rows, columns) and preview the first rows
cat("Original dataset shape:", dim(df), "\n")
head(df, n = 6L)

In [None]:
# Count the NA entries in each column
cat("Missing values per column:\n")
na_mask <- is.na(df)
colSums(na_mask)

# Drop every row that contains at least one NA, then report the new dimensions
df <- na.omit(df)
cat("\nDataset shape after removing missing values:", dim(df), "\n")

In [None]:
# Collapse `hd` into a binary target: y = 1 when hd is above zero
# (heart disease present) and y = 0 otherwise
df$y <- as.numeric(df$hd > 0)
cat("Distribution of target variable:\n")
table(df$y)

# `hd` is now encoded by `y`; drop it so it cannot be used as a predictor
df[["hd"]] <- NULL

In [None]:
# Columns to treat as categories rather than numeric measurements
categorical_vars <- c(
  "sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"
)

# Replace each of those columns with its factor version, one at a time
for (var_name in categorical_vars) {
  df[[var_name]] <- as.factor(df[[var_name]])
}

# The target must also be a factor so the models treat it as a class label
df$y <- as.factor(df$y)

# Show the resulting column types
cat("Dataset structure:\n")
str(df)

## 1.2 Data Analysis (8 points)

### (1 point) Split data and plot classification tree

In [None]:
# Re-seed so the partition below is reproducible when this cell is run alone
set.seed(seed = 123)

# Draw row indices for a 70% training sample with caret's createDataPartition;
# the rows that were not drawn form the test set
train_index <- createDataPartition(y = df$y, p = 0.7, list = FALSE)
test_data <- df[-train_index, ]
train_data <- df[train_index, ]

# Report the size of each subset
cat("Training set size:", nrow(train_data), "\n")
cat("Test set size:", nrow(test_data), "\n")

In [None]:
# Train a classification tree with cost-complexity pruning switched off.
# rpart's default is cp = 0.01, which already refuses any split that does not
# improve the overall fit by that factor, so the default tree is a pruned
# tree. cp = 0 removes that stopping rule and gives a genuine "before pruning"
# baseline. The remaining rpart.control() defaults (e.g. minsplit, minbucket)
# still limit how far the tree can grow.
tree_model <- rpart(y ~ ., data = train_data, method = "class",
                    control = rpart.control(cp = 0))

# Plot the classification tree into a PNG file.
# tryCatch(..., finally = ) closes the graphics device even when plotting
# fails, so later plots are not silently redirected into this file.
png('../output/classification_tree_before_pruning_R.png', width = 1200, height = 800)
invisible(tryCatch(
  rpart.plot(tree_model, main = "Classification Tree (Before Pruning)",
             extra = 104, box.palette = "RdBu", shadow.col = "gray"),
  finally = dev.off()
))

# Display the complexity-parameter table of the fitted tree
cat("Tree complexity parameters:\n")
printcp(tree_model)

### (2 points) Plot confusion matrix and interpret results

In [None]:
# Predict a class label for every test-set row with the unpruned tree
predictions <- predict(tree_model, test_data, type = "class")

# Build the confusion matrix of predicted vs. actual labels.
# With a two-level factor caret uses the FIRST level ("0", no heart disease)
# as the positive class unless told otherwise, which would make the printed
# sensitivity, specificity and predictive values describe the healthy class.
# positive = "1" makes them describe detection of heart disease.
cm <- confusionMatrix(predictions, test_data$y,
                      positive = "1",
                      dnn = c("Predicted", "Actual"))
print(cm)

# Reshape the 2x2 table to long format (one row per cell) for plotting and
# replace the 0/1 codes with readable labels
cm_table <- as.data.frame(cm$table)
colnames(cm_table) <- c("Predicted", "Actual", "Freq")
cm_table$Predicted <- ifelse(cm_table$Predicted == "0", "Does not have HD", "Has HD")
cm_table$Actual <- ifelse(cm_table$Actual == "0", "Does not have HD", "Has HD")

# Heat-map of the confusion matrix with the cell counts printed on the tiles
p <- ggplot(cm_table, aes(x = Predicted, y = Actual, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), size = 8) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(title = "Confusion Matrix (Before Pruning)",
       x = "Predicted Label", y = "True Label") +
  theme_minimal() +
  theme(text = element_text(size = 14))

# Write the figure to disk
ggsave('../output/confusion_matrix_before_pruning_R.png', p, width = 8, height = 6, dpi = 300)

**Interpretation:**
- The confusion matrix shows the performance of our classification model.
- True Negatives: correctly predicted individuals without heart disease
- True Positives: correctly predicted individuals with heart disease
- False Positives: incorrectly predicted as having heart disease
- False Negatives: incorrectly predicted as not having heart disease (more concerning in medical diagnosis)

### (1.5 points) Fix overfitting using cross-validation

In [None]:
# Candidate pruning strengths: 50 values of alpha, evenly spaced on a log
# scale from exp(-10) up to 0.05. They are later passed to rpart as `cp`.
alpha_values <- exp(
  seq(from = log(exp(-10)), to = log(0.05), length.out = 50)
)

# Report how many candidates there are and the interval they cover
alpha_range <- range(alpha_values)
cat("Number of alpha values:", length(alpha_values), "\n")
cat("Alpha range:", alpha_range[1], "to", alpha_range[2], "\n")

In [None]:
# Perform 4-fold cross-validation for each alpha.
# The folds are created ONCE, before the loop over alpha, so that every
# candidate alpha is scored on exactly the same train/validation partitions.
# Drawing new random folds for each alpha would mix partition noise into the
# comparison between alphas.
set.seed(123)
n_folds <- 4
folds <- createFolds(train_data$y, k = n_folds)
mean_accuracies <- numeric(length(alpha_values))

for (i in seq_along(alpha_values)) {
  accuracies <- numeric(length(folds))

  for (j in seq_along(folds)) {
    # Fold j is held out for validation; the remaining rows are used to fit
    val_idx <- folds[[j]]
    train_cv <- train_data[-val_idx, ]
    val_cv <- train_data[val_idx, ]

    # Train model with the current alpha as rpart's complexity parameter
    model_cv <- rpart(y ~ ., data = train_cv, method = "class",
                      control = rpart.control(cp = alpha_values[i]))

    # Accuracy = share of validation rows whose predicted class is correct
    pred_cv <- predict(model_cv, val_cv, type = "class")
    accuracies[j] <- mean(pred_cv == val_cv$y)
  }

  # Cross-validated accuracy for this alpha: average over the folds
  mean_accuracies[i] <- mean(accuracies)
}

# Find optimal alpha: the one with the highest mean accuracy.
# which.max() returns the first maximum, i.e. the smallest alpha among ties.
optimal_idx <- which.max(mean_accuracies)
optimal_alpha <- alpha_values[optimal_idx]
optimal_accuracy <- mean_accuracies[optimal_idx]

cat("Optimal alpha:", optimal_alpha, "\n")
cat("Optimal cross-validation accuracy:", optimal_accuracy, "\n")

### (1.5 points) Plot Inaccuracy Rate vs Alpha

In [None]:
# Inaccuracy rate is the complement of the cross-validated accuracy
inaccuracy_rates <- 1 - mean_accuracies

# One row per candidate alpha, ready for ggplot
plot_df <- data.frame(alpha = alpha_values, inaccuracy = inaccuracy_rates)

# Text and vertical position of the annotation marking the selected alpha
optimal_label <- paste("Optimal α =", round(optimal_alpha, 6))
label_y <- max(inaccuracy_rates) * 0.9

# Build the figure layer by layer: the curve with its points first ...
p <- ggplot(data = plot_df, mapping = aes(x = alpha, y = inaccuracy))
p <- p + geom_line() + geom_point(size = 2)

# ... then a dashed red line and label at the selected alpha ...
p <- p +
  geom_vline(xintercept = optimal_alpha, color = "red",
             linetype = "dashed", size = 1) +
  annotate("text", x = optimal_alpha * 2, y = label_y,
           label = optimal_label, color = "red")

# ... and finally the log-scaled x axis, titles and theme
p <- p +
  scale_x_log10() +
  labs(title = "Inaccuracy Rate vs Alpha",
       x = "Alpha (log scale)",
       y = "Inaccuracy Rate (1 - Accuracy)")
p <- p + theme_minimal() + theme(text = element_text(size = 12))

# Write the figure to disk
ggsave(filename = '../output/inaccuracy_vs_alpha_R.png', plot = p,
       width = 10, height = 6, dpi = 300)

### (2 points) Plot pruned tree and confusion matrix with optimal alpha

In [None]:
# Train a classification tree on the full training set, using the alpha
# selected by cross-validation as rpart's complexity parameter
tree_pruned <- rpart(y ~ ., data = train_data, method = "class",
                     control = rpart.control(cp = optimal_alpha))

# Plot the pruned classification tree into a PNG file.
# tryCatch(..., finally = ) closes the graphics device even when plotting
# fails, so later plots are not silently redirected into this file.
png('../output/classification_tree_after_pruning_R.png', width = 1200, height = 800)
invisible(tryCatch(
  rpart.plot(tree_pruned,
             main = paste("Classification Tree (After Pruning with α =", round(optimal_alpha, 6), ")"),
             extra = 104, box.palette = "RdBu", shadow.col = "gray"),
  finally = dev.off()
))

# Display the complexity-parameter table of the pruned tree
cat("Pruned tree complexity parameters:\n")
printcp(tree_pruned)

In [None]:
# Predict a class label for every test-set row with the pruned tree
predictions_pruned <- predict(tree_pruned, test_data, type = "class")

# Build the confusion matrix of predicted vs. actual labels.
# With a two-level factor caret uses the FIRST level ("0", no heart disease)
# as the positive class unless told otherwise, which would make the printed
# sensitivity, specificity and predictive values describe the healthy class.
# positive = "1" makes them describe detection of heart disease.
cm_pruned <- confusionMatrix(predictions_pruned, test_data$y,
                             positive = "1",
                             dnn = c("Predicted", "Actual"))
print(cm_pruned)

# Reshape the 2x2 table to long format (one row per cell) for plotting and
# replace the 0/1 codes with readable labels
cm_table_pruned <- as.data.frame(cm_pruned$table)
colnames(cm_table_pruned) <- c("Predicted", "Actual", "Freq")
cm_table_pruned$Predicted <- ifelse(cm_table_pruned$Predicted == "0", "Does not have HD", "Has HD")
cm_table_pruned$Actual <- ifelse(cm_table_pruned$Actual == "0", "Does not have HD", "Has HD")

# Heat-map of the confusion matrix with the cell counts printed on the tiles
p <- ggplot(cm_table_pruned, aes(x = Predicted, y = Actual, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), size = 8) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(title = "Confusion Matrix (After Pruning)",
       x = "Predicted Label", y = "True Label") +
  theme_minimal() +
  theme(text = element_text(size = 14))

# Write the figure to disk
ggsave('../output/confusion_matrix_after_pruning_R.png', p, width = 8, height = 6, dpi = 300)

**Discussion:**

After pruning the decision tree using the optimal alpha value obtained through 4-fold cross-validation:

1. **Tree Complexity:** Pruning is expected to produce a simpler tree, with fewer nodes and a lower depth than the unpruned tree, which reduces overfitting. Confirm this by comparing the two tree plots and the two `printcp` tables.

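2. **Model Performance:** Compare the accuracy, sensitivity, and specificity printed with the two confusion matrices. On a test set of this size, the pruned and unpruned models often score similarly; the key benefit of pruning is better generalization to unseen data, not necessarily a large gain in test accuracy.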

3. **Interpretability:** The simpler pruned tree is easier to interpret and explain, which is crucial in medical applications.

4. **Cross-validation:** The use of cross-validation helps ensure that the selected alpha parameter leads to a model that generalizes well to unseen data.

5. **Trade-off:** There's a trade-off between model complexity and performance. The optimal alpha balances this trade-off by preventing overfitting while maintaining good predictive accuracy.