In [None]:
library(readr)
library(dplyr)
library(tidyr)
library(caret)
library(rpart)
library(rpart.plot)
library(ggplot2)
library(MLmetrics)

# Fix the RNG seed so the random steps later in this script (the train/test
# partition and the cross-validation folds) are reproducible between runs.
set.seed(123)

In [None]:
# Read the processed Cleveland data file with col_names = FALSE (the first row
# is treated as data, not as a header) and attach the 14 column names in the
# same expression.
# NOTE(review): the path is absolute and machine-specific; confirm it before
# running this on another machine.
df <- setNames(
  read_csv(
    "C:/Users/Matias/OneDrive/Documentos/jupyter/CausalAI-Course/data/processed.cleveland.data",
    col_names = FALSE
  ),
  c(
    "age", "sex", "cp", "restbp", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "hd"
  )
)

In [None]:
# Flag every cell that holds the literal "?" placeholder as missing (NA).
is.na(df) <- df == "?"

In [None]:
# Coerce every column to numeric, then discard any row that still contains a
# missing value.
df <- df %>%
  mutate(across(everything(), ~ as.numeric(.x))) %>%
  na.omit()

In [None]:
# Binary target: 1 when hd is strictly positive, 0 otherwise.
df <- mutate(df, y = as.numeric(hd > 0))

In [None]:
# Columns to be handled as categorical predictors.
categorical_vars <- c(
  "sex", "cp", "fbs", "restecg",
  "exang", "slope", "ca", "thal"
)

# Convert each of those columns to a factor, leaving the others untouched.
df[categorical_vars] <- lapply(df[categorical_vars], as.factor)

In [None]:
# Build a caret dummy-variable encoder over all columns (full-rank
# parameterisation), apply it to the same data, and store the encoded result
# back as a plain data.frame.
encoder <- dummyVars("~ .", data = df, fullRank = TRUE)
df <- as.data.frame(predict(encoder, newdata = df))

In [None]:
# Columns that must not be used as predictors: the binary target "y" and,
# when it is present, the raw "hd" column the target was derived from.
drop_cols <- c("y", if ("hd" %in% colnames(df)) "hd")

# Predictor table and target vector.
X <- select(df, -all_of(drop_cols))
y <- df[["y"]]

In [None]:
# Hold out 10% of the rows for testing, stratified by class.
# createDataPartition() balances class proportions only when the outcome is a
# factor; a numeric outcome is binned by percentiles instead, which for a 0/1
# vector does not separate the two classes. Wrapping y in factor() makes the
# partition stratified on the class label.
train_index <- createDataPartition(factor(y), p = 0.9, list = FALSE)

# Rows selected by train_index form the training set; the rest is the test set.
X_train <- X[train_index, ]
X_test  <- X[-train_index, ]
y_train <- y[train_index]
y_test  <- y[-train_index]

In [None]:
# Fit a classification tree with the complexity parameter set to 0, i.e. no
# cost-complexity penalty is applied while growing the tree.
tree <- rpart(
  formula = y_train ~ .,
  data = X_train,
  method = "class",
  control = rpart.control(cp = 0)
)

# Draw the fitted tree.
rpart.plot(
  tree,
  type = 2,
  extra = 104,
  box.palette = "Blues",
  main = "Decision Tree (sin poda)"
)

In [None]:
# Evaluate the unpruned tree on the held-out rows.
y_pred <- predict(tree, X_test, type = "class")

# The class predictions are a factor while y_test is the numeric 0/1 vector.
# Build the reference once as a factor that carries the SAME levels as the
# predictions, so that (a) the comparison below is factor-vs-factor rather
# than relying on implicit coercion, and (b) table()/confusionMatrix() still
# get matching level sets even if a class happens to be absent from the
# held-out rows.
y_test_fct <- factor(y_test, levels = levels(y_pred))

cm <- table(Predicted = y_pred, Actual = y_test_fct)

# Share of held-out rows classified correctly.
acc_unpruned <- mean(y_pred == y_test_fct)
cat(sprintf("Accuracy (árbol sin poda): %.3f\n", acc_unpruned))

print(cm)
# Full caret report, treating class "1" as the positive class.
confusionMatrix(data = y_pred, reference = y_test_fct, positive = "1")

In [None]:
# Candidate complexity parameters: 50 values evenly spaced on the log scale,
# from exp(-10) up to 0.05.
alphas <- exp(seq(from = -10, to = log(0.05), length.out = 50))

# One slot per candidate for its cross-validated error rate (filled in later).
inaccuracies <- rep(0, length(alphas))

In [None]:
# 4-fold cross-validation of every candidate cp in a SINGLE train() call.
# Calling train() once per candidate (as a loop would) draws a fresh set of
# random folds for each value, so candidates are compared on different splits
# and the final model is refit once per candidate. Passing the whole grid at
# once scores all candidates on the same folds and is far cheaper.
cv_control <- trainControl(method = "cv", number = 4)

tree_cv <- train(
  x = X_train, y = as.factor(y_train),
  method = "rpart",
  trControl = cv_control,
  tuneGrid = data.frame(cp = alphas)
)

# Align the per-cp results with the order of `alphas` explicitly instead of
# relying on the row order of the results table.
cv_results <- tree_cv$results[order(tree_cv$results$cp), ]
stopifnot(nrow(cv_results) == length(alphas))

# Cross-validated error rate (1 - accuracy) for each candidate.
inaccuracies <- numeric(length(alphas))
inaccuracies[order(alphas)] <- 1 - cv_results$Accuracy

In [None]:
# Table for plotting: log of each candidate cp next to its CV error rate.
df_alpha <- data.frame(log_alpha = log(alphas), Inaccuracy = inaccuracies)

# Line-and-point plot of the CV error rate against log(alpha).
ggplot(data = df_alpha, mapping = aes(x = log_alpha, y = Inaccuracy)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(
    x = "log(alpha)",
    y = "1 - Accuracy",
    title = "Inaccuracy Rate vs log(alpha) — 4-fold CV"
  )

In [None]:
# Pick the candidate with the lowest CV error rate; which.min() returns the
# first minimum, so ties resolve to the earliest entry of `alphas`.
best_idx <- which.min(inaccuracies)
optimal_alpha <- alphas[best_idx]

best_msg <- sprintf("Optimal alpha (cp) encontrado por CV: %.6f\n", optimal_alpha)
cat(best_msg)

In [None]:
# Refit the classification tree on the training rows, this time using the
# complexity parameter selected by cross-validation.
tree_pruned <- rpart(
  formula = y_train ~ .,
  data = X_train,
  method = "class",
  control = rpart.control(cp = optimal_alpha)
)

# Draw the pruned tree, showing the chosen cp in the title.
pruned_title <- sprintf("Decision Tree (poda, cp=%.3g)", optimal_alpha)
rpart.plot(
  tree_pruned,
  type = 2,
  extra = 104,
  box.palette = "Greens",
  main = pruned_title
)

In [None]:
# Confusion matrix for the pruned tree: class predictions on the held-out
# rows, cross-tabulated against the true labels.
y_pred_pruned <- predict(tree_pruned, newdata = X_test, type = "class")

cm_pruned <- table(
  Predicted = y_pred_pruned,
  Actual = y_test
)

print(cm_pruned)

In [None]:
# Full caret report for the pruned tree, with class "1" as the positive class.
# The reference factor is built with the SAME levels as the predictions:
# as.factor(y_test) alone would only carry the classes that actually occur in
# the held-out rows, and confusionMatrix() needs both factors to share one
# level set.
pred_pruned_fct <- as.factor(y_pred_pruned)
confusionMatrix(
  data = pred_pruned_fct,
  reference = factor(y_test, levels = levels(pred_pruned_fct)),
  positive = "1"
)

In [None]:
# Held-out accuracy of the pruned tree, reported next to the unpruned one.
pruned_hits <- y_pred_pruned == y_test
acc_pruned <- mean(pruned_hits)

cat(sprintf(
  "Accuracy (sin poda): %.3f | Accuracy (poda): %.3f\n",
  acc_unpruned, acc_pruned
))