In [None]:
# Load necessary libraries
library(pROC)
library(PRROC)
library(tidyverse)

In [None]:
# Move into the sibling data directory and read the iris table; the first row
# of the file supplies the column names.
# NOTE(review): setwd() changes the working directory for everything that runs
# afterwards, and re-running this cell resolves "../data" from the new
# location. Confirm no later cell depends on it before replacing it with a
# full path passed to read.csv().
setwd("../data")
iris.data <- read.csv(file = "iris.data.txt", header = TRUE)

In [None]:
# Keep only the rows whose class is not "Iris-setosa", so the remaining data
# can be treated as a binary classification problem
is_not_setosa <- iris.data$class != "Iris-setosa"
iris.datalog <- iris.data[is_not_setosa, ]

In [None]:
# Encode the outcome as a 0/1 indicator column:
# 1 when class is "Iris-versicolor", 0 for any other class
is_versicolor <- iris.datalog$class == "Iris-versicolor"
iris.datalog$class_binaria <- as.numeric(is_versicolor)

In [None]:
# Shorten the measurement column names. The lookup maps each existing column
# name (spelled exactly as in the data file header) to its short replacement;
# columns that are not listed keep their current name.
short_names <- c(
  sepalenght = "sl",
  petalenght = "pl",
  sepalwidth = "sw",
  petalwidth = "pw"
)
to_rename <- names(iris.datalog) %in% names(short_names)
old_names <- names(iris.datalog)[to_rename]
names(iris.datalog)[to_rename] <- unname(short_names[old_names])

In [None]:
# Refit the logistic regression on a reduced predictor set (sw, pl, pw) with
# the 0/1 column class_binaria as the response
model2 <- glm(
  formula = class_binaria ~ sw + pl + pw,
  family = "binomial",
  data = iris.datalog
)
summary(model2)
# Recorded results: AIC = 21.266, null deviance = 138.629,
# residual deviance = 13.266 (improvement)

In [None]:
# After verifying the model, turn the fitted probabilities into class labels
# using a fixed classification threshold.
# A low threshold is used as a starting point because the optimal one is
# unknown.
# NOTE(review): nothing in this cell shows that 0.1 minimizes the
# classification error; treat it as provisional.

threshold <- 0.1

# Actual values (ground truth)
true_values <- iris.datalog$class_binaria

# Predicted values: 1 if the fitted probability exceeds the threshold, else 0
predicted_values <- as.numeric(model2$fitted.values > threshold)

# Confusion matrix (rows = actual class, columns = predicted class).
# Both classes are declared as factor levels so the table is always 2 x 2,
# even when the threshold puts every prediction into a single class; without
# this, table() would drop the empty column and positional indexing such as
# confusion_matrix[1, 2] would fail or point at the wrong cell.
class_levels <- c(0, 1)
confusion_matrix <- table(
  true_values = factor(true_values, levels = class_levels),
  predicted_values = factor(predicted_values, levels = class_levels)
)
confusion_matrix

In [None]:
# The confusion matrix contains:
# - True Positives (TP): 1s classified as 1s
# - True Negatives (TN): 0s classified as 0s
# - False Positives (FP): 0s classified as 1s
# - False Negatives (FN): 1s classified as 0s

# Re-seat the counts on a fixed 2 x 2 grid addressed by class label.
# Positional indexing ([1, 2], diag()) is only correct when both classes occur
# among the predictions; if one is missing the table loses a column and the
# positions no longer correspond to TN / FP / FN / TP. Cells that are absent
# from confusion_matrix stay at 0.
class_labels <- c("0", "1")
counts <- matrix(
  0,
  nrow = 2, ncol = 2,
  dimnames = list(actual = class_labels, predicted = class_labels)
)
counts[rownames(confusion_matrix), colnames(confusion_matrix)] <-
  confusion_matrix

n_tn <- counts["0", "0"]
n_fp <- counts["0", "1"]
n_fn <- counts["1", "0"]
n_tp <- counts["1", "1"]
n_total <- sum(counts)

# Performance metrics:
# Accuracy: proportion of correctly classified cases (rounded to 2 decimals)
accuracy <- round((n_tp + n_tn) / n_total, 2)
accuracy

# Misclassification rate: proportion of incorrectly classified cases
# (rounded to 2 decimals)
misclassification_rate <- round((n_fp + n_fn) / n_total, 2)
misclassification_rate

# Sensitivity (True Positive Rate): TP / (TP + FN)
sensitivity <- n_tp / (n_tp + n_fn)
sensitivity

# Specificity (True Negative Rate): TN / (TN + FP)
specificity <- n_tn / (n_tn + n_fp)
specificity

# Draw the ROC curve of the fitted probabilities to help choose a threshold.
# The scores of all observations go in scores.class0 and their 0/1 labels are
# passed through weights.class0; curve = TRUE keeps the curve points so that
# plot() can draw them.
fit2 <- model2$fitted.values

PRROC_obj <- roc.curve(
  scores.class0 = fit2,
  weights.class0 = as.numeric(iris.datalog$class_binaria),
  curve = TRUE
)
plot(PRROC_obj)

# From the ROC plot, observe that the curve is nearly optimal.
# NOTE(review): the plot summarizes all thresholds at once and does not by
# itself show that 0.1 minimizes the error. Compare misclassification_rate
# (or sensitivity + specificity) across candidate thresholds before keeping
# 0.1.