In [2]:
# Load necessary libraries
library(readr)
library(caret)

# Load the dataset (replace the file path with where you downloaded your file)
data <- read_csv("diabetes.csv")

# View the first few rows of the dataset
head(data)

# Set the seed for reproducibility
set.seed(123)

# Split data into 80% training and 20% testing
splitIndex <- createDataPartition(data$Outcome, p = 0.8, list = FALSE)
trainData <- data[splitIndex, ]
testData <- data[-splitIndex, ]

# Train a logistic regression model
model <- glm(Outcome ~ ., data = trainData, family = binomial)

# Summarize the model to view coefficients
summary(model)

# Make predictions on the test set
predictions <- predict(model, newdata = testData, type = "response")

# Convert predictions to binary (0 or 1)
predictions_binary <- ifelse(predictions > 0.5, 1, 0)

# Calculate accuracy
confMatrix <- confusionMatrix(as.factor(predictions_binary), as.factor(testData$Outcome))

# Print the confusion matrix and accuracy score
confMatrix


[1mRows: [22m[34m768[39m [1mColumns: [22m[34m9[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
6,148,72,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
8,183,64,0,0,23.3,0.672,32,1
1,89,66,23,94,28.1,0.167,21,0
0,137,40,35,168,43.1,2.288,33,1
5,116,74,0,0,25.6,0.201,30,0



Call:
glm(formula = Outcome ~ ., family = binomial, data = trainData)

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)              -8.2216062  0.7781491 -10.566  < 2e-16 ***
Pregnancies               0.1185211  0.0357929   3.311 0.000929 ***
Glucose                   0.0352886  0.0041848   8.433  < 2e-16 ***
BloodPressure            -0.0130815  0.0057276  -2.284 0.022374 *  
SkinThickness            -0.0009780  0.0075288  -0.130 0.896648    
Insulin                  -0.0009111  0.0009841  -0.926 0.354533    
BMI                       0.0861702  0.0166338   5.180 2.21e-07 ***
DiabetesPedigreeFunction  0.7824888  0.3212008   2.436 0.014845 *  
Age                       0.0152434  0.0102944   1.481 0.138676    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 797.28  on 614  degrees of freedom
Residual deviance: 583.64  on 606  degrees of freedom


Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 91 22
         1 10 30
                                          
               Accuracy : 0.7908          
                 95% CI : (0.7178, 0.8523)
    No Information Rate : 0.6601          
    P-Value [Acc > NIR] : 0.0002786       
                                          
                  Kappa : 0.5063          
                                          
 Mcnemar's Test P-Value : 0.0518299       
                                          
            Sensitivity : 0.9010          
            Specificity : 0.5769          
         Pos Pred Value : 0.8053          
         Neg Pred Value : 0.7500          
             Prevalence : 0.6601          
         Detection Rate : 0.5948          
   Detection Prevalence : 0.7386          
      Balanced Accuracy : 0.7390          
                                          
       'Positive' Class : 0               
                                    