In [55]:
library(pROC)

# Location of the project folder on this machine
my_dir <- "C:/Users/GPEL_HPENVY/Desktop/Project_Predictive_Analytics_R"

# Move into the project folder when it is present; otherwise only report
# that it is missing and leave the working directory untouched
if (!dir.exists(my_dir)) {
  cat("Directory does not exist:", my_dir, "\n")
} else {
  setwd(my_dir)
  cat("Working directory set to:", getwd(), "\n")
}

Working directory set to: C:/Users/GPEL_HPENVY/Desktop/Project_Predictive_Analytics_R 


In [56]:
# Load the employee data from the current working directory.
# The commented-out setwd() is an earlier data location, kept for reference.
#setwd('H:/GPEL_HPENVY/Documents/UNSW/07_ZZBU6511/Assessment3/')
# fileEncoding = "UTF-8-BOM" makes R skip a leading byte-order mark, so the
# first header is read as "Age" rather than a locale-dependent mangled name
# such as "ï..Age".
# NOTE(review): assumes employees2.csv is UTF-8 encoded (the "ï.." prefix is
# the usual trace of a UTF-8 BOM) -- confirm against the actual file.
employees <- read.csv(file = 'employees2.csv', fileEncoding = "UTF-8-BOM")

# Fallback: strip the BOM residue from the column names if any is still there
colnames(employees) <- gsub("ï\\.{2}", "", colnames(employees))

In [57]:
# Preview the first six rows to check that the import looks sensible
head(employees, n = 6L)


Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,JobInvolvement,JobRole,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,HighPerformance
26,No,Travel_Frequently,Research & Development,10,3,3,Male,4,Laboratory Technician,...,1,1,8,3,3,7,7,7,7,0
33,No,Travel_Rarely,Research & Development,13,5,4,Male,3,Healthcare Representative,...,1,0,14,6,3,0,0,0,0,0
59,No,Travel_Rarely,Research & Development,5,3,3,Female,4,Laboratory Technician,...,1,3,12,3,2,1,0,0,0,1
42,No,Travel_Rarely,Research & Development,21,3,2,Male,4,Healthcare Representative,...,3,2,10,2,2,0,0,0,0,0
24,No,Travel_Rarely,Research & Development,27,1,4,Female,2,Manufacturing Director,...,2,2,5,6,3,4,2,3,2,0
29,No,Travel_Rarely,Research & Development,37,2,3,Male,4,Research Scientist,...,3,1,10,3,3,10,7,0,7,1


In [58]:
# Numeric 0/1 target: 1 when Attrition is "Yes", 0 otherwise
employees$Is.Attrition <- with(employees, ifelse(Attrition == "Yes", 1, 0))

In [59]:
# Reproducible random split: 80% of the rows go to training, the rest to testing
set.seed(57)
rows <- sample.int(n = nrow(employees), size = 0.8 * nrow(employees))

# Rows drawn above form the training set; all remaining rows form the test set
training <- employees[rows, ]
testing <- employees[-rows, ]

In [60]:
# Fit a logistic regression (binomial GLM) of the 0/1 attrition flag on the
# training rows.
# Over18 is not included because every response is "Yes", so it carries no
# information. Per the original note, only predictors that reached up to
# three-star significance were kept -- presumably judged from an earlier,
# wider fit that is not shown here.
logmod <- glm(
  formula = Is.Attrition ~ BusinessTravel +  EnvironmentSatisfaction +
    JobInvolvement + JobSatisfaction + OverTime,
  data = training,
  family = binomial()
)
# Print the fitted model's summary: the call, deviance residuals and the
# coefficient table with significance codes
summary(logmod)


Call:
glm(formula = Is.Attrition ~ BusinessTravel + EnvironmentSatisfaction + 
    JobInvolvement + JobSatisfaction + OverTime, family = binomial(), 
    data = training)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6704  -0.5627  -0.4045  -0.2457   2.7264  

Coefficients:
                                Estimate Std. Error z value Pr(>|z|)    
(Intercept)                      0.27663    0.70500   0.392 0.694772    
BusinessTravelTravel_Frequently  1.79198    0.52955   3.384 0.000715 ***
BusinessTravelTravel_Rarely      1.06411    0.50296   2.116 0.034370 *  
EnvironmentSatisfaction         -0.34251    0.09827  -3.485 0.000492 ***
JobInvolvement                  -0.65425    0.14743  -4.438 9.09e-06 ***
JobSatisfaction                 -0.42500    0.09856  -4.312 1.62e-05 ***
OverTimeYes                      1.53397    0.21990   6.976 3.04e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken

In [61]:
# Score the training rows with the fitted model and keep the result as a new
# column. type = "response" asks for probabilities instead of the default
# link-scale values.
training$Probability <- predict(logmod, newdata = training, type = "response")

In [62]:
# Compute the ROC curve for the training predictions and print its AUC.
# The formula reads response ~ predictor: the actual 0/1 attrition flag
# against the fitted probability.
# NOTE(review): this is measured on the same rows the model was fitted to, so
# the AUC is likely optimistic compared with unseen data.
roc(
  formula = training$Is.Attrition ~ training$Probability,
  # No plot is drawn; only the printed result is wanted
  plot = FALSE,
  # Compute the area under the curve so it is part of the printed result
  auc = TRUE
)

Setting levels: control = 0, case = 1
Setting direction: controls < cases



Call:
roc.formula(formula = training$Is.Attrition ~ training$Probability,     plot = FALSE, auc = TRUE)

Data: training$Probability in 679 controls (training$Is.Attrition 0) < 121 cases (training$Is.Attrition 1).
Area under the curve: 0.7698

In [63]:
# Classify each training row: predicted leaver (1) when the fitted
# probability reaches the 0.15 cut-off, otherwise stayer (0)
training$Prediction <- with(training, ifelse(Probability >= 0.15, 1, 0))

In [64]:
# Cross-tabulate the predicted class (rows) against the actual attrition flag
# (columns) for the training data.
# Both inputs are converted to factors with fixed levels 0 and 1 so the table
# is always 2 x 2, even if one class never occurs. Positional lookups such as
# [1, 1] and [2, 2] then always refer to the same cells instead of failing or
# silently pointing at the wrong one.
conf_matrix_train <- table(
  factor(training$Prediction, levels = c(0, 1)),
  factor(training$Is.Attrition, levels = c(0, 1))
)

# Show the confusion matrix
conf_matrix_train

   
      0   1
  0 482  35
  1 197  86

In [65]:
# Headline metrics from the training confusion matrix
# (rows = predicted class, columns = actual class).

# Accuracy: share of all rows that sit on the diagonal (correct predictions)
accuracy <- sum(diag(conf_matrix_train)) / sum(conf_matrix_train)

# Sensitivity: correctly flagged leavers out of all actual leavers (column 2)
sensitivity <- conf_matrix_train[2, 2] / colSums(conf_matrix_train)[[2]]

# Specificity: correctly cleared stayers out of all actual stayers (column 1)
specificity <- conf_matrix_train[1, 1] / colSums(conf_matrix_train)[[1]]

# Report the three figures
message("Accuracy: ", accuracy)
message("Sensitivity: ", sensitivity)
message("Specificity: ", specificity)

Accuracy: 0.71
Sensitivity: 0.710743801652893
Specificity: 0.709867452135493


In [66]:
# Evaluate the model on the held-out testing rows.
# Score them with the model fitted on the training data; type = "response"
# returns probabilities.
testing$ProbabilityTest <- predict(logmod, newdata = testing, type = "response")

# Apply the same 0.15 cut-off that was used for the training predictions
testing$PredictionTest <- ifelse(testing$ProbabilityTest >= 0.15, 1, 0)

# Predicted class (rows) against actual attrition flag (columns).
# Fixed factor levels 0 and 1 keep the table 2 x 2 even if the test sample
# lacks a predicted or actual class, so positional lookups stay valid.
conf_matrix_test <- table(
  factor(testing$PredictionTest, levels = c(0, 1)),
  factor(testing$Is.Attrition, levels = c(0, 1))
)
message("Confusion matrix for model 1 Final:")
conf_matrix_test

Confusion matrix for model 1 Final:


   
      0   1
  0 117   4
  1  54  25

In [67]:
# Headline metrics from the testing confusion matrix
# (rows = predicted class, columns = actual class).

# Accuracy: share of all rows that sit on the diagonal (correct predictions)
accuracy <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)

# Sensitivity: correctly flagged leavers out of all actual leavers (column 2)
sensitivity <- conf_matrix_test[2, 2] / colSums(conf_matrix_test)[[2]]

# Specificity: correctly cleared stayers out of all actual stayers (column 1)
specificity <- conf_matrix_test[1, 1] / colSums(conf_matrix_test)[[1]]

# Report the three figures
message("Accuracy: ", accuracy)
message("Sensitivity: ", sensitivity)
message("Specificity: ", specificity)


Accuracy: 0.71
Sensitivity: 0.862068965517241
Specificity: 0.684210526315789
