In [None]:
# Empty containers for the per-method mean silhouette widths
# (an argument-less c() is NULL, so NULL is assigned directly).
kmeans_silh <- NULL
agnes_silh <- NULL
pam_silh <- NULL
diana_silh <- NULL

In [None]:
# Mean silhouette width on the Euclidean dissimilarity matrix for every
# candidate number of clusters, for k-means, PAM, AGNES (single linkage)
# and DIANA.
k_seq <- 2:8

# Start from fresh, preallocated vectors so that re-running this cell
# overwrites the results instead of appending a second copy to them.
kmeans_silh <- numeric(length(k_seq))
pam_silh <- numeric(length(k_seq))
agnes_silh <- numeric(length(k_seq))
diana_silh <- numeric(length(k_seq))

# The hierarchical trees do not depend on k: build each of them once and
# only cut them at a different level inside the loop.
agnes_res <- agnes(diss_matrix_euc, method = "single", diss = TRUE)
diana_res <- diana(x = diss_matrix_euc, diss = TRUE)

for (idx in seq_along(k_seq)) {
  k <- k_seq[idx]

  # k-means runs on the one-hot encoded data; its partition is still scored
  # against the same dissimilarity matrix as the other methods.
  km_res <- kmeans(cluster_data_onehot, k, iter.max = 100, nstart = 100)
  sil_kmeans <- silhouette(km_res$cluster, dmatrix = diss_matrix_euc)
  kmeans_silh[idx] <- mean(sil_kmeans[, 3])  # column 3 holds the widths

  pam_res <- pam(x = diss_matrix_euc, diss = TRUE, k = k)
  sil_pam <- silhouette(pam_res$clustering, dmatrix = diss_matrix_euc)
  pam_silh[idx] <- mean(sil_pam[, 3])

  agnes_partition <- cutree(agnes_res, k = k)
  sil_agnes <- silhouette(agnes_partition, dmatrix = diss_matrix_euc)
  agnes_silh[idx] <- mean(sil_agnes[, 3])

  diana_partition <- cutree(diana_res, k = k)
  sil_diana <- silhouette(diana_partition, dmatrix = diss_matrix_euc)
  diana_silh[idx] <- mean(sil_diana[, 3])
}

In [None]:
# Plot the mean silhouette width of every clustering method against k and
# print the best value reached by each method.
options(repr.plot.width = 13, repr.plot.height = 8)
k_seq <- 2:8

# Long-format table: one row per (method, k) pair.
# NOTE: the column keeps its original (misspelled) name `Silhoulette` so that
# any other code reading silh_df keeps working; only the text shown on the
# plot is corrected below.
silh_df <- data.frame(
  k = rep(k_seq, times = 4),
  Silhoulette = c(kmeans_silh, pam_silh, agnes_silh, diana_silh),
  Method = rep(
    c('k-means', 'PAM', 'AGNES', 'DIANA'),
    times = c(length(kmeans_silh), length(pam_silh),
              length(agnes_silh), length(diana_silh))
  )
)

# Best silhouette value per method (one named entry per Method level).
max_silh <- tapply(silh_df$Silhoulette, silh_df$Method, max)

ggplot(silh_df, aes(x = k, y = Silhoulette, color = Method)) +
  geom_line(size = 1, linetype = "solid") +
  geom_point(size = 4) +
  labs(title = 'Silhouette index for different clustering methods',
       y = 'Silhouette index',
       x = 'k') +
  theme(axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14),
        axis.title = element_text(size = 18),
        plot.title = element_text(hjust = 0.5, size = 20),
        legend.position = c(0.90, 0.85),
        legend.text = element_text(size = 12),
        legend.title = element_text(size = 14),
        # fully transparent legend box so it does not hide the lines
        legend.background = element_rect(fill = alpha('white', 0)))

print(max_silh[1])
print(max_silh[2])
print(max_silh[3])
print(max_silh[4])

# Quality assessment

In [None]:
# TODO: remove -- temporary cut-down of every table to its first 1000 rows
data <- data[seq_len(1000), ]
test <- test[seq_len(1000), ]
train <- train[seq_len(1000), ]

In [None]:
# Row counts of the two splits; used when slicing the cluster label vectors.
n_test <- nrow(test)
n_train <- nrow(train)

In [None]:
# Slice every method's cluster labels to the last n_test / n_train entries.
# `kmeans`, `agnes`, `pam` and `diana` are presumably clustering results
# created earlier in the notebook (they shadow the function names) -- verify.
# NOTE(review): the train slices use tail() too, so both slices are taken
# from the END of the label vector. Confirm how `data` was assembled from
# train and test; one of the two sets probably needs head() instead.

# Test part
labels_kmeans_test <- tail(kmeans, n_test)
labels_agnes_test <- tail(agnes, n_test)
labels_pam_test <- tail(pam$clustering, n_test)
labels_diana_test <- tail(diana, n_test)

# Train part
labels_kmeans_train <- tail(kmeans, n_train)
labels_agnes_train <- tail(agnes, n_train)
labels_pam_train <- tail(pam$clustering, n_train)
labels_diana_train <- tail(diana, n_train)

In [None]:
# Reference labels (column `income_binary`) for the full data and both splits.
labels_real <- data[["income_binary"]]
labels_real_train <- train[["income_binary"]]
labels_real_test <- test[["income_binary"]]

In [None]:
# Agreement between a clustering and the reference labels: the `diag`
# component of compareMatchedClasses() with exact matching, as a plain double.
matched_accuracy <- function(pred, real) {
  as.double(compareMatchedClasses(pred, real, method = "exact")$diag)
}

# Full data
kmeans_acc <- matched_accuracy(kmeans, labels_real)
pam_acc <- matched_accuracy(pam$clustering, labels_real)
agnes_acc <- matched_accuracy(agnes, labels_real)
diana_acc <- matched_accuracy(diana, labels_real)

# Test split
kmeans_acc_test <- matched_accuracy(labels_kmeans_test, labels_real_test)
pam_acc_test <- matched_accuracy(labels_pam_test, labels_real_test)
agnes_acc_test <- matched_accuracy(labels_agnes_test, labels_real_test)
diana_acc_test <- matched_accuracy(labels_diana_test, labels_real_test)

# Train split
kmeans_acc_train <- matched_accuracy(labels_kmeans_train, labels_real_train)
pam_acc_train <- matched_accuracy(labels_pam_train, labels_real_train)
agnes_acc_train <- matched_accuracy(labels_agnes_train, labels_real_train)
diana_acc_train <- matched_accuracy(labels_diana_train, labels_real_train)

In [None]:
# Per-class metrics of a clustering measured against reference labels.
#
# pred: cluster labels; real: reference labels of the same length.
# Steps: cross-tabulate pred against real, pair clusters with classes via
# matchClasses(method = "exact"), reorder and relabel the table with the
# matched values, put "1" first in rows and columns (presumably so that "1"
# is treated as the positive class -- confirm), then hand the table to
# confusionMatrix().
# Returns the `byClass` component of the confusionMatrix() result.
scores <- function(pred, real) {
  contingency <- table(pred, real)
  matching <- matchClasses(contingency, method = "exact", verbose = FALSE)

  contingency <- contingency[names(matching), matching]
  rownames(contingency) <- as.numeric(matching)
  colnames(contingency) <- as.numeric(matching)

  # The table must contain both "1" and "0" after relabelling.
  contingency <- contingency[c("1", "0"), c("1", "0")]

  confusionMatrix(contingency, mode = "prec_recall")$byClass
}

In [None]:
# F1, sensitivity and precision of every method on the test split.
# The metric names in `byClass` are capitalised ("Sensitivity", "Precision",
# "F1") and name lookup is case-sensitive; `[[ ]]` is used so that a wrong
# name raises an error instead of silently yielding NA.
r <- scores(labels_kmeans_test, labels_real_test)
kmeans_f1 <- r[["F1"]]
kmeans_sensitivity <- r[["Sensitivity"]]
kmeans_precision <- r[["Precision"]]

r <- scores(labels_pam_test, labels_real_test)
pam_f1 <- r[["F1"]]
pam_sensitivity <- r[["Sensitivity"]]
pam_precision <- r[["Precision"]]

r <- scores(labels_agnes_test, labels_real_test)
agnes_f1 <- r[["F1"]]
agnes_sensitivity <- r[["Sensitivity"]]
agnes_precision <- r[["Precision"]]

r <- scores(labels_diana_test, labels_real_test)
diana_f1 <- r[["F1"]]
diana_sensitivity <- r[["Sensitivity"]]
diana_precision <- r[["Precision"]]

In [None]:
# Summary table: one row per clustering method, one column per quality
# measure. The sensitivity column reads the *_sensitivity variables computed
# from the test split (the previously referenced *_recall variables are not
# defined by the metric-extraction cell).
results.init <- data.frame(
  acc_full = c(kmeans_acc, pam_acc, agnes_acc, diana_acc),
  acc_train = c(kmeans_acc_train, pam_acc_train,
                agnes_acc_train, diana_acc_train),
  acc_test = c(kmeans_acc_test, pam_acc_test,
               agnes_acc_test, diana_acc_test),
  F1_test = c(kmeans_f1, pam_f1, agnes_f1, diana_f1),
  sensitivity_test = c(kmeans_sensitivity, pam_sensitivity,
                       agnes_sensitivity, diana_sensitivity),
  Precision_test = c(kmeans_precision, pam_precision,
                     agnes_precision, diana_precision)
)

rownames(results.init) <- c('K-Means', 'PAM', 'AGNES', 'DIANA')