<h1>Unsupervised learning using K-means</h1>
Imports used throughout the notebook.

In [None]:
import tensorflow as tf

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

import numpy as np
from tensorflow.keras.applications import VGG16
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE
from sklearn.metrics import adjusted_rand_score
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score




import seaborn as sns
import pickle
import os


In [None]:
# Load the VGG16 network with ImageNet pre-trained weights.
# include_top=True keeps the fully connected head, so the 'fc2' layer that the
# feature-extraction cells request by name is part of the model.

base_model = VGG16(weights='imagenet', include_top=True)


Import the feature-extraction, path-discovery and PCA helpers from the local `utilities` module.

In [None]:
from utilities import VGG16_features, path_discovery, pca_extraction

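Extract VGG16 `fc2` features for every image in `images_dataset/` and cache them, together with the image paths, labels and visited directories, in `features_labels.pkl`.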

In [None]:
# Example usage
#features = VGG16_features('images_dataset/apple fruit/image_26.jpg', base_model, layer_name='block4_conv3',visualize=False)
#print("Extracted Features Shape:", features.shape)

#Existing layers are: ['input_1', 'block1_conv1-2', 'block1_pool', 'block2_conv1-2', 'block2_pool',
#'block3_conv1-2-3', 'block3_pool', 'block4_conv1-2-3', 'block4_pool', 'block5_conv1-2-3', 'block5_pool', 'flatten', 'fc1', 'fc2', 'predictions'].

# Walk the training folder; the returned object exposes img_paths, labels and
# dirs_visited (see utilities.path_discovery for their exact contents).
res = path_discovery('images_dataset/')

# Run every discovered image through VGG16_features using the 'fc2' layer.
extracted_feature_list = []
for img_path in res.img_paths:
    features = VGG16_features(img_path, base_model, layer_name='fc2', visualize=False)
    extracted_feature_list.append(features)

# Convert everything to arrays before caching. The labels in particular must be
# an ndarray: later cells rely on element-wise comparisons such as
# `labels == i+1`, which would collapse to a single bool on a plain list.
extracted_feature_list = np.array(extracted_feature_list)
eval_image_names = np.array(res.img_paths)
labels = np.array(res.labels)
dirs_visited = np.array(res.dirs_visited)
print("features shape",extracted_feature_list.shape)

# Cache the results so the training cells can be re-run without repeating the
# (slow) feature extraction.
with open('features_labels.pkl', 'wb') as f:
    pickle.dump((extracted_feature_list, eval_image_names, labels, dirs_visited), f)

# The discovery result is no longer needed once everything is pickled.
del res

<h2>K-means model training with PCA-reduced features</h2>
(The true labels are used only to evaluate the accuracy of the clustering.)

In [None]:

# Reload the cached training features, image paths, labels and directories.
# NOTE: pickle must only be used on files this notebook wrote itself.
with open('features_labels.pkl', 'rb') as f:
    extracted_feature_list, eval_image_names, labels, dirs_visited = pickle.load(f)
# An ndarray is required for the element-wise comparison with the predictions below.
labels = np.asarray(labels)
print("array1 features",extracted_feature_list.shape)

# PCA-reduce the features. 0.8 is forwarded to pca_extraction -- presumably the
# fraction of variance to retain; confirm in utilities.
pca_object = pca_extraction(extracted_feature_list, 0.8)

#feature reduction 2D using tsne (for visualization only)
tsne = TSNE(n_components=2)
tsne_result = tsne.fit_transform(pca_object.pca_result)

#Kmeans clustering using PCA reduced features
Kmeans = KMeans(n_clusters=9, init='k-means++')
Kmeans.fit(pca_object.pca_result)
Kmeans_labels = Kmeans.labels_

#  Visualize the clustering results
plt.figure(figsize=(20, 5))
plt.subplot(1,2,1)
plt.scatter(tsne_result[:, 0], tsne_result[:, 1], c=Kmeans_labels, cmap='tab20c',s=9)
plt.title('t-SNE Visualization of Clusters')

#  Compare with true labels
plt.subplot(1,2,2)
plt.scatter(tsne_result[:, 0], tsne_result[:, 1], c=labels, cmap='tab20c',s=9)
plt.title('t-SNE Visualization with True Labels')
plt.show()

# Clustering agreement that does not depend on how the clusters are numbered.
# It is kept in its own variable and reported, so the accuracy computed further
# down no longer silently overwrites it.
ari = adjusted_rand_score(labels, Kmeans_labels)
print(f"Adjusted Rand index: {ari}")

#confusion matrix
# Rows/columns are indexed by the sorted union of the values found in `labels`
# and `Kmeans_labels`; the mapping below relies on that index being equal to the
# label value (true while the union is 0..N without gaps).
conf_matrix = confusion_matrix(labels, Kmeans_labels)

#creating a dictionary that maps the "cluster labels" number to the most likely correct "true labels" number
# linear_sum_assignment minimises cost, hence the negated matrix to maximise
# the number of samples on the matched (true label, cluster) pairs.
row_ind, col_ind = linear_sum_assignment(-conf_matrix)
label_mapping = dict(zip(col_ind, row_ind))
mapped_Kmeans_labels = np.array([label_mapping[label] for label in Kmeans_labels])
remapped_conf_matrix = confusion_matrix(labels, mapped_Kmeans_labels)

#how often the positive predictions are correct?
precision = precision_score(labels, mapped_Kmeans_labels, average='weighted')
# can an ML model find all instances of the positive class?
recall = recall_score(labels, mapped_Kmeans_labels, average='weighted')
#how often the model is right? (if the class numbers are unbalanced between TP and TN, the accuracy will fail)
accuracy = accuracy_score(labels, mapped_Kmeans_labels)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")

#retrieve class names from the path discovery at beginning
dirs_basename = [os.path.basename(d) for d in dirs_visited]
# The first visited directory is skipped -- presumably the dataset root rather
# than a class folder; confirm against path_discovery.
class_names = dirs_basename[1:]

# Visualize the remapped confusion matrix using a heatmap
sns.heatmap(remapped_conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('Cluster Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# List the images whose remapped cluster differs from their true label.
missed_indices = np.where(labels != mapped_Kmeans_labels)[0]
missed_images = eval_image_names[missed_indices]

print("Misclassified Images:")
for img in missed_images:
    print(img)

# Free the large arrays; the guards keep the cell re-runnable when a name has
# already been deleted or was never created.
if 'features' in globals():
    del features
if 'extracted_feature_list' in globals():
    del extracted_feature_list

<h2>K-means model evaluation</h2>

In [None]:
# Evaluate the trained K-means model on a held-out image folder.
# Requires `pca_object`, `Kmeans` and `label_mapping` from the K-means training
# cell. NOTE: the DBSCAN cell rebinds `pca_object`, so re-run the K-means
# training cell first if the DBSCAN cell has been executed in between.
img_dir2='eval_dataset/'
res = path_discovery(img_dir2)

eval_features = []
for img_path in res.img_paths:
    features = VGG16_features(img_path, base_model, layer_name='fc2', visualize=False)
    eval_features.append(features)

eval_features = np.array(eval_features)
# Only the file names are needed for the report printed at the end.
eval_image_names = [os.path.basename(p) for p in res.img_paths]

print('feature list shape', eval_features.shape)

#apply transformation matrix of PCA, previously calculated, to reduce feature vector size
eval_features = pca_object.pca.transform(eval_features)

print('reduced feature list shape', eval_features.shape)

pred_clusters = Kmeans.predict(eval_features)

#using the mapping dictionary created in the training phase
mapped_Kmeans_labels = np.array([label_mapping[label] for label in pred_clusters])

remapped_conf_matrix = confusion_matrix(res.labels, mapped_Kmeans_labels)

#how often the positive predictions are correct?
precision = precision_score(res.labels, mapped_Kmeans_labels, average='weighted')
# can an ML model find all instances of the positive class?
recall = recall_score(res.labels, mapped_Kmeans_labels, average='weighted')
#how often the model is right? (if the class numbers are unbalanced between TP and TN, the accuracy will fail)
accuracy = accuracy_score(res.labels, mapped_Kmeans_labels)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")

#retrieve class names from the path discovery at beginning
dirs_basename = [os.path.basename(d) for d in res.dirs_visited]
# The first visited directory is skipped -- presumably the dataset root rather
# than a class folder; confirm against path_discovery.
class_names = dirs_basename[1:]

# Visualize the remapped confusion matrix using a heatmap
sns.heatmap(remapped_conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('Cluster Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# Per-image report. A label value k is looked up as dirs_basename[k], i.e. the
# labels are used directly as indices into the visited-directory list.
for image_name, predicted_label in zip(eval_image_names, mapped_Kmeans_labels):
    print('real name:', image_name, '   predicted name:', dirs_basename[predicted_label])


del res,features,eval_features

<h2>SVM model training</h2>
(supervised method)

In [None]:
from sklearn.svm import SVC

# Reload the cached training features and labels.
# NOTE: pickle must only be used on files this notebook wrote itself.
with open('features_labels.pkl', 'rb') as f:
    extracted_feature_list, eval_image_names, labels, dirs_visited = pickle.load(f)
# `labels == k` must be an element-wise comparison; on a plain list it would
# evaluate to a single False and every classifier would get a scalar target.
labels = np.asarray(labels)

# Train one binary SVM per class (class k against all the others).
kernel = 'linear'
max_iterations = 500
n_classes = 9

# probability=True enables predict_proba, which the prediction cell uses to
# pick the most likely class for each sample.
models = [
    SVC(kernel=kernel, max_iter=max_iterations, probability=True)
    for _ in range(n_classes)
]

# Model at position i is trained to recognise label i+1 (labels start from 1).
for class_index, model in enumerate(models):
    model.fit(extracted_feature_list, labels == class_index + 1)


SVM model predictions on the evaluation data

In [None]:
# Classify the evaluation images with the per-class SVMs trained above.
# Requires `models` from the SVM training cell and `img_dir2` from the K-means
# evaluation cell.
res = path_discovery(img_dir2)

eval_features = []
for img_path in res.img_paths:
    features = VGG16_features(img_path, base_model, layer_name='fc2', visualize=False)
    eval_features.append(features)

eval_features = np.array(eval_features)
# Only the file names are needed for the report printed at the end.
eval_image_names = [os.path.basename(p) for p in res.img_paths]

# One row of scores per model: column 1 of predict_proba is the probability
# that the sample BELONGS to that model's class (column 0 is the probability
# that it does not).
predict_score = np.asarray(
    [model.predict_proba(eval_features)[:, 1] for model in models]
)
# The winning model's position, shifted by one because true labels start from 1.
predicted_SVM_labels = np.argmax(predict_score, axis=0) + 1

conf_matrix1 = confusion_matrix(res.labels, predicted_SVM_labels)
dirs_basename = [os.path.basename(d) for d in res.dirs_visited]
# The first visited directory is skipped -- presumably the dataset root rather
# than a class folder; confirm against path_discovery.
class_names = dirs_basename[1:]
sns.heatmap(conf_matrix1, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)

plt.xlabel('svm Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# Per-image report; a label value k is looked up as dirs_basename[k].
for image_name, predicted_label in zip(eval_image_names, predicted_SVM_labels):
    print('real name:', image_name, '   predicted name:', dirs_basename[predicted_label])

<h2>DBSCAN clustering on the t-SNE embedding of the PCA-reduced features</h2>

In [None]:
# Reload the cached training features, image paths, labels and directories.
# NOTE: pickle must only be used on files this notebook wrote itself.
with open('features_labels.pkl', 'rb') as f:
    extracted_feature_list, eval_image_names, labels, dirs_visited = pickle.load(f)

print("array1 features",extracted_feature_list.shape)

# PCA-reduce the features (0.7 is forwarded to pca_extraction -- presumably the
# fraction of variance to retain; confirm in utilities), then embed them in 2-D.
# NOTE: this rebinds the global `pca_object` that the K-means evaluation cell
# also uses.
pca_object = pca_extraction(extracted_feature_list, 0.7)
tsne = TSNE(n_components=2)
tsne_result = tsne.fit_transform(pca_object.pca_result)

# DBSCAN is fitted on the 2-D t-SNE embedding (not directly on the PCA output),
# so eps is expressed in t-SNE coordinates.
dbscan_min_samples = 5
dbscan = DBSCAN(eps=1.3, min_samples=dbscan_min_samples)
dbscan_labels = dbscan.fit_predict(tsne_result)

# k-distance plot used to pick eps. Querying the same points the index was
# fitted on returns each point as its own first neighbour (distance 0), so with
# n_neighbors == min_samples the LAST column is the distance to the 5th nearest
# point counting the point itself -- the neighbourhood size DBSCAN's
# min_samples refers to. (The previous code plotted column 2, which did not
# match the axis label.)
neighbors = NearestNeighbors(n_neighbors=dbscan_min_samples)
neighbors_fit = neighbors.fit(tsne_result)
distances, indices = neighbors_fit.kneighbors(tsne_result)

# Sort and plot the distances
distances = np.sort(distances[:, -1])
plt.plot(distances)
plt.xlabel('Points')
plt.ylabel('Distance to 5th Nearest Neighbor')
plt.title('Elbow Method for determining eps')
plt.show()

#  Visualize the clustering results
plt.figure(figsize=(20, 5))
plt.subplot(1,2,1)
plt.scatter(tsne_result[:, 0], tsne_result[:, 1], c=dbscan_labels, cmap='tab20c',s=9)
plt.title('t-SNE Visualization of Clusters')

#  Compare with true labels
plt.subplot(1,2,2)
plt.scatter(tsne_result[:, 0], tsne_result[:, 1], c=labels, cmap='tab20c',s=9)
plt.title('t-SNE Visualization with True Labels')
plt.show()


