In [None]:
import torch
import torch.nn as nn
import importlib

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
def fit_pca(img, pca):
    """Fit ``pca`` on ``img`` and return the transformed data.

    Args:
        img: Data passed unchanged to ``pca.fit_transform``.
        pca: Object exposing ``fit_transform``; the notebook passes a
            scikit-learn ``PCA`` and reads its fitted attributes afterwards.

    Returns:
        Whatever ``pca.fit_transform(img)`` returns.
    """
    projected = pca.fit_transform(img)
    return projected

def kmeans_clustering(number_clusters, random_state):
    """Create an unfitted ``KMeans`` estimator.

    Args:
        number_clusters: Forwarded as ``n_clusters``.
        random_state: Forwarded as ``random_state``.

    Returns:
        The newly constructed ``KMeans`` instance; fitting is left to the caller.
    """
    estimator = KMeans(
        n_clusters=number_clusters,
        random_state=random_state,
    )
    return estimator

def dbscan_clustering(min_samples=5, eps=0.5):
    """Create an unfitted ``DBSCAN`` estimator.

    Args:
        min_samples: Forwarded as ``min_samples``.
        eps: Neighbourhood radius forwarded as ``eps``. The default of 0.5
            equals scikit-learn's own default, so existing calls behave as
            before; it is exposed because it needs tuning to the scale of
            the data being clustered.

    Returns:
        The newly constructed ``DBSCAN`` instance; fitting is left to the caller.
    """
    return DBSCAN(eps=eps, min_samples=min_samples)

def normalization(data):
    """Standardize ``data`` with a freshly created ``StandardScaler``.

    Args:
        data: Input passed unchanged to ``StandardScaler.fit_transform``.

    Returns:
        The result of ``fit_transform``. The scaler itself is discarded, so
        the fitted statistics are not available to the caller.
    """
    return StandardScaler().fit_transform(data)

def tsne_visualization(n_components=2, perplexity=30.0, learning_rate='pca', verbose=1,
                       random_state=None):
    """Create an unfitted ``TSNE`` estimator.

    Args:
        n_components: Forwarded as ``n_components``.
        perplexity: Forwarded as ``perplexity``.
        learning_rate: Forwarded as ``learning_rate`` when a usable value is
            given. The historical default ``'pca'`` (and ``None``) means
            "use scikit-learn's own default": ``'pca'`` is a value for TSNE's
            ``init`` option, not a learning rate, so it is never forwarded.
        verbose: Forwarded as ``verbose``.
        random_state: Forwarded as ``random_state``; pass an int for a
            reproducible embedding.

    Returns:
        The newly constructed ``TSNE`` instance; fitting is left to the caller.
    """
    options = {
        'n_components': n_components,
        'perplexity': perplexity,
        'verbose': verbose,
        'random_state': random_state,
    }
    # Previously this argument was silently dropped. Honour explicit values,
    # but keep the default call identical to the old behaviour.
    if learning_rate is not None and learning_rate != 'pca':
        options['learning_rate'] = learning_rate
    return TSNE(**options)

def clustering_result(kmeans):
    """Count how many samples were assigned to each cluster label.

    Args:
        kmeans: Iterable of per-sample cluster labels (despite the name, this
            is the label sequence, not an estimator).

    Returns:
        dict mapping ``f'{label}'`` (the label formatted as a string) to the
        number of times it occurs, with keys in order of first appearance.
        An empty input yields an empty dict.
    """
    image_cluster_dict = {}
    for label in kmeans:
        key = f'{label}'
        image_cluster_dict[key] = image_cluster_dict.get(key, 0) + 1
    return image_cluster_dict

In [None]:
# Load a pretrained ResNet-18 to use as a fixed feature extractor.
mod = importlib.import_module('torchvision.models')
# NOTE(review): recent torchvision releases deprecate `pretrained=True` in
# favour of `weights=...`; kept as-is because the installed version is unknown.
model = mod.resnet18(pretrained=True)
# Replace the classification head with a pass-through so the forward pass
# returns the backbone features instead of class logits.
model.fc = nn.Identity()
# Inference mode: BatchNorm must use its stored running statistics. In the
# default training mode each image's features would depend on which other
# images share its batch, and the running statistics would be modified.
model.eval()

In [None]:
def normalize(data):
    """Min-max scale a tensor into the range [0, 1].

    The minimum and maximum are taken over the whole tensor, not per channel.

    Args:
        data: Non-empty ``torch.Tensor``.

    Returns:
        Tensor of the same shape, ``(data - min) / (max - min)``. When every
        element is equal (range of zero) the result is all zeros instead of
        the NaNs a plain division would produce.
    """
    min_value = torch.min(data)
    max_value = torch.max(data)
    value_range = max_value - min_value
    # For a constant tensor the numerator is already zero everywhere, so
    # dividing by 1 instead of 0 gives zeros and avoids 0/0 = NaN.
    safe_range = torch.where(value_range == 0, torch.ones_like(value_range), value_range)
    return (data - min_value) / safe_range

In [None]:
import os

# Root directory of the dataset; its sub-directories are scanned for images.
path = 'D:/kaggle-Fruit-Dataset'

# os.listdir returns entries in arbitrary order, so sort to make the
# `[:3]` selection below pick the same folders on every run. Plain files
# sitting next to the folders are skipped because they cannot be listed.
folder_lists = [
    os.path.join(path, folder)
    for folder in sorted(os.listdir(path))
    if os.path.isdir(os.path.join(path, folder))
]

# Collect the file paths from the first three folders only.
image_lists = []
for folder in folder_lists[:3]:
    image_lists.extend(
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if os.path.isfile(os.path.join(folder, name))
    )

In [None]:
from PIL import Image
import numpy as np

# Decode every collected file into an RGB NumPy array, in image_lists order.
data_lists = []
for image_path in image_lists:
    # The context manager closes the underlying file as soon as the pixels
    # have been copied out, instead of leaving that to garbage collection.
    with Image.open(image_path) as image_file:
        data_lists.append(np.array(image_file.convert('RGB')))


In [None]:
# Sanity check: number of image paths gathered in image_lists.
print(len(image_lists))

In [None]:
batch_size = 16
result = []  # one feature vector (NumPy array) per image, in data_lists order
batch = []   # tensors waiting to be sent through the model together

# Pure inference: eval() makes BatchNorm use its stored statistics so the
# features do not depend on batch composition, and no_grad() stops autograd
# from recording a graph that would only waste memory here.
model.eval()
with torch.no_grad():
    for idx, data in enumerate(data_lists):
        tmp = torch.as_tensor(data, dtype=torch.float32)
        tmp = normalize(tmp)
        # Move the last axis to the front (channels-first layout for the model).
        batch.append(tmp.permute(2, 0, 1))
        # Flush when the batch is full or the final image has been queued.
        if len(batch) == batch_size or idx == len(data_lists) - 1:
            # torch.stack requires every image in the batch to share one shape.
            features = model(torch.stack(batch, dim=0))
            result.extend(features.cpu().numpy())
            batch = []

In [None]:
# Combine the per-image feature vectors along a new leading axis
# (np.stack's default axis is 0) and report the resulting shape.
result = np.stack(result)
print(result.shape)

In [None]:
# Reduce the feature matrix with PCA, then standardize the components.
# PCA cannot extract more components than min(n_samples, n_features), so the
# requested 200 is clamped to keep small datasets from raising.
n_components = min(200, *result.shape)
# A fixed seed keeps the projection reproducible if a randomized solver is chosen.
pca = PCA(n_components=n_components, random_state=1)
pca_result = fit_pca(result, pca)
# Fraction of the total variance retained by the kept components.
print(pca.explained_variance_ratio_.sum())
norm_result = normalization(pca_result)

In [None]:
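# Optional elbow-method scan (disabled): plots K-Means inertia for 1 to 49 clusters to help choose the cluster count.
#import matplotlib.pyplot as plt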
#distortions = []
#for i in range(1, 50):
#    kmeans = kmeans_clustering(number_clusters=i, random_state=1)
#    kmeans.fit(norm_result)
#    distortions.append(kmeans.inertia_) 
#plt.plot(range(1, 50), distortions, marker='o')
#plt.xlabel('Number of clusters')
#plt.ylabel('Distortion')
#plt.show()

In [None]:
# Cluster the standardized features into three groups with a fixed seed.
kmeans_options = {'number_clusters': 3, 'random_state': 1}
kmeans = kmeans_clustering(**kmeans_options)
clustering = kmeans.fit_predict(norm_result)

In [None]:
# Summarise how many samples landed in each cluster (keys are the labels as strings).
cluster_dict = clustering_result(clustering)
print(cluster_dict)

In [None]:
# Embed the standardized features in 2-D for plotting. t-SNE is stochastic,
# so fix the seed (the same value used for K-Means) to make the embedding
# repeatable across runs. set_params returns the estimator itself.
tsne = tsne_visualization().set_params(random_state=1)
tsne_result = tsne.fit_transform(norm_result)

In [None]:
import seaborn as sns

# Plot the 2-D t-SNE embedding, colouring each point by its K-Means label.
ax = sns.scatterplot(x=tsne_result[:, 0], y=tsne_result[:, 1], hue=clustering)
# Title and axis labels so the figure can be read on its own; the trailing
# semicolon keeps the return value's repr out of the cell output.
ax.set(title='t-SNE projection coloured by K-Means cluster', xlabel='t-SNE 1', ylabel='t-SNE 2');