In [None]:
import torch
import torch.nn as nn
import importlib
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
import seaborn as sns
def fit_pca(img, pca):
    """Fit ``pca`` on ``img`` and return the transformed data.

    Args:
        img: Input handed unchanged to ``pca.fit_transform``.
        pca: Object exposing ``fit_transform`` (the notebook passes a
            ``PCA`` instance). It is fitted by this call.

    Returns:
        Whatever ``pca.fit_transform(img)`` returns.
    """
    projected = pca.fit_transform(img)
    return projected

def kmeans_clustering(number_clusters, random_state):
    """Create an unfitted ``KMeans`` estimator.

    Args:
        number_clusters: Forwarded to ``KMeans`` as ``n_clusters``.
        random_state: Forwarded to ``KMeans`` as ``random_state``.

    Returns:
        A new ``KMeans`` instance; every other setting keeps the library
        default.
    """
    settings = {"n_clusters": number_clusters, "random_state": random_state}
    return KMeans(**settings)

def dbscan_clustering(min_samples=5):
    """Create an unfitted ``DBSCAN`` estimator.

    Args:
        min_samples: Forwarded to ``DBSCAN`` as ``min_samples``.

    Returns:
        A new ``DBSCAN`` instance; only ``min_samples`` is set here, all
        other settings keep the library defaults.
    """
    estimator = DBSCAN(min_samples=min_samples)
    return estimator

def normalization(data):
    """Standardize ``data`` with a freshly created ``StandardScaler``.

    The scaler is built with default settings, fitted on ``data`` and
    discarded when the call returns, so its fitted statistics cannot be
    reused on other data afterwards.

    Args:
        data: Input handed unchanged to ``StandardScaler.fit_transform``.

    Returns:
        The result of ``StandardScaler().fit_transform(data)``.
    """
    return StandardScaler().fit_transform(data)

def tsne_visualization(n_components=2, perplexity=30.0, learning_rate='pca', verbose=1):
    """Create an unfitted ``TSNE`` estimator.

    Args:
        n_components: Forwarded to ``TSNE`` as ``n_components``.
        perplexity: Forwarded to ``TSNE`` as ``perplexity``.
        learning_rate: A number or ``'auto'`` is forwarded to ``TSNE``.
            The default ``'pca'`` is kept only for backward compatibility:
            it is an ``init`` option, not a learning rate, so it (like
            ``None``) is not forwarded and ``TSNE`` uses its own default.
        verbose: Forwarded to ``TSNE`` as ``verbose``.

    Returns:
        A new ``TSNE`` instance; fitting is left to the caller.
    """
    tsne_kwargs = {
        'n_components': n_components,
        'perplexity': perplexity,
        'verbose': verbose,
    }
    # Previously this argument was accepted but silently dropped. Forward it
    # whenever the caller supplied a real value; skip the legacy placeholder,
    # which TSNE would reject as an invalid learning rate.
    is_placeholder = learning_rate is None or (
        isinstance(learning_rate, str) and learning_rate == 'pca'
    )
    if not is_placeholder:
        tsne_kwargs['learning_rate'] = learning_rate
    return TSNE(**tsne_kwargs)

def clustering_result(kmeans):
    """Count how many items carry each cluster label.

    Args:
        kmeans: Iterable of labels. Despite the name, the notebook passes
            the label array returned by ``fit_predict``, not an estimator.
            One-shot iterators are supported because the input is walked
            only once.

    Returns:
        dict mapping each label, formatted as a string, to its number of
        occurrences. Keys appear in order of first occurrence.
    """
    image_cluster_dict = {}
    for label in kmeans:
        key = f'{label}'
        # Single pass: initialise on first sight, increment otherwise.
        image_cluster_dict[key] = image_cluster_dict.get(key, 0) + 1
    return image_cluster_dict

In [None]:
import numpy as np
# Load the train and test splits. `.item()` extracts the single Python object
# stored in each array (train_data is indexed by 'image' and 'label' below).
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only point this at files from a trusted source.
train_data, test_data = (
    np.load(npy_file, allow_pickle=True).item()
    for npy_file in ("train.npy", "test.npy")
)

In [None]:
# Copy the training images and their labels into numpy arrays.
data, label = (np.array(train_data[field]) for field in ('image', 'label'))

In [None]:
# Reduce the training data to 500 principal components, then standardize
# each component before clustering.
# random_state pins the randomized SVD solver that PCA may select for large
# inputs, so a Restart & Run All reproduces the same projection (same seed as
# the KMeans cell).
pca = PCA(n_components=500, random_state=1)
pca_result = fit_pca(data, pca)
# Share of the total variance retained by the kept components; ndarray.sum
# replaces the element-by-element builtin sum().
print(pca.explained_variance_ratio_.sum())
norm_result = normalization(pca_result)

In [None]:
# Split the standardized PCA features into two clusters with a fixed seed.
kmeans = kmeans_clustering(random_state=1, number_clusters=2)
clustering = kmeans.fit_predict(norm_result)

# Show the cluster sizes first, then the per-sample cluster assignments.
cluster_dict = clustering_result(clustering)
print(cluster_dict, clustering, sep="\n")

In [None]:
# Embed the standardized PCA features with t-SNE for plotting.
# t-SNE is stochastic, so the seed is fixed on the estimator to make the
# embedding (and the plot built from it) reproducible across reruns.
tsne = tsne_visualization().set_params(random_state=1)
tsne_result = tsne.fit_transform(norm_result)

In [None]:
# Plot the first two t-SNE coordinates, colouring each point by its KMeans
# cluster assignment.
tsne_x, tsne_y = tsne_result[:, 0], tsne_result[:, 1]
sns.scatterplot(x=tsne_x, y=tsne_y, hue=clustering)