In [1]:
import numpy as np
import pandas as pd
import sys

import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

from sklearn.metrics import silhouette_score
from sklearn.metrics import confusion_matrix

import pickle

In [2]:
# Fetch the 'mnist_784' data set (version 1) from OpenML as NumPy arrays
# (as_frame=False) instead of a pandas DataFrame.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# Store the labels back on the bunch as unsigned 8-bit integers so the later
# cells work with numeric digit labels.
mnist.target = mnist.target.astype(np.uint8)
# X: feature matrix, y: digit labels.
X, y = mnist.data, mnist.target

In [3]:
# Show a preview of the feature matrix under its banner (header and array are
# emitted on separate lines via sep).
print("\n---------DATA(X)--------", X, sep="\n")


---------DATA(X)--------
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [4]:
# Show a preview of the label vector under its banner (header and array are
# emitted on separate lines via sep).
print("\n-----DATA(y)-----", y, sep="\n")


-----DATA(y)-----
[5 0 4 ... 4 5 6]


In [5]:
# Sweep K-Means over k = 8..12. For every k, keep the predicted cluster ids
# (KMeans_lsit) and the silhouette score of that clustering (silhouette_list).
# NOTE(review): no random_state is passed to KMeans, so labels and scores can
# differ between runs -- consider fixing a seed for reproducibility.
KMeans_lsit=[]  # sic: the confusion-matrix cell reads this name (index 2 is k=10)
silhouette_list = []

for k in range (8,13):
    print(f"\n\\\\\\\\\\\\\\\\\\\\\\\\\\K={k}//////////////")
    kmeans = KMeans(n_clusters=k)
    y_pred = kmeans.fit_predict(X)
    # Scoring the whole data set is expensive, so evaluate the silhouette once
    # per k and reuse the value for both the stored list and the printout.
    silhouette = silhouette_score(X, kmeans.labels_)
    silhouette_list.append(silhouette)
    KMeans_lsit.append(y_pred)
    print("------LABELS-----")
    print(kmeans.labels_)
    print("\n-----PREDICT-----")
    print(y_pred)
    print("\n----CLUSTER_CENTERS----")
    print(kmeans.cluster_centers_)
    print("\n---SILHOUETTE---")
    print(silhouette)


\\\\\\\\\\\\\K=8//////////////
------LABELS-----
[1 2 3 ... 6 1 7]

-----PREDICT-----
[1 2 3 ... 6 1 7]

----CLUSTER_CENTERS----
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

---SILHOUETTE---
0.0733936212947435

\\\\\\\\\\\\\K=9//////////////
------LABELS-----
[4 0 1 ... 6 4 8]

-----PREDICT-----
[4 0 1 ... 6 4 8]

----CLUSTER_CENTERS----
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

---SILHOUETTE---
0.05676790015204366

\\\\\\\\\\\\\K=10//////////////
------LABELS-----
[7 8 3 ... 9 4 2]

-----PREDICT-----
[7 8 3 ... 9 4 2]

----CLUSTER_CENTERS----
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

---SILHOUETTE---
0.05872938650053095

\\\\\\\\\\\\\K=11//////////

In [6]:
# Persist the silhouette scores collected by the K-Means sweep, then read the
# file straight back as a round-trip check and print what was stored.
file_kmeans_sil_name = "kmeans_sil.pkl"

# `with` guarantees the handle is closed even if dump/load raises, which a
# manual open()/close() pair does not.
with open(file_kmeans_sil_name, "wb") as open_file:
    pickle.dump(silhouette_list, open_file)

with open(file_kmeans_sil_name, "rb") as open_file:
    loaded_list = pickle.load(open_file)

print('\n--------------------------------------------------EXERCICE 2---------------------------------------------')
print(loaded_list)


--------------------------------------------------EXERCICE 2---------------------------------------------
[0.0733936212947435, 0.05676790015204366, 0.05872938650053095, 0.05756615706981847, 0.05816377180735471]


In [7]:
# Confusion matrix of the true labels against the k=10 clustering;
# KMeans_lsit[2] is the k=10 run because the sweep starts at k=8.
KMeans_10_conf_m = confusion_matrix(y, KMeans_lsit[2])

# For every matrix row, the column index holding the largest count.
KMeans_10_conf_m_max_index_list = list(np.argmax(KMeans_10_conf_m, axis=1))

print("\n--------------CONFUSION_MATRIX_K=10---------------------INDEX----MAX")
# One line per row: the row itself, its index, and the index of its maximum.
for i, (arr, row_argmax) in enumerate(zip(KMeans_10_conf_m, KMeans_10_conf_m_max_index_list)):
    print(arr,"     ", i,"     ", row_argmax)

# Distinct argmax indices in ascending order (duplicates collapsed).
KMeans_10_conf_m_max_index_list_sorted = np.unique(KMeans_10_conf_m_max_index_list)


--------------CONFUSION_MATRIX_K=10---------------------INDEX----MAX
[   9    2  173   43 1260    4  290   74 5041    7]       0       8
[  10 4293    7    6    7 3527    8    9    0   10]       1       1
[4864  428  151  215  239  433  320  206   56   78]       2       0
[ 206  458   33  186  508   53 4598 1032   21   46]       3       6
[  30  182  170 3735  256  231    0   19    9 2192]       4       3
[   7  166   70  425 1847  266 2126 1132   60  214]       5       6
[  56  205 4441   68 1926   44   39   17   76    4]       6       2
[  53  377    4 2086   12  310    5   20   21 4405]       7       9
[  52  336   54  208  340  315 1177 4120   37  186]       8       7
[  20  267   16 3456   30   92   86   88   50 2853]       9       3


In [8]:
# Persist the sorted, de-duplicated argmax indices of the k=10 confusion
# matrix, then read the file straight back as a round-trip check.
file_kmeans_argmax_name = "kmeans_argmax.pkl"

# `with` guarantees the handle is closed even if dump/load raises, which a
# manual open()/close() pair does not.
with open(file_kmeans_argmax_name, "wb") as open_file:
    pickle.dump(KMeans_10_conf_m_max_index_list_sorted, open_file)

with open(file_kmeans_argmax_name, "rb") as open_file:
    loaded_list = pickle.load(open_file)

print('\n---EXERCICE 5----')
print(loaded_list)


-----EXERCICE 5----
[0 1 2 3 6 7 8 9]


In [10]:
# Euclidean distance from each of the first 300 samples to every sample that
# comes after it in X (index pairs i < j), gathered in one flat list ordered by
# i, then j.
# Despite the name, `min_dis_list` holds ALL of these distances; the following
# cell sorts them and keeps only the smallest ones.
min_dis_list = [
    np.linalg.norm(X[i] - X[j])
    for i in range(len(X[:300]))
    for j in range(i + 1, len(X))
]

In [20]:
# Keep the ten smallest pairwise distances (ascending), persist them, then
# read the file straight back as a round-trip check.
min_dis_list_sorted = np.sort(min_dis_list)[:10]
file_dist_name = "dist.pkl"

# `with` guarantees the handle is closed even if dump/load raises, which a
# manual open()/close() pair does not.
with open(file_dist_name, "wb") as open_file:
    pickle.dump(min_dis_list_sorted, open_file)

with open(file_dist_name, "rb") as open_file:
    loaded_list = pickle.load(open_file)

print('\n-------------------------EXERCICE 6-------------------------------')
print(loaded_list)


-------------------------EXERCICE 6-------------------------------
[279.26152617 304.3764117  317.5893575  328.76587414 333.45464459
 352.89800226 355.17742045 358.0740147  359.64287842 360.42474943]


In [21]:
# Base radius: the mean of the three smallest pairwise distances.
s = (min_dis_list_sorted[0] + min_dis_list_sorted[1] + min_dis_list_sorted[2]) / 3

# Candidate eps values start at s and grow in steps of 4% of s for as long as
# they stay within s + 10% of s.
eps_step = 0.04 * s
eps_upper_bound = s + 0.1 * s

eps_list = []
index = s
while index <= eps_upper_bound:
    eps_list.append(index)
    index = index + eps_step

print('\n-----------------------EPSILON LIST------------------------')
print(eps_list)


--------------------EPSILON LIST---------------------
[300.40909845916684, 312.4254623975335, 324.4418263359002]


In [24]:
# Run DBSCAN on X once per candidate eps and record, for each run, the sorted
# set of distinct labels it assigned.
dbscan_labels_list = []
for eps in eps_list:
    dbscan = DBSCAN(eps=eps).fit(X)
    print(f"\n----DBSCAN_EPS={eps}----")
    print(dbscan.labels_)
    # np.unique returns the distinct labels in ascending order.
    distinct_labels = np.unique(dbscan.labels_)
    dbscan_labels_list.append(distinct_labels)


----DBSCAN_EPS=300.40909845916684----
[-1 -1 -1 ... -1 -1 -1]

----DBSCAN_EPS=312.4254623975335----
[-1 -1 -1 ... -1 -1 -1]

----DBSCAN_EPS=324.4418263359002----
[-1 -1 -1 ... -1 -1 -1]


In [27]:
# Number of distinct labels DBSCAN produced for each eps (every label value
# present in labels_ is counted), persisted and read straight back as a
# round-trip check.
dbscan_labels_list_len = [len(labels) for labels in dbscan_labels_list]

file_dbscan_len_name = "dbscan_len.pkl"

# `with` guarantees the handle is closed even if dump/load raises, which a
# manual open()/close() pair does not.
with open(file_dbscan_len_name, "wb") as open_file:
    pickle.dump(dbscan_labels_list_len, open_file)

with open(file_dbscan_len_name, "rb") as open_file:
    loaded_list = pickle.load(open_file)

print('\n-------------------------EXERCICE 8-------------------------------')
print(loaded_list)


-------------------------EXERCICE 8-------------------------------
[4, 7, 22]


In [48]:
# Final sanity check: re-read every pickle written by this notebook and print
# its content under a banner naming the file.
print('\n---------------------------------------------CHECKING FILES---------------------------------------------')
for banner, pickle_name in (
    ('\n---------------------------------------------kmeans_sil.pkl---------------------------------------------', "kmeans_sil.pkl"),
    ('\n---------------------------------------------kmeans_argmax.pkl--------------------------------------------', "kmeans_argmax.pkl"),
    ('\n-------------------------------------------------dist.pkl-------------------------------------------------', "dist.pkl"),
    ('\n----------------------------------------------dbscan_len.pkl----------------------------------------------', "dbscan_len.pkl"),
):
    print(banner)
    print(pd.read_pickle(pickle_name))


---------------------------------------------CHECKING FILES---------------------------------------------

---------------------------------------------kmeans_sil.pkl---------------------------------------------
[0.0733936212947435, 0.05676790015204366, 0.05872938650053095, 0.05756615706981847, 0.05816377180735471]

---------------------------------------------kmeans_argmax.pkl--------------------------------------------
[0 1 2 3 6 7 8 9]

-------------------------------------------------dist.pkl-------------------------------------------------
[279.26152617 304.3764117  317.5893575  328.76587414 333.45464459
 352.89800226 355.17742045 358.0740147  359.64287842 360.42474943]

----------------------------------------------dbscan_len.pkl----------------------------------------------
[4, 7, 22]
