# Clustering

In [1]:
import openml
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, OPTICS
from sklearn_som.som import SOM
from sklearn.metrics import f1_score, silhouette_score
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.preprocessing import LabelEncoder
from joblib import Parallel, delayed
import time

### Iris Dataset

In [2]:
iris = openml.datasets.get_dataset(61)

In [3]:
iris_x, iris_y, iris_cat_ind, iris_att_names = iris.get_data(
    target=iris.default_target_attribute, dataset_format="dataframe")

In [4]:
iris_att_names, iris_cat_ind

(['sepallength', 'sepalwidth', 'petallength', 'petalwidth'],
 [False, False, False, False])

In [5]:
iris_y.dtype

CategoricalDtype(categories=['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], ordered=True)

In [6]:
iris_x.isnull().sum(), iris_y.isnull().sum()

(sepallength    0
 sepalwidth     0
 petallength    0
 petalwidth     0
 dtype: int64,
 0)

In [7]:
iris_x_nor = (iris_x-iris_x.min())/(iris_x.max()-iris_x.min())

In [8]:
iris_y_num = LabelEncoder().fit_transform(iris_y)

### Wine Dataset

In [9]:
wine = openml.datasets.get_dataset(187)

In [10]:
wine_x, wine_y, wine_cat_ind, wine_att_names = wine.get_data(
    target=wine.default_target_attribute, dataset_format="dataframe")

In [11]:
wine_att_names, wine_cat_ind

(['Alcohol',
  'Malic_acid',
  'Ash',
  'Alcalinity_of_ash',
  'Magnesium',
  'Total_phenols',
  'Flavanoids',
  'Nonflavanoid_phenols',
  'Proanthocyanins',
  'Color_intensity',
  'Hue',
  'OD280%2FOD315_of_diluted_wines',
  'Proline'],
 [False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False])

In [12]:
wine_y.dtype

CategoricalDtype(categories=['1', '2', '3'], ordered=True)

In [13]:
wine_x.isnull().sum(), wine_y.isnull().sum()

(Alcohol                           0
 Malic_acid                        0
 Ash                               0
 Alcalinity_of_ash                 0
 Magnesium                         0
 Total_phenols                     0
 Flavanoids                        0
 Nonflavanoid_phenols              0
 Proanthocyanins                   0
 Color_intensity                   0
 Hue                               0
 OD280%2FOD315_of_diluted_wines    0
 Proline                           0
 dtype: int64,
 0)

In [14]:
wine_x_nor = (wine_x-wine_x.min())/(wine_x.max()-wine_x.min())
wine_y_num = LabelEncoder().fit_transform(wine_y)

### Glass Dataset

In [15]:
glass = openml.datasets.get_dataset(41)
glass_x, glass_y, glass_cat_ind, glass_att_names = glass.get_data(
    target = glass.default_target_attribute, dataset_format="dataframe")
print(glass_att_names, glass_cat_ind)
print(glass_y.dtype)
print(glass_x.isnull().sum(), glass_y.isnull().sum())

['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'] [False, False, False, False, False, False, False, False, False]
category
RI    0
Na    0
Mg    0
Al    0
Si    0
K     0
Ca    0
Ba    0
Fe    0
dtype: int64 0


In [16]:
glass_x_nor = (glass_x-glass_x.min())/(glass_x.max()-glass_x.min())
glass_y_num = LabelEncoder().fit_transform(glass_y)

### Haberman Dataset

In [17]:
hman = openml.datasets.get_dataset(43)
hman_x, hman_y, hman_cat_ind, hman_att_names = hman.get_data(
    target = hman.default_target_attribute, dataset_format="dataframe")
print(hman_att_names, hman_cat_ind)
print(hman_y.dtype)
print(hman_x.isnull().sum(), hman_y.isnull().sum())

['Age_of_patient_at_time_of_operation', 'Patients_year_of_operation', 'Number_of_positive_axillary_nodes_detected'] [False, True, False]
category
Age_of_patient_at_time_of_operation           0
Patients_year_of_operation                    0
Number_of_positive_axillary_nodes_detected    0
dtype: int64 0


In [18]:
col = ['Age_of_patient_at_time_of_operation', 'Number_of_positive_axillary_nodes_detected']
hman_x_nor = (hman_x[col]-hman_x[col].min())/(hman_x[col].max()-hman_x[col].min())

In [19]:
hman_x_nor['Patients_year_of_operation'] = LabelEncoder().fit_transform(hman_x['Patients_year_of_operation'])

In [20]:
hman_y_num = LabelEncoder().fit_transform(hman_y)

### Libras_move Dataset

In [21]:
lm = openml.datasets.get_dataset(299)
lm_x, lm_y, lm_cat_ind, lm_att_names = lm.get_data(
    target = lm.default_target_attribute, dataset_format="dataframe")
print(lm_att_names, lm_cat_ind)
print(lm_y.dtype)
print(lm_x.isnull().sum().sum())
print(lm_y.isnull().sum())

['xcoord1', 'ycoord1', 'xcoord2', 'ycoord2', 'xcoord3', 'ycoord3', 'xcoord4', 'ycoord4', 'xcoord5', 'ycoord5', 'xcoord6', 'ycoord6', 'xcoord7', 'ycoord7', 'xcoord8', 'ycoord8', 'xcoord9', 'ycoord9', 'xcoord10', 'ycoord10', 'xcoord11', 'ycoord11', 'xcoord12', 'ycoord12', 'xcoord13', 'ycoord13', 'xcoord14', 'ycoord14', 'xcoord15', 'ycoord15', 'xcoord16', 'ycoord16', 'xcoord17', 'ycoord17', 'xcoord18', 'ycoord18', 'xcoord19', 'ycoord19', 'xcoord20', 'ycoord20', 'xcoord21', 'ycoord21', 'xcoord22', 'ycoord22', 'xcoord23', 'ycoord23', 'xcoord24', 'ycoord24', 'xcoord25', 'ycoord25', 'xcoord26', 'ycoord26', 'xcoord27', 'ycoord27', 'xcoord28', 'ycoord28', 'xcoord29', 'ycoord29', 'xcoord30', 'ycoord30', 'xcoord31', 'ycoord31', 'xcoord32', 'ycoord32', 'xcoord33', 'ycoord33', 'xcoord34', 'ycoord34', 'xcoord35', 'ycoord35', 'xcoord36', 'ycoord36', 'xcoord37', 'ycoord37', 'xcoord38', 'ycoord38', 'xcoord39', 'ycoord39', 'xcoord40', 'ycoord40', 'xcoord41', 'ycoord41', 'xcoord42', 'ycoord42', 'xcoord43

In [22]:
lm_x_nor = (lm_x-lm_x.min())/(lm_x.max()-lm_x.min())
lm_y_num = LabelEncoder().fit_transform(lm_y)

### Satellite_image Dataset

In [23]:
simg = openml.datasets.get_dataset(294)
simg_x, simg_y, simg_cat_ind, simg_att_names = simg.get_data(
    target = simg.default_target_attribute, dataset_format="dataframe")
print(simg_att_names, simg_cat_ind)
print(simg_y.dtype)
print(simg_x.isnull().sum())
print(simg_y.isnull().sum())

['attr1', 'attr2', 'attr3', 'attr4', 'attr5', 'attr6', 'attr7', 'attr8', 'attr9', 'attr10', 'attr11', 'attr12', 'attr13', 'attr14', 'attr15', 'attr16', 'attr17', 'attr18', 'attr19', 'attr20', 'attr21', 'attr22', 'attr23', 'attr24', 'attr25', 'attr26', 'attr27', 'attr28', 'attr29', 'attr30', 'attr31', 'attr32', 'attr33', 'attr34', 'attr35', 'attr36'] [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
uint8
attr1     0
attr2     0
attr3     0
attr4     0
attr5     0
attr6     0
attr7     0
attr8     0
attr9     0
attr10    0
attr11    0
attr12    0
attr13    0
attr14    0
attr15    0
attr16    0
attr17    0
attr18    0
attr19    0
attr20    0
attr21    0
attr22    0
attr23    0
attr24    0
attr25    0
attr26    0
attr27    0
attr28    0
attr29    0
attr30    0
attr31    0
attr32    0
attr3

In [24]:
simg_x_nor = (simg_x-simg_x.min())/(simg_x.max()-simg_x.min())
simg_y_num = LabelEncoder().fit_transform(simg_y)

### Isolet Dataset

In [25]:
isolet = openml.datasets.get_dataset(300)
isolet_x, isolet_y, isolet_cat_ind, isolet_att_names = isolet.get_data(
    target = isolet.default_target_attribute, dataset_format="dataframe")
print(isolet_att_names, isolet_cat_ind)
print(isolet_y.dtype)
print(isolet_x.isnull().sum().sum())
print(isolet_y.isnull().sum())

['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f120', 'f121', 'f122', 'f123', 'f124', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130', 'f131', 'f132', 'f133', 'f134', 'f135', 'f136', 'f137', 'f138', 'f1

In [26]:
isolet_x_nor = (isolet_x-isolet_x.min())/(isolet_x.max()-isolet_x.min())
isolet_y_num = LabelEncoder().fit_transform(isolet_y)

### Nursery Dataset

In [27]:
nursery = openml.datasets.get_dataset(26)
nursery_x, nursery_y, nursery_cat_ind, nursery_att_names = nursery.get_data(
    target = nursery.default_target_attribute, dataset_format="dataframe")
print(nursery_att_names, nursery_cat_ind)
print(nursery_y.dtype)
print(nursery_x.isnull().sum(), nursery_y.isnull().sum())

['parents', 'has_nurs', 'form', 'children', 'housing', 'finance', 'social', 'health'] [True, True, True, True, True, True, True, True]
category
parents     0
has_nurs    0
form        0
children    0
housing     0
finance     0
social      0
health      0
dtype: int64 0


In [28]:
nursery_x_num = nursery_x.copy()

In [29]:
for col in nursery_x.columns:
    nursery_x_num [col] = LabelEncoder().fit_transform(nursery_x[col])

In [30]:
nursery_y_num = LabelEncoder().fit_transform(nursery_y)

### Gas_drift_different_concentration Dataset

In [31]:
gd = openml.datasets.get_dataset(1477)
gd_x, gd_y, gd_cat_ind, gd_att_names = gd.get_data(
    target = gd.default_target_attribute, dataset_format="dataframe")
print(gd_att_names, gd_cat_ind)
print(gd_y.dtype)
print(gd_x.isnull().sum().sum())
print(gd_y.isnull().sum())

['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129'] [False, False, False, False, False, False, False, False, False, False, Fals

In [32]:
gd_x_nor = (gd_x-gd_x.min())/(gd_x.max()-gd_x.min())
gd_y_num = LabelEncoder().fit_transform(gd_y)

### MagicTelescope Dataset

In [33]:
mts = openml.datasets.get_dataset(1120)
mts_x, mts_y, mts_cat_ind, mts_att_names = mts.get_data(
    target = mts.default_target_attribute, dataset_format="dataframe")
print(mts_att_names, mts_cat_ind)
print(mts_y.dtype)
print(mts_x.isnull().sum(), mts_y.isnull().sum())

['fLength:', 'fWidth:', 'fSize:', 'fConc:', 'fConc1:', 'fAsym:', 'fM3Long:', 'fM3Trans:', 'fAlpha:', 'fDist:'] [False, False, False, False, False, False, False, False, False, False]
category
fLength:     0
fWidth:      0
fSize:       0
fConc:       0
fConc1:      0
fAsym:       0
fM3Long:     0
fM3Trans:    0
fAlpha:      0
fDist:       0
dtype: int64 0


In [34]:
mts_x_nor = (mts_x-mts_x.min())/(mts_x.max()-mts_x.min())
mts_y_num = LabelEncoder().fit_transform(mts_y)

### Letter Datatset

In [35]:
letter = openml.datasets.get_dataset(6)
letter_x, letter_y, letter_cat_ind, letter_att_names = letter.get_data(
    target = letter.default_target_attribute, dataset_format="dataframe")
print(letter_att_names, letter_cat_ind)
print(letter_y.dtype)
print(letter_x.isnull().sum(), letter_y.isnull().sum())

['x-box', 'y-box', 'width', 'high', 'onpix', 'x-bar', 'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx'] [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
category
x-box    0
y-box    0
width    0
high     0
onpix    0
x-bar    0
y-bar    0
x2bar    0
y2bar    0
xybar    0
x2ybr    0
xy2br    0
x-ege    0
xegvy    0
y-ege    0
yegvx    0
dtype: int64 0


In [36]:
letter_x_nor = (letter_x-letter_x.min())/(letter_x.max()-letter_x.min())
letter_y_num = LabelEncoder().fit_transform(letter_y)

### Covertype Dataset

In [37]:
cover = openml.datasets.get_dataset(150)
cover_x, cover_y, cover_cat_ind, cover_att_names = cover.get_data(
    target = cover.default_target_attribute, dataset_format="dataframe")
print(cover_att_names, cover_cat_ind)
print(cover_y.dtype)
print(cover_x.isnull().sum(), cover_y.isnull().sum())

['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40'] [False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, 

In [38]:
columns = cover_x.columns
col_num = columns [0:10]
col_cat = columns [10:]

In [39]:
cover_x_num = cover_x[col_cat].copy()

In [40]:
for col in col_cat :
    cover_x_num[col] = LabelEncoder().fit_transform(cover_x[col])

In [41]:
cover_x_nor = (cover_x[col_num]-cover_x[col_num].min())/(cover_x[col_num].max()-cover_x[col_num].min())
cover_x_num_nor = pd.concat([cover_x_nor, cover_x_num], axis=1, join='inner')
cover_y_num = LabelEncoder().fit_transform(cover_y)

In [42]:
set(cover_y_num)

{0, 1, 2, 3, 4, 5, 6}

## Clustering Algorithms

Datasets 
1. Iris
2. Wine
3. Haberman
4. Glass
5. Satellite Image
6. Libras Move
7. Isolet

In [43]:
Datasets = ['iris', 'wine', 'haberman', 'glass', 'satellite_image', 'libras_move', 'isolet']

### KMeans Clustering

In [44]:
def kmeans (x, y, hp):
    start_time = time.time()
    kmeans = KMeans(n_clusters = hp[0], n_init = hp[1], max_iter = hp[2])
    kmeans.fit(x)
    pred = kmeans.predict(x)
    f1score = f1_score (y, pred, average = 'weighted')
    ars = adjusted_rand_score(y, pred)
    sscore = silhouette_score(x, pred, metric='euclidean')
    execution_time = time.time() - start_time
    return pred, f1score, ars, sscore, execution_time

Hyperparameters
1. n_clusters
2. n_init
3. max_iter 

In [45]:
n_clusters1 = [2, 3, 4]
n_clusters2 = [5, 6, 7]
n_clusters3 = [14, 15, 16]
n_clusters4 = [25, 26, 27]

n_clusters = [*n_clusters1, *n_clusters2, *n_clusters3, *n_clusters4]
n_init = [5, 10, 15]
max_iter = [200, 300, 400]

hp_kmeans = pd.DataFrame({ 'n_clusters': [], 'n_init': [], 'max_iter': []})
hp_kmeans = hp_kmeans.astype(int)

In [46]:
for n in n_clusters:
    for i in n_init:
        for it in max_iter:
            hp_kmeans = hp_kmeans.append(
                {'n_clusters' : n, 'n_init' : i, 'max_iter' : it}, ignore_index = True
            )

In [47]:
hp_kmeans

Unnamed: 0,n_clusters,n_init,max_iter
0,2,5,200
1,2,5,300
2,2,5,400
3,2,10,200
4,2,10,300
...,...,...,...
103,27,10,300
104,27,10,400
105,27,15,200
106,27,15,300


In [48]:
results_iris_kmeans = Parallel(n_jobs=-1)(delayed(kmeans)(iris_x_nor, iris_y_num, hp_kmeans.iloc[i]) for i in range(0, 27))

In [51]:
results_wine_kmeans = Parallel(n_jobs=-1)(delayed(kmeans)(wine_x_nor, wine_y_num, hp_kmeans.iloc[i]) for i in range(0, 27))

In [52]:
results_hman_kmeans = Parallel(n_jobs=-1)(delayed(kmeans)(hman_x_nor, hman_y_num, hp_kmeans.iloc[i]) for i in range(0, 27))

In [54]:
results_glass_kmeans = Parallel(n_jobs=-1)(delayed(kmeans)(glass_x_nor, glass_y_num, hp_kmeans.iloc[i]) for i in range(27, 54))

In [53]:
results_simg_kmeans = Parallel(n_jobs=-1)(delayed(kmeans)(simg_x_nor, simg_y_num, hp_kmeans.iloc[i]) for i in range(27, 54))

In [55]:
results_lb_kmeans = Parallel(n_jobs=-1)(delayed(kmeans)(lm_x_nor, lm_y_num, hp_kmeans.iloc[i]) for i in range(54, 81))

In [56]:
results_isolet_kmeans = Parallel(n_jobs=-1)(delayed(kmeans)(isolet_x_nor, isolet_y_num, hp_kmeans.iloc[i]) for i in range(81, 108))

In [61]:
kmeans_results = pd.DataFrame({ 
    'Dataset': [], 
    '[n_clusters, n_init, max_iter]': [], 
    'f1 Score': [], 
    'Adjusted Random Score': [],
    'Silhouette Score': [],
    'Execution Time': []
})

In [62]:
for i in range(0, len(results_iris_kmeans)):
    kmeans_results = kmeans_results.append({
        'Dataset': Datasets[0], 
        '[n_clusters, n_init, max_iter]': hp_kmeans.iloc[i].to_list(), 
        'f1 Score': results_iris_kmeans[i][1], 
        'Adjusted Random Score': results_iris_kmeans[i][2], 
        'Silhouette Score': results_iris_kmeans[i][3], 
        'Execution Time': results_iris_kmeans[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_wine_kmeans)):
    kmeans_results = kmeans_results.append({
        'Dataset': Datasets[1], 
        '[n_clusters, n_init, max_iter]': hp_kmeans.iloc[i].to_list(), 
        'f1 Score': results_wine_kmeans[i][1], 
        'Adjusted Random Score': results_wine_kmeans[i][2], 
        'Silhouette Score': results_wine_kmeans[i][3], 
        'Execution Time': results_wine_kmeans[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_hman_kmeans)):
    kmeans_results = kmeans_results.append({
        'Dataset': Datasets[2], 
        '[n_clusters, n_init, max_iter]': hp_kmeans.iloc[i].to_list(), 
        'f1 Score': results_hman_kmeans[i][1], 
        'Adjusted Random Score': results_hman_kmeans[i][2], 
        'Silhouette Score': results_hman_kmeans[i][3], 
        'Execution Time': results_hman_kmeans[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_glass_kmeans)):
    kmeans_results = kmeans_results.append({
        'Dataset': Datasets[3], 
        '[n_clusters, n_init, max_iter]': hp_kmeans.iloc[i+27].to_list(), 
        'f1 Score': results_glass_kmeans[i][1], 
        'Adjusted Random Score': results_glass_kmeans[i][2], 
        'Silhouette Score': results_glass_kmeans[i][3], 
        'Execution Time': results_glass_kmeans[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_simg_kmeans)):
    kmeans_results = kmeans_results.append({
        'Dataset': Datasets[4], 
        '[n_clusters, n_init, max_iter]': hp_kmeans.iloc[i+27].to_list(), 
        'f1 Score': results_simg_kmeans[i][1], 
        'Adjusted Random Score': results_simg_kmeans[i][2], 
        'Silhouette Score': results_simg_kmeans[i][3], 
        'Execution Time': results_simg_kmeans[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_lb_kmeans)):
    kmeans_results = kmeans_results.append({
        'Dataset': Datasets[5], 
        '[n_clusters, n_init, max_iter]': hp_kmeans.iloc[i+54].to_list(), 
        'f1 Score': results_lb_kmeans[i][1], 
        'Adjusted Random Score': results_lb_kmeans[i][2], 
        'Silhouette Score': results_lb_kmeans[i][3], 
        'Execution Time': results_lb_kmeans[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_isolet_kmeans)):
    kmeans_results = kmeans_results.append({
        'Dataset': Datasets[6], 
        '[n_clusters, n_init, max_iter]': hp_kmeans.iloc[i+81].to_list(), 
        'f1 Score': results_isolet_kmeans[i][1], 
        'Adjusted Random Score': results_isolet_kmeans[i][2], 
        'Silhouette Score': results_isolet_kmeans[i][3], 
        'Execution Time': results_isolet_kmeans[i][4]
    }, ignore_index = True) 

In [63]:
kmeans_results

Unnamed: 0,Dataset,"[n_clusters, n_init, max_iter]",f1 Score,Adjusted Random Score,Silhouette Score,Execution Time
0,iris,"[2, 5, 200]",0.555556,0.568116,0.629468,0.672048
1,iris,"[2, 5, 300]",0.000000,0.568116,0.629468,0.680048
2,iris,"[2, 5, 400]",0.555556,0.568116,0.629468,0.656048
3,iris,"[2, 10, 200]",0.555556,0.568116,0.629468,0.624044
4,iris,"[2, 10, 300]",0.555556,0.568116,0.629468,0.072005
...,...,...,...,...,...,...
184,isolet,"[27, 10, 300]",0.033661,0.511896,0.081936,38.898423
185,isolet,"[27, 10, 400]",0.056242,0.522516,0.085995,40.729013
186,isolet,"[27, 15, 200]",0.035449,0.476473,0.084286,46.814413
187,isolet,"[27, 15, 300]",0.089809,0.460954,0.082389,41.518032


In [64]:
kmeans_results.to_csv('K-Means_Results.csv')

### Agglomerative Clustering

In [57]:
def aggclustering (x, y, hp):
    start_time = time.time()
    aggcluster = AgglomerativeClustering(n_clusters = hp[0], affinity = hp[1], linkage = hp[2])
    pred = aggcluster.fit_predict(x)
    f1score = f1_score (y, pred, average = 'weighted')
    ars = adjusted_rand_score(y, pred)
    sscore = silhouette_score(x, pred, metric='euclidean')
    execution_time = time.time() - start_time
    return pred, f1score, ars, sscore, execution_time

Hyperparameters
1. n_clusters 
2. affinity 
3. linkage 

In [58]:
n_clusters1 = [2, 3, 4]
n_clusters2 = [5, 6, 7]
n_clusters3 = [14, 15, 16]
n_clusters4 = [25, 26, 27]

n_clusters = [*n_clusters1, *n_clusters2, *n_clusters3, *n_clusters4]
affinity = ['euclidean', 'manhattan', 'cosine']
linkage = ['ward', 'average', 'single']

hp_agg = pd.DataFrame({ 'n_clusters': [], 'affinity': [], 'linkage': []})
hp_agg = hp_agg.astype(int)

In [59]:
for n in n_clusters:
    for link in linkage:
        if link == 'ward' :
            hp_agg = hp_agg.append(
                {'n_clusters' : n, 'affinity' : 'euclidean', 'linkage' : link}, ignore_index = True
             )
        else :
            for aff in affinity:
                hp_agg = hp_agg.append(
                    {'n_clusters' : n, 'affinity' : aff, 'linkage' : link}, ignore_index = True
                )    

In [60]:
hp_agg

Unnamed: 0,n_clusters,affinity,linkage
0,2,euclidean,ward
1,2,euclidean,average
2,2,manhattan,average
3,2,cosine,average
4,2,euclidean,single
...,...,...,...
79,27,manhattan,average
80,27,cosine,average
81,27,euclidean,single
82,27,manhattan,single


In [65]:
results_iris_agg = Parallel(n_jobs=-1)(delayed(aggclustering)(iris_x_nor, iris_y_num, hp_agg.iloc[i]) for i in range (0, 21))

In [66]:
results_wine_agg = Parallel(n_jobs=-1)(delayed(aggclustering)(wine_x_nor, wine_y_num, hp_agg.iloc[i]) for i in range (0, 21))

In [67]:
results_hman_agg = Parallel(n_jobs=-1)(delayed(aggclustering)(hman_x_nor, hman_y_num, hp_agg.iloc[i]) for i in range (0, 21))

In [68]:
results_glass_agg = Parallel(n_jobs=-1)(delayed(aggclustering)(glass_x_nor, glass_y_num, hp_agg.iloc[i]) for i in range (21, 42))

In [69]:
results_simg_agg = Parallel(n_jobs=-1)(delayed(aggclustering)(simg_x_nor, simg_y_num, hp_agg.iloc[i]) for i in range (21, 42))

In [70]:
results_lm_agg = Parallel(n_jobs=-1)(delayed(aggclustering)(lm_x_nor, lm_y_num, hp_agg.iloc[i]) for i in range (42, 63))

In [71]:
results_isolet_agg = Parallel(n_jobs=-1)(delayed(aggclustering)(isolet_x_nor, isolet_y_num, hp_agg.iloc[i]) for i in range (63, 84))

In [72]:
agg_results = pd.DataFrame({ 
    'Dataset': [], 
    '[n_clusters, affinity, linkage]': [], 
    'f1 Score': [], 
    'Adjusted Random Score': [],
    'Silhouette Score': [],
    'Execution Time': []
})

In [73]:
for i in range(0, len(results_iris_agg)):
    agg_results = agg_results.append({
        'Dataset': Datasets[0], 
        '[n_clusters, affinity, linkage]': hp_agg.iloc[i].to_list(), 
        'f1 Score': results_iris_agg[i][1], 
        'Adjusted Random Score': results_iris_agg[i][2], 
        'Silhouette Score': results_iris_agg[i][3], 
        'Execution Time': results_iris_agg[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_wine_agg)):
    agg_results = agg_results.append({
        'Dataset': Datasets[1], 
        '[n_clusters, affinity, linkage]': hp_agg.iloc[i].to_list(), 
        'f1 Score': results_wine_agg[i][1], 
        'Adjusted Random Score': results_wine_agg[i][2], 
        'Silhouette Score': results_wine_agg[i][3], 
        'Execution Time': results_wine_agg[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_hman_agg)):
    agg_results = agg_results.append({
        'Dataset': Datasets[2], 
        '[n_clusters, affinity, linkage]': hp_agg.iloc[i].to_list(), 
        'f1 Score': results_hman_agg[i][1], 
        'Adjusted Random Score': results_hman_agg[i][2], 
        'Silhouette Score': results_hman_agg[i][3], 
        'Execution Time': results_hman_agg[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_glass_agg)):
    agg_results = agg_results.append({
        'Dataset': Datasets[3], 
        '[n_clusters, affinity, linkage]': hp_agg.iloc[i+21].to_list(), 
        'f1 Score': results_glass_agg[i][1], 
        'Adjusted Random Score': results_glass_agg[i][2], 
        'Silhouette Score': results_glass_agg[i][3], 
        'Execution Time': results_glass_agg[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_simg_agg)):
    agg_results = agg_results.append({
        'Dataset': Datasets[4], 
        '[n_clusters, affinity, linkage]': hp_agg.iloc[i+21].to_list(), 
        'f1 Score': results_simg_agg[i][1], 
        'Adjusted Random Score': results_simg_agg[i][2], 
        'Silhouette Score': results_simg_agg[i][3], 
        'Execution Time': results_simg_agg[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_lm_agg)):
    agg_results = agg_results.append({
        'Dataset': Datasets[5], 
        '[n_clusters, affinity, linkage]': hp_agg.iloc[i+42].to_list(), 
        'f1 Score': results_lm_agg[i][1], 
        'Adjusted Random Score': results_lm_agg[i][2], 
        'Silhouette Score': results_lm_agg[i][3], 
        'Execution Time': results_lm_agg[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_isolet_agg)):
    agg_results = agg_results.append({
        'Dataset': Datasets[6], 
        '[n_clusters, affinity, linkage]': hp_agg.iloc[i+63].to_list(), 
        'f1 Score': results_isolet_agg[i][1], 
        'Adjusted Random Score': results_isolet_agg[i][2], 
        'Silhouette Score': results_isolet_agg[i][3], 
        'Execution Time': results_isolet_agg[i][4]
    }, ignore_index = True) 

In [76]:
agg_results

Unnamed: 0,Dataset,"[n_clusters, affinity, linkage]",f1 Score,Adjusted Random Score,Silhouette Score,Execution Time
0,iris,"[2, euclidean, ward]",0.000000,0.568116,0.629468,0.999876
1,iris,"[2, euclidean, average]",0.000000,0.568116,0.629468,0.997875
2,iris,"[2, manhattan, average]",0.000000,0.568116,0.629468,0.997375
3,iris,"[2, cosine, average]",0.555556,0.568116,0.629468,0.997375
4,iris,"[2, euclidean, single]",0.555556,0.568116,0.629468,0.040002
...,...,...,...,...,...,...
142,isolet,"[27, manhattan, average]",0.016737,0.235841,0.089274,78.325977
143,isolet,"[27, cosine, average]",0.006171,0.115253,0.021366,77.341166
144,isolet,"[27, euclidean, single]",0.003113,0.000009,-0.055227,69.688902
145,isolet,"[27, manhattan, single]",0.002862,0.000012,-0.043531,67.746482


In [77]:
agg_results.to_csv("AggClutering_Results.csv")

### Self Organizing Maps

In [102]:
def som (x, y, hp):
    start_time = time.time()
    X = x.to_numpy()
    dim = X.shape[1]
    som = SOM(m=int(hp[0]), n=int(hp[1]), dim = dim, lr=hp[2], sigma=hp[3], max_iter=int(hp[4]))
    pred = som.fit_predict(X)
    f1score = f1_score (y, pred, average = 'weighted')
    ars = adjusted_rand_score(y, pred)
    sscore = []
    if len(set(pred)) >= 2:
        sscore = silhouette_score(x, pred, metric='euclidean')
    execution_time = time.time() - start_time
    return pred, f1score, ars, sscore, execution_time

Hyperparameters
1. m 
2. n
3. lr
4. sigma 
5. max_iter

In [103]:
M1 = [1, 2]
N1 = [2, 3, 7]
M2 = [3, 4]
N2 = [4, 5]
M3 = [2, 3]
N3 = [9, 13]
lr = [0.8, 1.0, 1.2]
sigma = [ 0.75, 1.25]
max_iter = [100, 200]

In [104]:
hp_som = pd.DataFrame({ 'M': [], 'N': [], 'lr': [], 'sigma': [], 'max_iter': []})

In [105]:
for m in M1:
    for n in N1:
        for l in lr:
            for sig in sigma:
                for it in max_iter:
                    hp_som = hp_som.append(
                        {'M' : m, 'N' : n, 'lr' : l, 'sigma' : sig, 'max_iter' : it}, ignore_index = True
                    )
                    
for m in M2:
    for n in N2:
        for l in lr:
            for sig in sigma:
                for it in max_iter:
                    hp_som = hp_som.append(
                        {'M' : m, 'N' : n, 'lr' : l, 'sigma' : sig, 'max_iter' : it}, ignore_index = True
                    )
                    
for m in M3:
    for n in N3:
        for l in lr:
            for sig in sigma:
                for it in max_iter:
                    hp_som = hp_som.append(
                        {'M' : m, 'N' : n, 'lr' : l, 'sigma' : sig, 'max_iter' : it}, ignore_index = True
                    )

In [106]:
hp_som = hp_som.astype({"M": int, "N": int, "lr": float, "sigma": float, "max_iter": int})

In [107]:
hp_som

Unnamed: 0,M,N,lr,sigma,max_iter
0,1,2,0.8,0.75,100
1,1,2,0.8,0.75,200
2,1,2,0.8,1.25,100
3,1,2,0.8,1.25,200
4,1,2,1.0,0.75,100
...,...,...,...,...,...
163,3,13,1.0,1.25,200
164,3,13,1.2,0.75,100
165,3,13,1.2,0.75,200
166,3,13,1.2,1.25,100


In [108]:
results_iris_som = Parallel(n_jobs=-1)(delayed(som)(iris_x_nor, iris_y_num, hp_som.iloc[i]) for i in range (0, len(hp_som)))

In [109]:
results_wine_som = Parallel(n_jobs=-1)(delayed(som)(wine_x_nor, wine_y_num, hp_som.iloc[i]) for i in range (0, len(hp_som)))

In [110]:
results_hman_som = Parallel(n_jobs=-1)(delayed(som)(hman_x_nor, hman_y_num, hp_som.iloc[i]) for i in range (0, len(hp_som)))

In [111]:
results_glass_som = Parallel(n_jobs=-1)(delayed(som)(glass_x_nor, glass_y_num, hp_som.iloc[i]) for i in range (0, len(hp_som)))

In [112]:
results_simg_som = Parallel(n_jobs=-1)(delayed(som)(simg_x_nor, simg_y_num, hp_som.iloc[i]) for i in range (0, len(hp_som)))

In [113]:
results_lm_som = Parallel(n_jobs=-1)(delayed(som)(lm_x_nor, lm_y_num, hp_som.iloc[i]) for i in range (72, 120))

In [114]:
results_isolet_som = Parallel(n_jobs=-1)(delayed(som)(isolet_x_nor, isolet_y_num, hp_som.iloc[i]) for i in range (120, 168))

In [115]:
som_results = pd.DataFrame({ 
    'Dataset': [], 
    '[m, n, lr, sigma, max_iter]': [], 
    'f1 Score': [], 
    'Adjusted Random Score': [],
    'Silhouette Score': [],
    'Execution Time': []
})

In [116]:
for i in range(0, len(results_iris_som)):
    som_results = som_results.append({
        'Dataset': Datasets[0], 
        '[m, n, lr, sigma, max_iter]': hp_som.iloc[i].to_list(), 
        'f1 Score': results_iris_som[i][1], 
        'Adjusted Random Score': results_iris_som[i][2], 
        'Silhouette Score': results_iris_som[i][3], 
        'Execution Time': results_iris_som[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_wine_som)):
    som_results = som_results.append({
        'Dataset': Datasets[1], 
        '[m, n, lr, sigma, max_iter]': hp_som.iloc[i].to_list(), 
        'f1 Score': results_wine_som[i][1], 
        'Adjusted Random Score': results_wine_som[i][2], 
        'Silhouette Score': results_wine_som[i][3], 
        'Execution Time': results_wine_som[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_hman_som)):
    som_results = som_results.append({
        'Dataset': Datasets[2], 
        '[m, n, lr, sigma, max_iter]': hp_som.iloc[i].to_list(), 
        'f1 Score': results_hman_som[i][1], 
        'Adjusted Random Score': results_hman_som[i][2], 
        'Silhouette Score': results_hman_som[i][3], 
        'Execution Time': results_hman_som[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_glass_som)):
    som_results = som_results.append({
        'Dataset': Datasets[3], 
        '[m, n, lr, sigma, max_iter]': hp_som.iloc[i].to_list(), 
        'f1 Score': results_glass_som[i][1], 
        'Adjusted Random Score': results_glass_som[i][2], 
        'Silhouette Score': results_glass_som[i][3], 
        'Execution Time': results_glass_som[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_simg_som)):
    som_results = som_results.append({
        'Dataset': Datasets[4], 
        '[m, n, lr, sigma, max_iter]': hp_som.iloc[i].to_list(), 
        'f1 Score': results_simg_som[i][1], 
        'Adjusted Random Score': results_simg_som[i][2], 
        'Silhouette Score': results_simg_som[i][3], 
        'Execution Time': results_simg_som[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_lm_som)):
    som_results = som_results.append({
        'Dataset': Datasets[5], 
        '[m, n, lr, sigma, max_iter]': hp_som.iloc[i+72].to_list(), 
        'f1 Score': results_lm_som[i][1], 
        'Adjusted Random Score': results_lm_som[i][2], 
        'Silhouette Score': results_lm_som[i][3], 
        'Execution Time': results_lm_som[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_isolet_som)):
    som_results = som_results.append({
        'Dataset': Datasets[6], 
        '[m, n, lr, sigma, max_iter]': hp_som.iloc[i+120].to_list(), 
        'f1 Score': results_isolet_som[i][1], 
        'Adjusted Random Score': results_isolet_som[i][2], 
        'Silhouette Score': results_isolet_som[i][3], 
        'Execution Time': results_isolet_som[i][4]
    }, ignore_index = True) 

In [117]:
som_results

Unnamed: 0,Dataset,"[m, n, lr, sigma, max_iter]",f1 Score,Adjusted Random Score,Silhouette Score,Execution Time
0,iris,"[1.0, 2.0, 0.8, 0.75, 100.0]",0.019417,0.539922,0.618006,0.297712
1,iris,"[1.0, 2.0, 0.8, 0.75, 200.0]",0.043614,0.507349,0.595077,0.073050
2,iris,"[1.0, 2.0, 0.8, 1.25, 100.0]",0.530558,0.531229,0.614280,0.051536
3,iris,"[1.0, 2.0, 0.8, 1.25, 200.0]",0.379610,0.422774,0.470404,0.064542
4,iris,"[1.0, 2.0, 1.0, 0.75, 100.0]",0.530558,0.531229,0.614280,0.053038
...,...,...,...,...,...,...
931,isolet,"[3.0, 13.0, 1.0, 1.25, 200.0]",0.022137,0.305506,0.030889,10.541925
932,isolet,"[3.0, 13.0, 1.2, 0.75, 100.0]",0.004821,0.174957,0.067987,9.396218
933,isolet,"[3.0, 13.0, 1.2, 0.75, 200.0]",0.000394,0.285022,0.060865,10.483190
934,isolet,"[3.0, 13.0, 1.2, 1.25, 100.0]",0.027409,0.293582,0.036412,9.583937


In [118]:
som_results.to_csv("SOM_Results.csv")

### DBSCAN 

In [119]:
def dbscan (x, y, hp):
    start_time = time.time()
    dbs = DBSCAN(eps = hp[0], min_samples = hp[1])
    pred = dbs.fit_predict(x)
    f1score = f1_score (y, pred, average = 'weighted')
    ars = adjusted_rand_score(y, pred)
    sscore = []
    if len(set(pred)) >= 2:
        sscore = silhouette_score(x, pred, metric='euclidean')
    execution_time = time.time() - start_time
    return pred, f1score, ars,sscore, execution_time

Hyperparameters

1. eps
2. min_samples 

In [120]:
eps = [ 0.3, 0.4, 0.5, 0.6, 0.7 ]
min_samples = [2, 3, 4, 5, 6, 7]

hp_dbs = pd.DataFrame({'eps': [], 'min_samples': []})
hp_dbs.min_samples = hp_dbs.min_samples.astype(int)

In [121]:
for ep in eps :
    for n in min_samples :
        hp_dbs = hp_dbs.append({
            'eps' : ep,
            'min_samples' : n
        }, ignore_index  = True
        )

In [122]:
hp_dbs

Unnamed: 0,eps,min_samples
0,0.3,2.0
1,0.3,3.0
2,0.3,4.0
3,0.3,5.0
4,0.3,6.0
5,0.3,7.0
6,0.4,2.0
7,0.4,3.0
8,0.4,4.0
9,0.4,5.0


In [123]:
results_iris_dbs = Parallel(n_jobs=-1)(delayed(dbscan)(iris_x_nor, iris_y_num, hp_dbs.iloc[i]) for i in range (0, len(hp_dbs)))

In [124]:
results_wine_dbs = Parallel(n_jobs=-1)(delayed(dbscan)(wine_x_nor, wine_y_num, hp_dbs.iloc[i]) for i in range (0, len(hp_dbs)))

In [125]:
results_hman_dbs = Parallel(n_jobs=-1)(delayed(dbscan)(hman_x_nor, hman_y_num, hp_dbs.iloc[i]) for i in range (0, len(hp_dbs)))

In [126]:
results_glass_dbs = Parallel(n_jobs=-1)(delayed(dbscan)(glass_x_nor, glass_y_num, hp_dbs.iloc[i]) for i in range (0, len(hp_dbs)))

In [127]:
results_simg_dbs = Parallel(n_jobs=-1)(delayed(dbscan)(simg_x_nor, simg_y_num, hp_dbs.iloc[i]) for i in range (0, len(hp_dbs)))

In [128]:
results_lm_dbs = Parallel(n_jobs=-1)(delayed(dbscan)(lm_x_nor, lm_y_num, hp_dbs.iloc[i]) for i in range (0, len(hp_dbs)))

In [129]:
results_isolet_dbs = Parallel(n_jobs=-1)(delayed(dbscan)(isolet_x_nor, isolet_y_num, hp_dbs.iloc[i]) for i in range (0, len(hp_dbs)))

In [130]:
dbs_results = pd.DataFrame({ 
    'Dataset': [], 
    '[eps, min_samples]': [], 
    'f1 Score': [], 
    'Adjusted Random Score': [],
    'Silhouette Score': [],
    'Execution Time': []
})

In [131]:
for i in range(0, len(results_iris_dbs)):
    dbs_results = dbs_results.append({
        'Dataset': Datasets[0], 
        '[eps, min_samples]': hp_dbs.iloc[i].to_list(), 
        'f1 Score': results_iris_dbs[i][1], 
        'Adjusted Random Score': results_iris_dbs[i][2], 
        'Silhouette Score': results_iris_dbs[i][3], 
        'Execution Time': results_iris_dbs[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_wine_dbs)):
    dbs_results = dbs_results.append({
        'Dataset': Datasets[1], 
        '[eps, min_samples]': hp_dbs.iloc[i].to_list(), 
        'f1 Score': results_wine_dbs[i][1], 
        'Adjusted Random Score': results_wine_dbs[i][2], 
        'Silhouette Score': results_wine_dbs[i][3], 
        'Execution Time': results_wine_dbs[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_hman_dbs)):
    dbs_results = dbs_results.append({
        'Dataset': Datasets[2], 
        '[eps, min_samples]': hp_dbs.iloc[i].to_list(), 
        'f1 Score': results_hman_dbs[i][1], 
        'Adjusted Random Score': results_hman_dbs[i][2], 
        'Silhouette Score': results_hman_dbs[i][3], 
        'Execution Time': results_hman_dbs[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_glass_dbs)):
    dbs_results = dbs_results.append({
        'Dataset': Datasets[3], 
        '[eps, min_samples]': hp_dbs.iloc[i].to_list(), 
        'f1 Score': results_glass_dbs[i][1], 
        'Adjusted Random Score': results_glass_dbs[i][2], 
        'Silhouette Score': results_glass_dbs[i][3], 
        'Execution Time': results_glass_dbs[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_simg_dbs)):
    dbs_results = dbs_results.append({
        'Dataset': Datasets[4], 
        '[eps, min_samples]': hp_dbs.iloc[i].to_list(), 
        'f1 Score': results_simg_dbs[i][1], 
        'Adjusted Random Score': results_simg_dbs[i][2], 
        'Silhouette Score': results_simg_dbs[i][3], 
        'Execution Time': results_simg_dbs[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_lm_dbs)):
    dbs_results = dbs_results.append({
        'Dataset': Datasets[5], 
        '[eps, min_samples]': hp_dbs.iloc[i].to_list(), 
        'f1 Score': results_lm_dbs[i][1], 
        'Adjusted Random Score': results_lm_dbs[i][2], 
        'Silhouette Score': results_lm_dbs[i][3], 
        'Execution Time': results_lm_dbs[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_isolet_dbs)):
    dbs_results = dbs_results.append({
        'Dataset': Datasets[6], 
        '[eps, min_samples]': hp_dbs.iloc[i].to_list(), 
        'f1 Score': results_isolet_dbs[i][1], 
        'Adjusted Random Score': results_isolet_dbs[i][2], 
        'Silhouette Score': results_isolet_dbs[i][3], 
        'Execution Time': results_isolet_dbs[i][4]
    }, ignore_index = True) 

In [132]:
dbs_results

Unnamed: 0,Dataset,"[eps, min_samples]",f1 Score,Adjusted Random Score,Silhouette Score,Execution Time
0,iris,"[0.3, 2.0]",0.555556,0.568116,0.629468,0.841547
1,iris,"[0.3, 3.0]",0.555556,0.568116,0.629468,0.018013
2,iris,"[0.3, 4.0]",0.557047,0.565747,0.467662,0.014010
3,iris,"[0.3, 5.0]",0.557047,0.565747,0.467662,0.014007
4,iris,"[0.3, 6.0]",0.557047,0.565747,0.467662,0.020012
...,...,...,...,...,...,...
205,isolet,"[0.7, 3.0]",0.000000,0.000000,[],2.892051
206,isolet,"[0.7, 4.0]",0.000000,0.000000,[],2.934078
207,isolet,"[0.7, 5.0]",0.000000,0.000000,[],2.852021
208,isolet,"[0.7, 6.0]",0.000000,0.000000,[],2.829005


In [133]:
dbs_results.to_csv("DBSCAN_Results.csv")

### OPTICS Clustering

In [63]:
def optics (x, y, hp):
    start_time = time.time()
    if hp[2] != 'dbscan' :
        opt = OPTICS(min_samples = int(hp[0]), cluster_method = hp[2], p = int(hp[1]) )
    else :
        opt = OPTICS(min_samples = int(hp[0]), cluster_method = hp[2], p = int(hp[1]), eps = 0.5 )
    pred = opt.fit_predict(x)
    f1score = f1_score (y, pred, average = 'weighted')
    ars = adjusted_rand_score(y, pred)
    sscore = []
    if len(set(pred)) >= 2:
        sscore = silhouette_score(x, pred, metric='euclidean')
    execution_time = time.time() - start_time
    return pred, f1score, ars,sscore, execution_time

Hyperparameters

1. min_samples
2. p
3. cluster_method

In [64]:
min_samples = [3, 4, 5, 6, 7]
p = [1, 2]
cluster_method = ['xi', 'dbscan']

In [65]:
hp_opt = pd.DataFrame({'min_samples': [], 'p': [], 'cluster_method': []})

for mins in min_samples :
    for val in p :
        for method in cluster_method :
            hp_opt = hp_opt.append({ 
                'min_samples' : mins,
                'p' : val,
                'cluster_method' : method
                }, ignore_index  = True
             )

In [66]:
hp_opt

Unnamed: 0,min_samples,p,cluster_method
0,3.0,1.0,xi
1,3.0,1.0,dbscan
2,3.0,2.0,xi
3,3.0,2.0,dbscan
4,4.0,1.0,xi
5,4.0,1.0,dbscan
6,4.0,2.0,xi
7,4.0,2.0,dbscan
8,5.0,1.0,xi
9,5.0,1.0,dbscan


In [67]:
results_iris_opt = Parallel(n_jobs=-1)(delayed(optics)(iris_x_nor, iris_y_num, hp_opt.iloc[i]) for i in range (0, len(hp_opt)))

  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]


In [68]:
results_wine_opt = Parallel(n_jobs=-1)(delayed(optics)(wine_x_nor, wine_y_num, hp_opt.iloc[i]) for i in range (0, len(hp_opt)))

In [69]:
results_hman_opt = Parallel(n_jobs=-1)(delayed(optics)(hman_x_nor, hman_y_num, hp_opt.iloc[i]) for i in range (0, len(hp_opt)))

  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]


In [70]:
results_glass_opt = Parallel(n_jobs=-1)(delayed(optics)(glass_x_nor, glass_y_num, hp_opt.iloc[i]) for i in range (0, len(hp_opt)))

In [71]:
results_simg_opt = Parallel(n_jobs=-1)(delayed(optics)(simg_x_nor, simg_y_num, hp_opt.iloc[i]) for i in range (0, len(hp_opt)))

In [72]:
results_lm_opt = Parallel(n_jobs=-1)(delayed(optics)(lm_x_nor, lm_y_num, hp_opt.iloc[i]) for i in range (0, len(hp_opt)))

In [73]:
opt_results = pd.DataFrame({ 
    'Dataset': [], 
    '[min_samples, p, cluster_method]': [], 
    'f1 Score': [], 
    'Adjusted Random Score': [],
    'Silhouette Score': [],
    'Execution Time': []
})

In [74]:
for i in range(0, len(results_iris_opt)):
    opt_results = opt_results.append({
        'Dataset': Datasets[0], 
        '[min_samples, p, cluster_method]': hp_opt.iloc[i].to_list(), 
        'f1 Score': results_iris_opt[i][1], 
        'Adjusted Random Score': results_iris_opt[i][2], 
        'Silhouette Score': results_iris_opt[i][3], 
        'Execution Time': results_iris_opt[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_wine_opt)):
    opt_results = opt_results.append({
        'Dataset': Datasets[1], 
        '[min_samples, p, cluster_method]': hp_opt.iloc[i].to_list(), 
        'f1 Score': results_wine_opt[i][1], 
        'Adjusted Random Score': results_wine_opt[i][2], 
        'Silhouette Score': results_wine_opt[i][3], 
        'Execution Time': results_wine_opt[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_hman_opt)):
    opt_results = opt_results.append({
        'Dataset': Datasets[2], 
        '[min_samples, p, cluster_method]': hp_opt.iloc[i].to_list(), 
        'f1 Score': results_hman_opt[i][1], 
        'Adjusted Random Score': results_hman_opt[i][2], 
        'Silhouette Score': results_hman_opt[i][3], 
        'Execution Time': results_hman_opt[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_glass_opt)):
    opt_results = opt_results.append({
        'Dataset': Datasets[3], 
        '[min_samples, p, cluster_method]': hp_opt.iloc[i].to_list(), 
        'f1 Score': results_glass_opt[i][1], 
        'Adjusted Random Score': results_glass_opt[i][2], 
        'Silhouette Score': results_glass_opt[i][3], 
        'Execution Time': results_glass_opt[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_simg_opt)):
    opt_results = opt_results.append({
        'Dataset': Datasets[4], 
        '[min_samples, p, cluster_method]': hp_opt.iloc[i].to_list(), 
        'f1 Score': results_simg_opt[i][1], 
        'Adjusted Random Score': results_simg_opt[i][2], 
        'Silhouette Score': results_simg_opt[i][3], 
        'Execution Time': results_simg_opt[i][4]
    }, ignore_index = True) 
    
for i in range(0, len(results_lm_opt)):
    opt_results = opt_results.append({
        'Dataset': Datasets[5], 
        '[min_samples, p, cluster_method]': hp_opt.iloc[i].to_list(), 
        'f1 Score': results_lm_opt[i][1], 
        'Adjusted Random Score': results_lm_opt[i][2], 
        'Silhouette Score': results_lm_opt[i][3], 
        'Execution Time': results_lm_opt[i][4]
    }, ignore_index = True) 

In [76]:
opt_results

Unnamed: 0,Dataset,"[min_samples, p, cluster_method]",f1 Score,Adjusted Random Score,Silhouette Score,Execution Time
0,iris,"[3.0, 1.0, xi]",0.049383,0.062074,-0.152539,0.578670
1,iris,"[3.0, 1.0, dbscan]",0.555556,0.568116,0.629468,0.563403
2,iris,"[3.0, 2.0, xi]",0.049383,0.032408,-0.1426,0.479338
3,iris,"[3.0, 2.0, dbscan]",0.166667,0.000000,[],0.518249
4,iris,"[4.0, 1.0, xi]",0.137566,0.052315,-0.250188,0.448156
...,...,...,...,...,...,...
115,libras_move,"[6.0, 2.0, dbscan]",0.000000,0.003767,-0.293193,1.126745
116,libras_move,"[7.0, 1.0, xi]",0.007143,0.058434,-0.040706,1.132308
117,libras_move,"[7.0, 1.0, dbscan]",0.000000,0.000000,[],1.212583
118,libras_move,"[7.0, 2.0, xi]",0.013333,0.024293,-0.153615,1.472482


In [77]:
opt_results.to_csv("OPTICS_Results.csv")