# This notebook examines the clustering performance of MSTClustering

Please read https://github.com/jakevdp/mst_clustering for further details.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os

import sklearn.preprocessing

from mst_clustering import MSTClustering
from sklearn.metrics.cluster import adjusted_mutual_info_score

In [2]:
# Directory holding the clustering benchmark datasets in csv format.
repo_dir = '../../datasets/datasets in csv format/clustering/'


def find_csv_files(root):
    """Return the sorted paths of all files below *root* ending in '.csv'.

    The directory tree is walked recursively.  Only a real '.csv' suffix
    counts (a name such as 'data.csv.bak' is skipped), and the result is
    sorted so that the dataset order does not depend on the order in which
    the filesystem happens to list the files.
    """
    paths = []
    for subdir, _dirs, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith('.csv'):
                paths.append(os.path.join(subdir, filename))
    return sorted(paths)


def load_dataset(path):
    """Read one header-less csv file and return ``[data, flag]``.

    The last column is taken as the label vector and cast to int32; all
    other columns form the data matrix, which is cast to float32 and
    rescaled column-wise with sklearn's MinMaxScaler (default settings).
    """
    table = np.array(pd.read_csv(path, header=None))

    data = table[:, :-1].astype(np.float32)
    flag = table[:, -1].astype(np.int32)

    data = sklearn.preprocessing.MinMaxScaler().fit_transform(data)
    return [data, flag]


# paths of all the csv files, in a stable (sorted) order
repo_dir_ext = find_csv_files(repo_dir)

# read all the data from csv to a list of [data(2d matrix), flag(1d vector)],
# index-aligned with repo_dir_ext
repo_datasets = [load_dataset(path) for path in repo_dir_ext]

# print some basic info
# (single-argument print(...) behaves the same on Python 2 and Python 3)
print(len(repo_datasets))
for path in repo_dir_ext:
    print(path)

7
../../datasets/datasets in csv format/clustering/drivFaceD.csv
../../datasets/datasets in csv format/clustering/image.csv
../../datasets/datasets in csv format/clustering/libras.csv
../../datasets/datasets in csv format/clustering/maps.csv
../../datasets/datasets in csv format/clustering/motor.csv
../../datasets/datasets in csv format/clustering/pen.csv
../../datasets/datasets in csv format/clustering/prest.csv


In [3]:
"""
For each dataset, test the performance of mst_clustering
"""
# best AMI score reached on each dataset, in the order of repo_datasets
res = []

# parameter grid tried on every dataset
neighbor = [2, 3, 5, 7, 10, 15, 20, 25, 50, 75]
cutoff = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45,
          0.5, 0.6, 0.7, 0.8, 0.9, 1]

for data, flag in repo_datasets:
    # AMI between the true labels and the predicted clustering for every
    # (n_neighbors, cutoff) combination; neighbors vary in the outer loop
    sub_res = [
        adjusted_mutual_info_score(
            flag,
            MSTClustering(cutoff=c, n_neighbors=n).fit_predict(data))
        for n in neighbor
        for c in cutoff
    ]

    # keep only the best score over the whole grid
    res.append(np.max(sub_res))

In [4]:
# analyze the results: for each dataset print its 1-based rank and path,
# followed by the best AMI score found for it.
# print(...) is used with a single argument throughout so the cell runs
# unchanged on both Python 2 and Python 3.
for rank, (path, score) in enumerate(zip(repo_dir_ext, res), 1):
    print("{0}. {1}".format(rank, path))
    print(score)

1. ../../datasets/datasets in csv format/clustering/drivFaceD.csv
0.108474284909
2. ../../datasets/datasets in csv format/clustering/image.csv
0.524717581555
3. ../../datasets/datasets in csv format/clustering/libras.csv
0.562132662396
4. ../../datasets/datasets in csv format/clustering/maps.csv
0.810196882658
5. ../../datasets/datasets in csv format/clustering/motor.csv
1.0
6. ../../datasets/datasets in csv format/clustering/pen.csv
0.644884154999
7. ../../datasets/datasets in csv format/clustering/prest.csv
0.512079410476
