# Evaluation of Clustering Performance
We search for the optimal number of clusters for the loans data set (`LoansNumerical.csv`) using several measures for clustering evaluation. First, we load all the libraries.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from sklearn import metrics
from sklearn import datasets
from sklearn.cluster import KMeans
%matplotlib inline

Next, we set the random seed, the maximum value of k, and the list of values of k that we will check.

In [2]:
# Seed passed to KMeans as its random_state.
random_state = 1234

# Largest number of clusters that will be tried.
max_k = 10

# Candidate cluster counts: every integer from 2 up to and including max_k.
k_values = np.arange(2, max_k + 1)

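Load the loans dataset (`LoansNumerical.csv`). The `safe_loans` column is kept as the reference label; all other columns are used as features.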

In [3]:
# Read the loans table from the CSV file in the working directory.
dataset = pd.read_csv('LoansNumerical.csv')

# The label column is set aside; every other column is treated as a feature.
target = 'safe_loans'
features = dataset.columns[dataset.columns != target]

# x: feature columns used for clustering; y: reference labels used by the
# label-based evaluation measures.
x = dataset.loc[:, features]
y = dataset.loc[:, target]

In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the
# loaded table; left as the last expression so the notebook renders the result.
dataset.describe()

Unnamed: 0,sub_grade_num,short_emp,emp_length_num,dti,payment_inc_ratio,delinq_2yrs,delinq_2yrs_zero,inq_last_6mths,last_delinq_none,last_major_derog_none,...,int_rate,total_rec_int,annual_inc,funded_amnt,funded_amnt_inv,installment,num_term,grade_num,loan_amnt,safe_loans
count,122462.0,122462.0,122462.0,122462.0,122462.0,122462.0,122462.0,122462.0,122462.0,122462.0,...,122462.0,122462.0,122462.0,122462.0,122462.0,122462.0,122462.0,122462.0,122462.0,122462.0
mean,0.597511,0.123377,6.373618,15.504442,7.566317,0.211845,0.858144,0.9797,0.588493,0.873757,...,13.642497,1697.163606,71382.82,12744.445624,12511.236718,396.869141,40.861459,4.233305,12817.792254,0.622381
std,0.278943,0.328871,3.735303,7.495961,4.121514,0.66102,0.348904,1.258867,0.492109,0.332125,...,4.391458,1794.062713,58398.78,7886.861593,7941.774515,239.461992,9.645828,1.361462,7931.898196,0.782718
min,0.2,0.0,0.0,0.0,0.028895,0.0,0.0,0.0,0.0,0.0,...,5.42,0.0,1896.0,500.0,0.0,15.67,36.0,0.0,500.0,-1.0
25%,0.4,0.0,3.0,9.89,4.3668,0.0,1.0,0.0,0.0,1.0,...,10.62,550.775,44160.0,6625.0,6300.0,216.23,36.0,3.0,6725.0,1.0
50%,0.6,0.0,6.0,15.27,6.968625,0.0,1.0,1.0,1.0,1.0,...,13.48,1137.55,60000.0,11000.0,10500.0,348.18,36.0,4.0,11000.0,1.0
75%,0.8,0.0,11.0,20.85,10.2179,0.0,1.0,2.0,1.0,1.0,...,16.29,2168.83,85000.0,17450.0,17000.0,522.03,36.0,5.0,17600.0,1.0
max,1.0,1.0,11.0,39.88,43.5456,29.0,1.0,33.0,1.0,1.0,...,26.06,20758.2,7141778.0,35000.0,35000.0,1408.13,60.0,6.0,35000.0,1.0


List of the measures we will check.

In [5]:
# scikit-learn first shipped the Calinski-Harabasz index under the misspelled
# name `calinski_harabaz_score`. The corrected `calinski_harabasz_score` was
# added in 0.20 and the old name was removed in 0.23, so referencing the old
# attribute directly raises AttributeError on current releases.
# The legacy attribute is used when the installed version still provides it
# (so code comparing against that attribute keeps matching); otherwise the
# corrected name is used.
calinski_harabasz = getattr(metrics, 'calinski_harabaz_score', None)
if calinski_harabasz is None:
    calinski_harabasz = metrics.calinski_harabasz_score

# Evaluation measures applied to every clustering. Order matters: it decides
# the order in which the measures are stored and later plotted.
score_funcs = [
    metrics.adjusted_rand_score,
    metrics.adjusted_mutual_info_score,
    metrics.mutual_info_score,
    calinski_harabasz,
    metrics.fowlkes_mallows_score,
    metrics.homogeneity_completeness_v_measure,
    metrics.silhouette_score
]

For every value of k, we apply k-means clustering and then evaluate all the metrics.

In [1]:
# Measures that judge a clustering from the data alone, called as f(x, labels).
# Every other measure compares the predicted clusters with the reference
# labels y. Both spellings of the Calinski-Harabasz function are listed
# because scikit-learn renamed it (the old `calinski_harabaz_score` attribute
# no longer exists in current releases); matching on the function name avoids
# touching an attribute that may be missing.
internal_measures = {
    'calinski_harabaz_score',
    'calinski_harabasz_score',
    'silhouette_score',
}

# One list of values per measure, filled in the same order as k_values.
# homogeneity_completeness_v_measure returns three numbers, so it gets three
# separate entries instead of one keyed by its own name.
scores = {}
for score_func in score_funcs:
    if score_func is not metrics.homogeneity_completeness_v_measure:
        scores[score_func.__name__] = []
scores['homogeneity'] = []
scores['completeness'] = []
scores['v_measure'] = []

for k in k_values:
    # Cluster the feature matrix into k groups; yp holds the predicted cluster
    # index of every row.
    yp = KMeans(n_clusters=k, random_state=random_state).fit_predict(x)
    for score_func in score_funcs:
        name = score_func.__name__
        if score_func is metrics.homogeneity_completeness_v_measure:
            h, c, v = score_func(y, yp)
            scores['homogeneity'].append(h)
            scores['completeness'].append(c)
            scores['v_measure'].append(v)
        elif name in internal_measures:
            # NOTE(review): silhouette_score works on pairwise distances and
            # the table has 122,462 rows, so this call is expensive; consider
            # its `sample_size` argument if the run time is a problem.
            scores[name].append(score_func(x, yp))
        else:
            scores[name].append(score_func(y, yp))

NameError: name 'score_funcs' is not defined

Plot all the metrics.

In [None]:
# Figure size and font must be configured before the figure is created;
# rcParams changed afterwards do not affect an already existing figure.
plt.rcParams['figure.figsize'] = (40.0, 30.0)
font = {'family' : 'sans', 'size'   : 28}
plt.rc('font', **font)

# One panel per measure, three panels per row. The number of rows is derived
# from how many measures were collected, so the grid always has enough cells.
# squeeze=False keeps axarr two-dimensional even when there is a single row.
n_cols = 3
n_rows = max(1, (len(scores) + n_cols - 1) // n_cols)
f, axarr = plt.subplots(n_rows, n_cols, squeeze=False)

for i, s in enumerate(scores):
    ax = axarr[i // n_cols, i % n_cols]
    ax.set_xlim([1, max_k])
    ax.set_xticks(k_values)
    ax.set_title("Measure - " + s)
    ax.plot(k_values, scores[s])

# Hide any grid cells left over when the number of measures is not a multiple
# of n_cols.
for ax in axarr.flat[len(scores):]:
    ax.set_visible(False)
    