Comparar KMeans "puro" com KMeans com Improved Density Canopy

Métricas erro Quadrático médio

Quando bancos de dados UCI usar?

1. Calcular MeanDis de cada elemento
2. Calcular densidade de cada elemento
3. O com maior densidade é o centro e eliminar os que não estão dentro do raio.

In [1]:
import pandas as pd
import numpy as np
from time import time

from IPython.display import display

from sklearn.cluster import KMeans

In [2]:
from sklearn.metrics.pairwise import pairwise_distances

# X shoudl be a numpy matrix, very likely sparse matrix: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix
# T1 > T2 for overlapping clusters
# T1 = Distance to centroid point to not include in other clusters
# T2 = Distance to centroid point to include in cluster
# T1 > T2 for overlapping clusters
# T1 < T2 will have points which reside in no clusters
# T1 == T2 will cause all points to reside in mutually exclusive clusters
# Distance metric can be any from here: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html
# filemap may be a list of point names in their order in X. If included, row numbers from X will be replaced with names from filemap. 
 
def canopy(X, T1, T2, distance_metric='euclidean', filemap=None):
    canopies = dict()
    X1_dist = pairwise_distances(X, metric=distance_metric)
    canopy_points = set(range(X.shape[0]))
    while canopy_points:
        point = canopy_points.pop()
        i = len(canopies)
        canopies[i] = {"c":point, "points": list(np.where(X1_dist[point] < T2)[0])}
        canopy_points = canopy_points.difference(set(np.where(X1_dist[point] < T1)[0]))
    if filemap:
        for canopy_id in canopies.keys():
            canopy = canopies.pop(canopy_id)
            canopy2 = {"c":filemap[canopy['c']], "points":list()}
            for point in canopy['points']:
                canopy2["points"].append(filemap[point])
            canopies[canopy_id] = canopy2
    return canopies

In [3]:
def euclideanDistance(vector1, vector2):
        #print(vector1)
        #print(vector2)
        return np.sqrt(np.sum(np.power(vector1-vector2, 2)))

def getDistance(row_center, row_sample):
        #print(row_center)
        row_center = np.asarray(row_center)
        #row_center = np.asarray(row_sample)
        return euclideanDistance(row_center, row_sample)

def getSquaredError(data, kmeans):
    distances = []
    for i in range(k): # Qtd de clusters
        distance = 0
        for index_labels, value_labels in enumerate(kmeans.labels_): #kmeans.labels_ possui a que cluster cada elemento pertence
            if (i == value_labels):
                #print(value_labels)
                distance = distance + getDistance(kmeans.cluster_centers_[value_labels], data.loc[index_labels].values)
        
        distances.append(distance) #Erro quadratico medio de cada cluster
    
    distances = np.asarray(distances)
    error = np.sum(distances)
    return error

In [4]:
# -------------- Density Canopy -------------- #

def MeanDis(D):
    #OBS enumerate com numpy mto mais rápido que iterrows
    n = D.shape[0]
    D = D.values
    meanDis = 0
    
    for i, row_i in enumerate(D):
        for j, row_j in enumerate(D[i+1:,]):
            
            meanDis = meanDis + euclideanDistance(row_i, row_j)
            
    meanDis = (2/(n*(n-1))) * meanDis
    
    return meanDis
def getDensitys(D): #Definition 2
    
    meanDis = MeanDis(D)
    print ("Distância média: ", meanDis)
    df = pd.DataFrame(np.nan, index=range(D.shape[0]), columns=['density']) #Criando DataFrame vazio
    #display(df.tail())
    D = D.values
    for i, row_i in enumerate(D):
        aux = 0
        for j, row_j in enumerate(D):
            if euclideanDistance(row_i, row_j) - meanDis < 0:
                aux = aux + 1
        df.loc[i] = aux
    
    #display(df.head(20))
    return df

In [5]:
files = ["soybean-small", "iris", "wine",  "segmentation", "ionosphere"]
ks = [4,3,3,7,2]
kmeansTypes = ["random", "k-means++"]

In [6]:

for kmeansType in kmeansTypes:
    print ("--------- "+ kmeansType +" test ---------")
    
    for index, file in enumerate(files):
        print ("\n----- "+file+" -----\n")
        data = pd.read_csv("datasets/"+file+".data", header=None)
        print (data.shape)

        #display(data.head())
        if file == "segmentation": #Target eh na primeira coluna
            target = data.iloc[:,0]
            data = data.iloc[:,1:]
            
            
        else: #Target na última coluna
            target = data.iloc[:,-1]
            data = data.iloc[:,:-1]
        
        
        getDensitys(data)
        clustering_times = []
        start = time()
        k = ks[index]
        kmeans = KMeans(n_clusters=k, random_state=100, init=kmeansType, n_init=1, max_iter=100).fit(data)
        
        error = getSquaredError(data, kmeans)
        
        
                #print(kmeans.labels_)
        end = time()
        T1 = error/(data.shape[0])
        T2 = error/(data.shape[0]/2)
        
        #print(canopy(data.values, T1, T2))
        print("Erro quadrático médio: ",error)
        clustering_times.append(end - start)
        print(clustering_times)
        #print(kmeans.cluster_centers_)

--------- random test ---------

----- soybean-small -----

(47, 36)
Distância média:  5.186705273500234
Erro quadrático médio:  100.72984563120583
[0.028558731079101562]

----- iris -----

(150, 5)
Distância média:  2.5437692122517013
Erro quadrático médio:  97.32592423430009
[0.03437304496765137]

----- wine -----

(178, 14)
Distância média:  17.468877956672003
Erro quadrático médio:  1172.244025611532
[0.04710268974304199]

----- segmentation -----

(210, 20)
Distância média:  185.4874269607976
Erro quadrático médio:  12560.705081729095
[0.056040287017822266]

----- ionosphere -----

(351, 35)
Distância média:  3.9918803461829198
Erro quadrático médio:  796.4666759281233
[0.06422615051269531]
--------- k-means++ test ---------

----- soybean-small -----

(47, 36)
Distância média:  5.186705273500234
Erro quadrático médio:  96.99203638192614
[0.009288311004638672]

----- iris -----

(150, 5)
Distância média:  2.5437692122517013
Erro quadrático médio:  97.34621969415683
[0.033096075057

In [7]:
#usando numpy em vez do pandas pode tornar mais rápido pra essas operações