In [1]:
import pandas as pd
import numpy as np
from time import time

from IPython.display import display

from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances

from functools import reduce

from sklearn.metrics import jaccard_score # scikit 0.21
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics import accuracy_score

In [2]:
# X shoudl be a numpy matrix, very likely sparse matrix: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix
# T1 > T2 for overlapping clusters
# T1 = Distance to centroid point to not include in other clusters
# T2 = Distance to centroid point to include in cluster
# T1 > T2 for overlapping clusters
# T1 < T2 will have points which reside in no clusters
# T1 == T2 will cause all points to reside in mutually exclusive clusters
# Distance metric can be any from here: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html
# filemap may be a list of point names in their order in X. If included, row numbers from X will be replaced with names from filemap. 
 
def canopy(X, T1, T2, distance_metric='euclidean', filemap=None):
    canopies = dict()
    X1_dist = pairwise_distances(X, metric=distance_metric)
    canopy_points = set(range(X.shape[0]))
    while canopy_points:
        point = canopy_points.pop()
        i = len(canopies)
        canopies[i] = {"c":point, "points": list(np.where(X1_dist[point] < T2)[0])}
        canopy_points = canopy_points.difference(set(np.where(X1_dist[point] < T1)[0]))
    if filemap:
        for canopy_id in canopies.keys():
            canopy = canopies.pop(canopy_id)
            canopy2 = {"c":filemap[canopy['c']], "points":list()}
            for point in canopy['points']:
                canopy2["points"].append(filemap[point])
            canopies[canopy_id] = canopy2
    return canopies

In [3]:
def euclideanDistance(vector1, vector2):
        #print(vector1)
        #print(vector2)
        return np.sqrt(np.sum(np.power(vector1-vector2, 2)))

def getDistance(row_center, row_sample):
        #print(row_center)
        row_center = np.asarray(row_center)
        #row_center = np.asarray(row_sample)
        return euclideanDistance(row_center, row_sample)

def getSquaredError(data, kmeans):
    distances = []
    for i in range(k): # Qtd de clusters
        distance = 0
        for index_labels, value_labels in enumerate(kmeans.labels_): #kmeans.labels_ possui o cluster de cada elemento
            if (i == value_labels):
                #print(value_labels)
                distance = distance + getDistance(kmeans.cluster_centers_[value_labels], data.loc[index_labels].values)
        distances.append(distance) #Erro quadratico medio de cada cluster
    distances = np.asarray(distances)
    error = np.sum(distances)
    return error

In [4]:
# -------------- Density Canopy -------------- #

# Definition 1
#OBS.: enumerate com numpy mto mais rápido que iterrows
def mean_dist(D):
    n = D.shape[0]
    D = D.values
    sum_D = np.zeros((n, n))
    for i, row_i in enumerate(D):
        for j, row_j in enumerate(D[i+1:,]):
            sum_D[i][j] = euclideanDistance(row_i, row_j)
    return (2/(n*(n-1))) * np.sum(sum_D) if n > 1 else D.sum()

# Definition 2
def get_densities(D, meanDis):
    densities = np.zeros(D.shape[0], dtype=int)
    aux_D = D.values
    for i, row_i in enumerate(aux_D):
        for j, row_j in enumerate(aux_D):
            if euclideanDistance(row_i, row_j) - meanDis < 0:
                densities[i] += 1
    return densities

#Definition 3
def cluster_dist_mean(D, densities, meanDis):
    inv_a = np.zeros(D.shape[0])
    densities_aux = densities.copy()
    for i, row_i in enumerate(D.values):
        sum_dists = 0
        for j, row_j in enumerate(D.values):
            dist = euclideanDistance(row_i, row_j)
            if dist - meanDis < 0:
                sum_dists += dist
        inv_a[i] = 0 if sum_dists == 0 else (densities_aux[i]*(densities_aux[i]-1)) / 2 * sum_dists
    return inv_a

#Definition 4
def clusters_dist(D, densities):
    s = []
    densities_aux = densities.copy()
    for i, row_i in enumerate(D.values):
        maxDist = 0
        minDist = float("inf")
        dist = 0
        flag = 1 #Se flag=0 entao min dist, se flag=1 retornar max dist
        for j, row_j in enumerate(D.values):
            dist = euclideanDistance(row_i, row_j)
            if densities_aux[j] > densities_aux[i]:
                flag = 0
                if (dist < minDist):
                    minDist = dist
            else:
                if (dist > maxDist):
                    maxDist = dist
#             if densities[j] == np.amax(densities):
#                 flag = 1
        if flag == 1:
            s.append(maxDist)
        else: # p(j) > p(i)
            s.append(minDist)
    return s

# Definition 6
def product_weight(p, a, s): # a esta como 1/a do artigo
    w = []
    for i, row_i in enumerate(p):
        w.append(p[i] * (a[i]) * s[i])
    return w
        
def getCluster(D, meanDis, index=None): 
    aux_D = D.copy()
    df = get_densities(D, meanDis)
    if (index == None): #Primeira execução pega o de maior densidade
        row_i = D.iloc[np.argmax(df)].values
        #print(np.argmax(df))
    else:
        row_i = D.iloc[index].values
        #print (index)
    cluster = []
    for j, row_j in enumerate(aux_D.values):
        if euclideanDistance(row_i, row_j) - meanDis < 0:
            cluster.append(j)
    #print (cluster)
    #auxD.drop(cluster, inplace=True)
    #display(auxD)
    return row_i, cluster #Elemento central e cluster

def removeOutliers(aux_D, densities, inv_a, s, meanDis):
    #remove elemento com densidade = 1 e que o s[i] seja maior que o raio
    outliers = []
    for i, row_i in enumerate(aux_D.values):
        if densities[i] == 1 and s[i] > meanDis:
            outliers.append(i)
    aux_D.drop(outliers, inplace=True) #removendo outliers
    aux_D.reset_index(drop=True, inplace=True)
    densities = np.delete(densities, outliers, 0)
    inv_a = np.delete(inv_a, outliers, 0)
    s = np.delete(s, outliers, 0)
    return aux_D, densities, inv_a, s

def remove_cluster(D, meanDis, idx=None):
    center, cluster = getCluster(D, meanDis, index=idx)
    D.drop(cluster, inplace=True) #removendo cluster ja identificado
    D.reset_index(drop=True, inplace=True)
    return center

def run(D):
    
    # Step 1
    meanDis = mean_dist(D)
    print ("Distancia média: ", meanDis)
    aux_D = D.copy()
    center = remove_cluster(aux_D, meanDis)
    centers = np.array([center])
    
    # Step 2
    meanDis = mean_dist(aux_D)
    densities = get_densities(aux_D, meanDis)
    inv_a = cluster_dist_mean(aux_D, densities, meanDis) #nome da variavel ALTERADO
    s = clusters_dist(aux_D, densities)
    w_set = densities * inv_a * s #ALTERADO 
    aux_index = np.argmax(w_set)
    center = remove_cluster(aux_D, meanDis, idx=aux_index)
    center = np.array([center])
    centers = np.concatenate((centers, center), axis=0)
    
    # Step 3
    while(not aux_D.empty):
        meanDis = mean_dist(aux_D)
        densities = get_densities(aux_D, meanDis)
        inv_a = cluster_dist_mean(aux_D, densities, meanDis)
        s = clusters_dist(aux_D, densities)
        aux_D, densities, inv_a, s = removeOutliers(aux_D, densities, inv_a, s, meanDis)
        clusters_distance_matrix = pairwise_distances(aux_D, centers, metric='euclidean')
        
        w_set = densities * inv_a # ALTERAR?
        print (w_set)
        for ids, row_i in enumerate(aux_D.values):
            w_set[ids] = reduce(lambda x, y: x * y, clusters_distance_matrix[ids] * w_set[ids])
        c_id = np.argmax(w_set.max())
        #print("elemento:", w_set[c_id])
        center = remove_cluster(aux_D, meanDis, idx=c_id)
        aux_center = np.array([center])
        centers = np.concatenate((centers, aux_center), axis=0)
    return centers
#     print("KS:", len(centers), centers)


In [5]:
files = ["soybean-small", "iris", "wine",  "segmentation", "ionosphere"]
ks = [4,3,3,7,2]
#kmeansTypes = ["random", "k-means++"]
kmeansTypes = ["random"]

In [6]:
for kmeansType in kmeansTypes:
    print ("--------- "+ kmeansType +" test ---------")
    for index, file in enumerate(files):
        print ("\n----- "+file+" -----\n")
        data = pd.read_csv("datasets/"+file+".data", header=None)
        print (data.shape)
        if file == "segmentation" or file == "wine": #Target eh na primeira coluna
            target = data.iloc[:,0]
            data = data.iloc[:,1:]   
        else: #Target na última coluna
            target = data.iloc[:,-1]
            data = data.iloc[:,:-1]
        centers = run(data)
        print("centros:", len(centers))
        
        clustering_times = []
        start = time()
        k = ks[index]
        kmeans = KMeans(n_clusters=k, random_state=100, init=kmeansType, n_init=1, max_iter=100).fit(data)
        #display(data)
        error = getSquaredError(data, kmeans)
        #print(kmeans.labels_)
        end = time()
        T1 = error/(data.shape[0])
        T2 = error/(data.shape[0]/2)
        
        #print(canopy(data.values, T1, T2))
        #print("Erro quadrático médio: ",error)
        clustering_times.append(end - start)
        #print(clustering_times)
        #print(kmeans.cluster_centers_)

--------- random test ---------

----- soybean-small -----

(47, 36)
Distancia média:  5.1867052735002455
[4.47213595 4.89897949 8.48528137 9.        ]
[2.44948974 2.44948974]
[0]
centros: 5

----- iris -----

(150, 5)
Distancia média:  2.543769212251672
[ 3.          7.9598995   5.29150262  6.02494813  6.32455532  1.64924225
  2.06155281  4.4         1.58745079  4.26028168  2.77128129  2.64575131
  4.41588043 18.83878977  4.53541619  2.1       ]
[8.19756061 3.70405184 2.45560583 1.23693169 1.58745079 2.61533937
 1.38564065 4.25793377 0.82462113 3.23109888]
[1.28452326 1.28452326]
[0]
centros: 6

----- wine -----

(178, 14)
Distancia média:  352.636801172232
[ 71.01359306  24.76331965  89.79975947 544.7281891   62.04655671
  83.10318405 376.95522068  66.7146041   64.94070834 514.23189166]
[ 24.76331965  76.29315566   9.9650991  103.61914447]
[4.98254955 4.98254955]
[0]
centros: 6

----- segmentation -----

(2310, 20)
Distancia média:  178.08804260605928
[82845.61850226  2987.55548303  

In [7]:
mu, sigma = 0, 0.1 
# creating a noise with the same dimension as the dataset (2,2) 
noise = np.random.normal(mu, sigma, [2,2]) 
print(noise)

[[ 0.04833514  0.09038217]
 [ 0.13501541 -0.05348696]]


u A X = 0, u Y A = 0
σ x = σ y = 2

u B X = 5, u Y B = 1
σ x = 1 , σ y = 2

u C X = 5, u C Y = −2
σ x = σ y = 1