In [30]:
from Datos import Datos
from clasificador import ClasificadorKNN
from ClusteringKMeans import KMeans
from EstrategiaParticionado import ValidacionSimple, ValidacionCruzada
import numpy as np
import pandas as pd

# Apartado 1

In [31]:
# Shared experiment settings: seed handed to every validacion() call, the
# cross-validation strategy (built with 5 as its argument) and the K grid.
seed = 29
particion = ValidacionCruzada(5)
ks = list(range(1, 32, 3))

# The two labelled datasets used by the KNN experiments.
datos_pima = Datos('data/pima-indians-diabetes.data')
datos_wdbc = Datos('data/wdbc.data')

In [32]:
# One row per K; each row holds the formatted result without and with
# normalisation (in that order), matching the column order used for df_pima.
results_pima = []
for k in ks:
    fila = []
    for normalizar in (False, True):
        clasificador = ClasificadorKNN(k=k, norm=normalizar)
        # validacion() returns a pair that this notebook treats as (error, std).
        error, std = clasificador.validacion(particion, datos_pima, seed)
        fila.append('{:.6f} +/- {:.6f}'.format(error, std))
    results_pima.append(fila)

In [33]:
# Tabulate the Pima results (rows indexed by K) and attach a caption.
# Note that df_pima ends up bound to the styled object, not the raw DataFrame.
df_pima = (
    pd.DataFrame(
        results_pima,
        index=pd.Index(ks, name='K'),
        columns=['Error No Normalizado', 'Error Si Normalizado'],
    )
    .style
    .set_caption('Pima-Indian-Diabetes')
)

In [34]:
# One row per K; each row holds the formatted result without and with
# normalisation (in that order), matching the column order used for df_wdbc.
results_wdbc = []
for k in ks:
    fila = []
    for normalizar in (False, True):
        clasificador = ClasificadorKNN(k=k, norm=normalizar)
        # validacion() returns a pair that this notebook treats as (error, std).
        error, std = clasificador.validacion(particion, datos_wdbc, seed)
        fila.append('{:.6f} +/- {:.6f}'.format(error, std))
    results_wdbc.append(fila)

In [35]:
# Tabulate the WDBC results (rows indexed by K) and attach a caption.
# Note that df_wdbc ends up bound to the styled object, not the raw DataFrame.
df_wdbc = (
    pd.DataFrame(
        results_wdbc,
        index=pd.Index(ks, name='K'),
        columns=['Error No Normalizado', 'Error Si Normalizado'],
    )
    .style
    .set_caption('WDBC')
)

In [36]:
# Render both result tables, Pima first and WDBC second.
display(df_pima, df_wdbc)

Unnamed: 0_level_0,Error No Normalizado,Error Si Normalizado
K,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.324251 +/- 0.022724,0.289127 +/- 0.035783
4,0.287811 +/- 0.035621,0.270775 +/- 0.029587
7,0.279951 +/- 0.019289,0.256515 +/- 0.027456
10,0.261710 +/- 0.031071,0.259095 +/- 0.018736
13,0.272142 +/- 0.027723,0.252610 +/- 0.017200
16,0.256523 +/- 0.012299,0.246100 +/- 0.015686
19,0.257822 +/- 0.021327,0.242170 +/- 0.016367
22,0.248706 +/- 0.026827,0.252585 +/- 0.012184
25,0.246091 +/- 0.016074,0.247356 +/- 0.020891
28,0.263017 +/- 0.010415,0.255174 +/- 0.018560


Unnamed: 0_level_0,Error No Normalizado,Error Si Normalizado
K,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.077333 +/- 0.011662,0.051001 +/- 0.013079
4,0.072085 +/- 0.023902,0.036904 +/- 0.010220
7,0.070346 +/- 0.023730,0.029871 +/- 0.013116
10,0.070331 +/- 0.014866,0.035134 +/- 0.020739
13,0.073871 +/- 0.019993,0.038659 +/- 0.018051
16,0.079134 +/- 0.017825,0.042152 +/- 0.016966
19,0.072116 +/- 0.019855,0.042152 +/- 0.016966
22,0.075609 +/- 0.017401,0.045661 +/- 0.016957
25,0.077379 +/- 0.019805,0.047415 +/- 0.016209
28,0.077364 +/- 0.018078,0.047415 +/- 0.016209


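La primera conclusión a la que se puede llegar observando ambos resultados es que la normalización de datos tiene un gran impacto (para mejor) en el rendimiento del clasificador, obteniendo casi siempre mejores resultados que sin normalizar los datos (entre los valores mostrados, las únicas excepciones son K=22 y K=25 en pima-indians-diabetes, donde la diferencia es menor que la desviación típica).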

En cuanto a la cantidad de vecinos K considerados, depende del problema cuál es el valor óptimo para clasificar. En el caso de pima-indians-diabetes, la cantidad óptima de vecinos parece estar entre 19 y 25. En cambio, para los datos de wdbc, la cantidad de vecinos óptima es mucho más pequeña, encontrándose alrededor de 7.

# Apartado 2

In [37]:
# Load the digits dataset used by the clustering experiments.
datos_nums = Datos('data/nums.csv')

# Project K-Means implementation configured with 10 clusters.
kmeans = KMeans(k=10)

# Result of clustering the dataset; build_confusion_matrix later indexes it by
# sample position and uses each entry as a cluster row index.
clusters = kmeans.cluster(datos_nums)

In [38]:
def build_confusion_matrix(n_clusters, clusters, n_digits=10):
    """Cross-tabulate cluster assignments against digit labels.

    The digit of each sample is not read from the data; it is derived from the
    sample's position, assuming the samples are stored in a repeating
    ``0, 1, ..., n_digits - 1`` order (sample ``i`` is digit ``i % n_digits``).
    Every entry of ``clusters`` is counted, so the function no longer depends
    on the dataset containing exactly 48 repetitions of the 10 digits.

    Parameters
    ----------
    n_clusters : int
        Number of clusters, i.e. number of rows of the resulting table.
    clusters : sequence of int
        Cluster index assigned to each sample, in dataset order.
    n_digits : int, optional
        Number of distinct digits in the repeating pattern (default 10).

    Returns
    -------
    pandas.DataFrame
        Table of shape ``(n_clusters, n_digits)`` whose cell ``[c, d]`` is the
        number of samples of digit ``d`` assigned to cluster ``c``.

    Raises
    ------
    IndexError
        If a cluster index in ``clusters`` is not smaller than ``n_clusters``.
    """
    confusion_matrix = np.zeros([n_clusters, n_digits])
    for position, cluster_id in enumerate(clusters):
        # The position within each run of n_digits samples is the digit label.
        confusion_matrix[int(cluster_id)][position % n_digits] += 1

    df_confusion_matrix = pd.DataFrame(confusion_matrix,
                                  index=[f'Cluster {i}' for i in range(n_clusters)],
                                  columns=[f'Dígito {i}' for i in range(n_digits)])
    return df_confusion_matrix

        
# Confusion table for the 10-cluster run; the bare expression on the last line
# makes the notebook render it as the cell output.
confusion_10 = build_confusion_matrix(10, clusters)
confusion_10

Unnamed: 0,Dígito 0,Dígito 1,Dígito 2,Dígito 3,Dígito 4,Dígito 5,Dígito 6,Dígito 7,Dígito 8,Dígito 9
Cluster 0,26.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
Cluster 1,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cluster 2,0.0,4.0,0.0,1.0,4.0,1.0,0.0,2.0,1.0,31.0
Cluster 3,0.0,3.0,2.0,40.0,0.0,29.0,2.0,0.0,1.0,0.0
Cluster 4,3.0,1.0,8.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
Cluster 5,0.0,0.0,2.0,0.0,0.0,1.0,0.0,2.0,21.0,0.0
Cluster 6,0.0,0.0,0.0,0.0,38.0,3.0,0.0,0.0,0.0,11.0
Cluster 7,1.0,39.0,11.0,5.0,2.0,10.0,14.0,6.0,5.0,3.0
Cluster 8,9.0,0.0,2.0,0.0,4.0,4.0,31.0,0.0,9.0,0.0
Cluster 9,0.0,1.0,23.0,2.0,0.0,0.0,0.0,38.0,6.0,2.0


En general, no se puede asignar cada cluster a un dígito en concreto. En algunas ocasiones, como por ejemplo el Cluster 0, este sí que contiene casi únicamente datos del dígito 0. Lo mismo sucede con el Cluster 5 y el dígito 8. Aun así, otros clusters como el Cluster 3 y el Cluster 9 contienen muchos datos de varios dígitos distintos.

Los dígitos más fáciles de identificar parecen ser los dígitos 1, 3, 4 y 7, puesto que son aquellos que tienen las cuentas más altas en un único cluster. Es decir, la gran mayoría de sus datos pertenecen al mismo cluster. Aun así, eso no indica que el cluster que contiene todos esos datos sea equivalente al dígito. El caso más claro para ver esto es el Cluster 3 con el dígito 3: 40 de los 48 datos del dígito se encuentran en ese cluster, pero en ese mismo cluster hay además 29 datos del dígito 5. Por lo tanto, se podría incluso decir que el dígito 3 y el dígito 5 se confunden entre ellos.

Es difícil sacar más conclusiones a partir de los datos, ya que no se trata de una matriz de confusión real, sino que un mismo dígito puede estar repartido en varios clusters.

# Apartado 3

In [39]:
# Repeat the clustering for 11..20 clusters and keep one confusion table per
# run. Loop-local names are used on purpose: the original loop overwrote the
# notebook-level `kmeans` and `clusters` from the 10-cluster experiment, so
# re-running that earlier cell afterwards would have used the wrong labels.
confusion_matrices = []

for n_clusters in range(11, 21):
    clusters_k = KMeans(k=n_clusters).cluster(datos_nums)
    confusion_matrices.append(build_confusion_matrix(n_clusters, clusters_k))

In [40]:
# Render every confusion table in the order in which it was computed.
for matriz in confusion_matrices:
    display(matriz)

Unnamed: 0,Dígito 0,Dígito 1,Dígito 2,Dígito 3,Dígito 4,Dígito 5,Dígito 6,Dígito 7,Dígito 8,Dígito 9
Cluster 0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cluster 1,4.0,1.0,8.0,4.0,2.0,8.0,1.0,13.0,7.0,4.0
Cluster 2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,2.0
Cluster 3,0.0,26.0,7.0,1.0,0.0,3.0,13.0,2.0,3.0,0.0
Cluster 4,0.0,1.0,29.0,2.0,0.0,0.0,0.0,1.0,29.0,0.0
Cluster 5,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,22.0
Cluster 6,0.0,0.0,3.0,4.0,0.0,1.0,2.0,0.0,5.0,0.0
Cluster 7,0.0,3.0,0.0,37.0,0.0,23.0,0.0,0.0,0.0,0.0
Cluster 8,12.0,0.0,1.0,0.0,0.0,9.0,31.0,0.0,1.0,0.0
Cluster 9,1.0,0.0,0.0,0.0,34.0,3.0,0.0,0.0,1.0,10.0


Unnamed: 0,Dígito 0,Dígito 1,Dígito 2,Dígito 3,Dígito 4,Dígito 5,Dígito 6,Dígito 7,Dígito 8,Dígito 9
Cluster 0,1.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,17.0,0.0
Cluster 1,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cluster 2,0.0,5.0,0.0,0.0,3.0,0.0,0.0,1.0,1.0,29.0
Cluster 3,0.0,4.0,28.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
Cluster 4,1.0,0.0,6.0,0.0,0.0,0.0,0.0,10.0,6.0,1.0
Cluster 5,0.0,0.0,0.0,0.0,0.0,1.0,30.0,0.0,0.0,0.0
Cluster 6,0.0,3.0,0.0,42.0,0.0,28.0,0.0,0.0,2.0,0.0
Cluster 7,25.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,9.0,1.0
Cluster 8,0.0,0.0,0.0,1.0,33.0,2.0,0.0,0.0,0.0,11.0
Cluster 9,0.0,22.0,5.0,2.0,1.0,7.0,7.0,2.0,4.0,2.0


Unnamed: 0,Dígito 0,Dígito 1,Dígito 2,Dígito 3,Dígito 4,Dígito 5,Dígito 6,Dígito 7,Dígito 8,Dígito 9
Cluster 0,0.0,4.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cluster 1,0.0,32.0,4.0,3.0,4.0,4.0,6.0,2.0,0.0,1.0
Cluster 2,0.0,0.0,3.0,7.0,0.0,2.0,2.0,0.0,4.0,1.0
Cluster 3,1.0,0.0,4.0,1.0,0.0,1.0,0.0,0.0,27.0,0.0
Cluster 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,0.0,0.0
Cluster 5,1.0,0.0,0.0,0.0,2.0,10.0,31.0,0.0,1.0,0.0
Cluster 6,0.0,3.0,0.0,33.0,0.0,23.0,0.0,0.0,1.0,0.0
Cluster 7,0.0,7.0,0.0,0.0,2.0,0.0,0.0,1.0,2.0,27.0
Cluster 8,1.0,0.0,2.0,4.0,2.0,3.0,0.0,20.0,8.0,3.0
Cluster 9,0.0,0.0,0.0,0.0,37.0,2.0,0.0,1.0,0.0,15.0


Unnamed: 0,Dígito 0,Dígito 1,Dígito 2,Dígito 3,Dígito 4,Dígito 5,Dígito 6,Dígito 7,Dígito 8,Dígito 9
Cluster 0,1.0,3.0,12.0,1.0,1.0,3.0,5.0,6.0,7.0,2.0
Cluster 1,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cluster 2,0.0,4.0,23.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0
Cluster 3,0.0,36.0,5.0,3.0,2.0,6.0,5.0,2.0,1.0,3.0
Cluster 4,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cluster 5,0.0,0.0,6.0,0.0,0.0,1.0,0.0,1.0,15.0,0.0
Cluster 6,0.0,0.0,0.0,0.0,4.0,2.0,21.0,0.0,0.0,0.0
Cluster 7,0.0,2.0,0.0,0.0,8.0,0.0,0.0,2.0,1.0,30.0
Cluster 8,0.0,0.0,0.0,0.0,33.0,1.0,0.0,0.0,0.0,9.0
Cluster 9,0.0,1.0,0.0,31.0,0.0,17.0,0.0,0.0,1.0,0.0


Unnamed: 0,Dígito 0,Dígito 1,Dígito 2,Dígito 3,Dígito 4,Dígito 5,Dígito 6,Dígito 7,Dígito 8,Dígito 9
Cluster 0,26.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
Cluster 1,0.0,25.0,1.0,4.0,0.0,5.0,0.0,4.0,4.0,2.0
Cluster 2,0.0,1.0,0.0,10.0,0.0,8.0,2.0,0.0,2.0,0.0
Cluster 3,1.0,0.0,2.0,1.0,0.0,14.0,0.0,0.0,16.0,0.0
Cluster 4,0.0,0.0,0.0,0.0,34.0,1.0,0.0,0.0,0.0,6.0
Cluster 5,1.0,0.0,3.0,2.0,0.0,5.0,1.0,0.0,12.0,0.0
Cluster 6,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cluster 7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,22.0
Cluster 8,0.0,0.0,0.0,28.0,0.0,10.0,0.0,0.0,0.0,0.0
Cluster 9,3.0,7.0,6.0,1.0,1.0,5.0,14.0,1.0,7.0,1.0


Unnamed: 0,Dígito 0,Dígito 1,Dígito 2,Dígito 3,Dígito 4,Dígito 5,Dígito 6,Dígito 7,Dígito 8,Dígito 9
Cluster 0,0.0,14.0,6.0,2.0,0.0,2.0,5.0,5.0,4.0,1.0
Cluster 1,0.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0,10.0,2.0
Cluster 2,0.0,4.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cluster 3,1.0,0.0,2.0,4.0,1.0,6.0,0.0,18.0,7.0,1.0
Cluster 4,0.0,2.0,0.0,38.0,0.0,12.0,0.0,0.0,0.0,1.0
Cluster 5,0.0,0.0,0.0,0.0,15.0,1.0,0.0,1.0,1.0,10.0
Cluster 6,8.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,1.0,0.0
Cluster 7,0.0,1.0,0.0,1.0,0.0,18.0,7.0,0.0,3.0,0.0
Cluster 8,0.0,21.0,4.0,0.0,1.0,2.0,6.0,1.0,0.0,2.0
Cluster 9,23.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,1.0


Unnamed: 0,Dígito 0,Dígito 1,Dígito 2,Dígito 3,Dígito 4,Dígito 5,Dígito 6,Dígito 7,Dígito 8,Dígito 9
Cluster 0,0.0,4.0,17.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
Cluster 1,0.0,0.0,19.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
Cluster 2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,0.0,3.0
Cluster 3,0.0,3.0,0.0,0.0,6.0,2.0,0.0,2.0,1.0,18.0
Cluster 4,5.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,13.0,3.0
Cluster 5,0.0,0.0,0.0,0.0,0.0,1.0,25.0,0.0,0.0,0.0
Cluster 6,23.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
Cluster 7,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cluster 8,0.0,23.0,1.0,2.0,0.0,3.0,0.0,0.0,3.0,1.0
Cluster 9,0.0,0.0,0.0,29.0,0.0,13.0,0.0,0.0,0.0,0.0


Unnamed: 0,Dígito 0,Dígito 1,Dígito 2,Dígito 3,Dígito 4,Dígito 5,Dígito 6,Dígito 7,Dígito 8,Dígito 9
Cluster 0,0.0,0.0,0.0,0.0,4.0,6.0,11.0,0.0,2.0,0.0
Cluster 1,0.0,3.0,0.0,0.0,2.0,1.0,0.0,4.0,7.0,10.0
Cluster 2,0.0,15.0,5.0,1.0,0.0,2.0,4.0,3.0,2.0,1.0
Cluster 3,2.0,0.0,2.0,0.0,1.0,3.0,0.0,12.0,6.0,1.0
Cluster 4,0.0,1.0,29.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
Cluster 5,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cluster 6,0.0,0.0,0.0,14.0,0.0,14.0,0.0,0.0,7.0,1.0
Cluster 7,0.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,2.0,0.0
Cluster 8,0.0,3.0,0.0,28.0,0.0,8.0,0.0,0.0,1.0,0.0
Cluster 9,0.0,0.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0,9.0


Unnamed: 0,Dígito 0,Dígito 1,Dígito 2,Dígito 3,Dígito 4,Dígito 5,Dígito 6,Dígito 7,Dígito 8,Dígito 9
Cluster 0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,0.0,8.0,0.0
Cluster 1,0.0,2.0,0.0,0.0,3.0,1.0,0.0,0.0,1.0,26.0
Cluster 2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,12.0,3.0,2.0
Cluster 3,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cluster 4,0.0,0.0,0.0,12.0,0.0,3.0,0.0,0.0,1.0,2.0
Cluster 5,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
Cluster 6,1.0,1.0,8.0,0.0,0.0,1.0,0.0,0.0,20.0,0.0
Cluster 7,0.0,1.0,0.0,25.0,0.0,25.0,0.0,0.0,7.0,0.0
Cluster 8,0.0,0.0,0.0,0.0,0.0,3.0,14.0,0.0,0.0,0.0
Cluster 9,26.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0


Unnamed: 0,Dígito 0,Dígito 1,Dígito 2,Dígito 3,Dígito 4,Dígito 5,Dígito 6,Dígito 7,Dígito 8,Dígito 9
Cluster 0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,22.0,4.0,1.0
Cluster 1,0.0,1.0,9.0,0.0,0.0,2.0,8.0,0.0,2.0,0.0
Cluster 2,1.0,0.0,0.0,7.0,0.0,14.0,1.0,0.0,7.0,0.0
Cluster 3,0.0,0.0,0.0,0.0,7.0,1.0,0.0,1.0,1.0,16.0
Cluster 4,0.0,14.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
Cluster 5,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,18.0
Cluster 6,0.0,4.0,15.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0
Cluster 7,0.0,0.0,0.0,0.0,4.0,0.0,20.0,0.0,2.0,0.0
Cluster 8,0.0,0.0,0.0,0.0,29.0,1.0,0.0,0.0,0.0,5.0
Cluster 9,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0


Aunque sean demasiados datos para analizar, sí que podemos observar algunas tendencias al aumentar la cantidad de clusters disponibles. Estas se pueden ver claramente si nos fijamos en el dígito 3 y el 5. Originalmente, cuando utilizábamos 10 clusters, eran los dígitos que más se confundían entre ellos (en el Cluster 3 en concreto). Al aumentar la cantidad de clusters, parece que se dejan de confundir tanto estos dos dígitos, pero en realidad lo que está sucediendo es que se siguen confundiendo más o menos de la misma forma, solo que sus datos están ahora distribuidos en varios clusters. Por lo tanto, se confunden en varios clusters a la vez. Esto se ve más claramente en la última tabla. Los datos de ambos dígitos pasan ahora a confundirse sobre todo en los Clusters 2, 10 y 14 (aunque hay algunos otros clusters en los que también se confunde una cantidad pequeña de datos).

En cambio, hay otros dígitos que sí se confunden menos. Por ejemplo, en la última tabla se puede ver cómo, aunque el dígito 0 pertenece a varios clusters (11, 15, 18), en los dos primeros el dígito 0 es el único dígito identificado. El dígito 7 también se confunde menos en sus clusters mayoritarios.

En general parece que aumentar la cantidad de clusters mejora la identificación en algunos casos, y la deja igual en otros. Por lo tanto el impacto parece ser positivo. Aunque, de nuevo, es algo complicado sacar conclusiones de estos resultados. 

# Apartado 4

In [41]:
from sklearn import neighbors
from sklearn import cluster
from sklearn import model_selection

# Convert each dataset to a NumPy array once and split it into attributes
# (every column but the last) and the last column, used as the target.
pima_np = datos_pima.datos.to_numpy()
wdbc_np = datos_wdbc.datos.to_numpy()
nums_np = datos_nums.datos.to_numpy()

x_pima, y_pima = pima_np[:, :-1], pima_np[:, -1]
x_wdbc, y_wdbc = wdbc_np[:, :-1], wdbc_np[:, -1]
x_nums, y_nums = nums_np[:, :-1], nums_np[:, -1]

# Seed the shuffle so the scikit-learn experiments are reproducible and every
# (K, normalisation) combination is evaluated on the same five folds.
particion_sk = model_selection.KFold(n_splits=5, shuffle=True, random_state=seed)

In [42]:
# scikit-learn KNN on Pima: one row per K, columns = without / with
# normalisation, each cell formatted as "mean error +/- std of fold scores".
results_pima_sk = []

for k in ks:
    fila = []
    for normalizar in (False, True):
        aciertos = []
        for train_index, test_index in particion_sk.split(x_pima):
            # Work on copies so the in-place scaling below never touches the
            # original arrays between folds.
            x, y = x_pima.copy(), y_pima.copy()

            if normalizar:
                # Statistics come from the training rows only and are then
                # applied to every row. The helper is fed an array with one
                # extra zero column appended after the attributes.
                n_atributos = x.shape[1]
                temp_x = np.zeros([len(train_index), n_atributos + 1])
                temp_x[:, :n_atributos] = x[train_index]
                medias, stds = Datos.calcularMediasDesv(temp_x)

                for col in range(n_atributos):
                    x[:, col] -= medias[col]
                    x[:, col] /= stds[col]

            modelo = neighbors.KNeighborsClassifier(n_neighbors=k)
            modelo.fit(x[train_index], y[train_index])
            aciertos.append(modelo.score(x[test_index], y[test_index]))

        error_medio = 1 - np.mean(aciertos)
        fila.append('{:.6f} +/- {:.6f}'.format(error_medio, np.std(aciertos)))

    results_pima_sk.append(fila)

In [43]:
# Tabulate the scikit-learn Pima results (rows indexed by K) with a caption.
df_pima_sk = (
    pd.DataFrame(
        results_pima_sk,
        index=pd.Index(ks, name='K'),
        columns=['Error No Normalizado', 'Error Si Normalizado'],
    )
    .style
    .set_caption('Pima-Indian-Diabetes')
)

In [44]:
# scikit-learn KNN on WDBC: one row per K, columns = without / with
# normalisation, each cell formatted as "mean error +/- std of fold scores".
results_wdbc_sk = []

for k in ks:
    fila = []
    for normalizar in (False, True):
        aciertos = []
        for train_index, test_index in particion_sk.split(x_wdbc):
            # Work on copies so the in-place scaling below never touches the
            # original arrays between folds.
            x, y = x_wdbc.copy(), y_wdbc.copy()

            if normalizar:
                # Statistics come from the training rows only and are then
                # applied to every row. The helper is fed an array with one
                # extra zero column appended after the attributes.
                n_atributos = x.shape[1]
                temp_x = np.zeros([len(train_index), n_atributos + 1])
                temp_x[:, :n_atributos] = x[train_index]
                medias, stds = Datos.calcularMediasDesv(temp_x)

                for col in range(n_atributos):
                    x[:, col] -= medias[col]
                    x[:, col] /= stds[col]

            modelo = neighbors.KNeighborsClassifier(n_neighbors=k)
            modelo.fit(x[train_index], y[train_index])
            aciertos.append(modelo.score(x[test_index], y[test_index]))

        error_medio = 1 - np.mean(aciertos)
        fila.append('{:.6f} +/- {:.6f}'.format(error_medio, np.std(aciertos)))

    results_wdbc_sk.append(fila)

In [45]:
# Tabulate the scikit-learn WDBC results (rows indexed by K) with a caption.
df_wdbc_sk = (
    pd.DataFrame(
        results_wdbc_sk,
        index=pd.Index(ks, name='K'),
        columns=['Error No Normalizado', 'Error Si Normalizado'],
    )
    .style
    .set_caption('WDBC')
)

In [46]:
# Render both scikit-learn result tables, Pima first and WDBC second.
display(df_pima_sk, df_wdbc_sk)

Unnamed: 0_level_0,Error No Normalizado,Error Si Normalizado
K,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.342458 +/- 0.055494,0.295535 +/- 0.029793
4,0.287692 +/- 0.035971,0.275944 +/- 0.045519
7,0.286444 +/- 0.040987,0.259002 +/- 0.036858
10,0.269578 +/- 0.043166,0.268203 +/- 0.015674
13,0.263068 +/- 0.033138,0.248714 +/- 0.035013
16,0.246125 +/- 0.035298,0.257796 +/- 0.018016
19,0.256498 +/- 0.035598,0.248621 +/- 0.025449
22,0.248799 +/- 0.048004,0.255140 +/- 0.031768
25,0.264341 +/- 0.024714,0.248655 +/- 0.027420
28,0.256464 +/- 0.031075,0.242271 +/- 0.035834


Unnamed: 0_level_0,Error No Normalizado,Error Si Normalizado
K,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.080904 +/- 0.018268,0.050955 +/- 0.020282
4,0.077348 +/- 0.014154,0.036904 +/- 0.008583
7,0.071976 +/- 0.025582,0.038659 +/- 0.016257
10,0.068592 +/- 0.021993,0.036920 +/- 0.017042
13,0.068514 +/- 0.016924,0.033411 +/- 0.017058
16,0.070253 +/- 0.013439,0.047431 +/- 0.029120
19,0.070346 +/- 0.017801,0.043937 +/- 0.014679
22,0.073777 +/- 0.017955,0.047415 +/- 0.021152
25,0.075609 +/- 0.022765,0.047462 +/- 0.015328
28,0.080810 +/- 0.031998,0.047446 +/- 0.008916


Comparando los resultados de scikit-learn con los nuestros, podemos afirmar que son bastante similares. Aunque los valores de error no sean exactamente iguales, el comportamiento sí lo es. La normalización de los datos tiene un impacto positivo en el rendimiento. Y además podemos observar también que los mínimos de error se encuentran alrededor de los mismos valores de K que en nuestra implementación.

In [47]:
# scikit-learn K-Means with 10 clusters, random initialisation and the shared
# seed; fit() returns the fitted estimator, so it is chained onto construction.
kmeans_sk = cluster.KMeans(
    n_clusters=10,
    init='random',
    copy_x=True,
    random_state=seed,
).fit(x_nums)
clusters_sk = kmeans_sk.labels_

In [48]:
# Confusion table for the scikit-learn labels, built with the same helper as
# the project implementation so that both tables are directly comparable.
confusion_matrix_sk = build_confusion_matrix(10, clusters_sk)
display(confusion_matrix_sk)

Unnamed: 0,Dígito 0,Dígito 1,Dígito 2,Dígito 3,Dígito 4,Dígito 5,Dígito 6,Dígito 7,Dígito 8,Dígito 9
Cluster 0,1.0,1.0,8.0,0.0,0.0,1.0,0.0,3.0,25.0,0.0
Cluster 1,0.0,37.0,5.0,5.0,1.0,9.0,11.0,3.0,6.0,5.0
Cluster 2,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cluster 3,0.0,0.0,0.0,0.0,4.0,9.0,25.0,0.0,2.0,0.0
Cluster 4,2.0,0.0,4.0,1.0,1.0,0.0,0.0,38.0,5.0,3.0
Cluster 5,25.0,0.0,0.0,0.0,0.0,1.0,12.0,0.0,6.0,0.0
Cluster 6,0.0,4.0,31.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
Cluster 7,0.0,3.0,0.0,2.0,4.0,1.0,0.0,2.0,2.0,29.0
Cluster 8,0.0,0.0,0.0,0.0,38.0,3.0,0.0,0.0,0.0,11.0
Cluster 9,0.0,3.0,0.0,40.0,0.0,24.0,0.0,0.0,1.0,0.0


De nuevo, los resultados de la implementación de scikit-learn para KMeans son muy similares a los obtenidos con nuestra implementación. El comportamiento de la tabla sigue las mismas tendencias. Hay dígitos que se identifican de forma muy clara (el dígito 7 se identifica claramente con el Cluster 4), hay otros dígitos como el 3 y el 5 que se confunden en muchos de los casos. Y por norma general tampoco se puede asignar cada cluster a un dígito en concreto (solo en algunos casos como el comentado del Cluster 4).