# EXPERIMENTO 1: kNN + PCA vs. k y α

In [None]:
# Importamos los módulos necesarios
import numpy as np
import plotly
from plotly.graph_objs import Scatter, Layout, Bar, Figure
plotly.offline.init_notebook_mode()
from matplotlib import pyplot
import sklearn as skl
import sklearn.pipeline as sklpipilne
import sklearn.model_selection as sklms
import sklearn.decomposition as skldeco
import sklearn.neighbors as skln
import joblib as jl

import os
import sys
# Extend the module search path with the relative install/lib directory
# (presumably where the project modules imported afterwards live — confirm).
# os.path.join avoids the invalid "\i" / "\l" escape sequences of the old
# literal and yields the right separator on every platform.
sys.path.append(os.path.join('..', 'install', 'lib'))
import mnpy as mn

import reader
import metnum

Cargamos los datos en un data frame:

In [None]:
# Load the compressed training set into a data frame.
df = reader.read7Zip('../data/train.7z')
# Every column after the first holds the 28*28 pixels of each image.
images = df.iloc[:, 1:].to_numpy()
# The first column (named 'label') is the digit each image represents.
targets = df['label'].to_numpy()

Experimentamos variando los parámetros de KNN y PCA:

In [None]:
# Only the first 10000 samples take part in the search.
training_dataset = images[:10000]
training_targets = targets[:10000]

# Two-step pipeline built from the mnpy implementations: PCA, then kNN.
# (skln.KNeighborsClassifier() is the scikit-learn alternative for the
# "knn" step.)
estimators = [
    ("pca", mn.PCA()),
    ("knn", mn.kNNClassifier()),
]
pipeline = sklpipilne.Pipeline(estimators)

# Values explored for each pipeline parameter ("<step>__<parameter>").
search_space = {
    # --- PCA ---
    # alpha. Original author's note: "we will select those that explain a
    # given % of the total variance".
    'pca__n_components': [16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128],
    # Power-method parameters.
    'pca__iterated_power': [96, 128, 160],
    'pca__tolerance_error': [0.0001, 0.000001],

    # --- kNN ---
    'knn__k_neighbors': [1, 2, 3, 4, 5, 6, 7, 8],
    'knn__distance_metric': [
        mn.DistanceMetric.Euclidean,
        mn.DistanceMetric.Manhattan,
        mn.DistanceMetric.Chebyshev,
    ],
    'knn__weights': [mn.Weights.Uniform, mn.Weights.Distance],
}
param_grid = [search_space]

# 20-fold cross-validation without shuffling
# (alternative that was tried: KFold(n_splits=10, shuffle=True)).
kfold = sklms.KFold(n_splits=20)

# Run the search over every combination in the grid.
grid_search = sklms.GridSearchCV(pipeline, param_grid, cv=kfold)
grid_search.fit(training_dataset, training_targets)

# Report the winning combination and the highest mean test score.
print(grid_search.best_params_)
results = grid_search.cv_results_['mean_test_score']
print(max(results))

Configuración concerniente a la figura:

In [1]:
def graph_config(legends, results):
    """Group scores into runs of consecutive entries sharing a neighbor count.

    Walks ``legends`` in order and starts a new group every time the value
    under ``'knn__k_neighbors'`` differs from the previous entry's value.
    Grouping is by *consecutive* runs only: if the same neighbor count
    reappears later, it opens a separate group.

    Args:
        legends: sequence of parameter dicts, each containing the key
            ``'knn__k_neighbors'``.
        results: indexable sequence of scores, aligned with ``legends``.

    Returns:
        A ``(graph_names, set_of_lists_with_results)`` tuple where
        ``graph_names[i]`` is the neighbor count of the i-th run and
        ``set_of_lists_with_results[i]`` is the list of scores in that run.
        Both lists are empty when ``legends`` is empty.

    Raises:
        IndexError: if ``results`` has fewer entries than ``legends``.
        KeyError: if an entry lacks ``'knn__k_neighbors'``.
    """
    graph_names = []
    set_of_lists_with_results = []
    for index, legend in enumerate(legends):
        n_neighbors = legend['knn__k_neighbors']
        # Open a new group on the first entry and on every change of value.
        if not graph_names or graph_names[-1] != n_neighbors:
            graph_names.append(n_neighbors)
            set_of_lists_with_results.append([])
        set_of_lists_with_results[-1].append(results[index])
    return graph_names, set_of_lists_with_results

def graph(N_COMPONENTS, graph_names, set_of_lists_with_results):
    """Draw one line per group of results and display the figure inline.

    Args:
        N_COMPONENTS: x values shared by every trace.
        graph_names: one label value per trace; each is shown in the legend
            as ``"cantidad de vecinos = <value>"``.
        set_of_lists_with_results: one sequence of y values per trace.

    Returns:
        None. The figure is rendered through ``plotly.offline.iplot``.

    Raises:
        IndexError: if ``graph_names`` is shorter than
            ``set_of_lists_with_results``.
    """
    traces = [
        Scatter(
            x=N_COMPONENTS,
            y=scores,
            name="cantidad de vecinos = " + str(graph_names[index]),
        )
        for index, scores in enumerate(set_of_lists_with_results)
    ]
    layout = Layout(
        xaxis=dict(
            # The x values are N_COMPONENTS; the neighbor count is already
            # carried by each trace's name, so the axis is labelled with the
            # number of components rather than the number of neighbors.
            title='Cantidad de componentes principales',
            type='log',
            autorange=True
        ),
        yaxis=dict(
            title='% Accuracy',
            type='log',
            autorange=True
        ),
        title="Medida de performance - Accuracy"
    )
    figure = Figure(data=traces, layout=layout)
    plotly.offline.iplot(figure)

Generación del gráfico:

In [None]:
# Fetch the per-candidate scores and parameter dicts from the finished search
# here, so this cell does not depend on a later cell having been run first.
results = grid_search.cv_results_['mean_test_score']
legends = grid_search.cv_results_['params']
# NOTE(review): N_COMPONENTS was not defined in any visible cell; it is
# assumed to be the list of PCA component counts explored by the grid —
# confirm.
N_COMPONENTS = param_grid[0]['pca__n_components']

# NOTE(review): graph_config groups consecutive candidates by
# 'knn__k_neighbors' only. Since the grid also varies other parameters, each
# group may contain more scores than there are N_COMPONENTS values — verify
# the grouping before relying on this plot.
graph_names, set_of_lists_with_results = graph_config(legends, results)
graph(N_COMPONENTS, graph_names, set_of_lists_with_results)

Guardamos los parámetros obtenidos para un posterior análisis:

In [None]:
# Per-candidate parameter dicts and mean test scores of the search.
cv_results = grid_search.cv_results_
legends = cv_results['params']
results = cv_results['mean_test_score']

# Parameters of the best candidate found.
best_params = grid_search.best_params_
best_n_components = best_params['pca__n_components']
best_k_neighbors = best_params['knn__k_neighbors']

# Scores grouped by runs of equal neighbor count.
graph_names, set_of_lists_with_results = graph_config(legends, results)

# Store the whole search object (compressed) for later analysis.
jl.dump(grid_search, 'search_best_params.pkl', compress=True)