In [2]:
import math
import os

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

import utilitarios as utils # Funciones propias

### Semilla

In [4]:
# Seed forwarded to the k-fold splitter and to utils.get_bootstrap_subsets
# by test_k_fold / test_bootstrap (presumably to make the splits repeatable).
random_seed = 2022 # Number of life :)

### K-fold Cross Validation

In [5]:
def test_k_fold(min_cuts , max_cuts, max_folds, neighbors, k_fold_func,
                src_dir="Data_preprocesada/", seed=None):
    """
    Run the k-fold cross-validation experiments for a KNN classifier.

    For every number of cuts in [min_cuts, max_cuts], every fold count in
    [2, max_folds] and every entry of ``neighbors``, a KNeighborsClassifier is
    fitted once per fold and its train/test error (1 - accuracy, rounded to
    6 decimals) is collected. Each combination produces one result row.

    :param min_cuts: Minimum number of cuts applied to the feature vector.
    :param max_cuts: Maximum number of cuts applied to the feature vector (inclusive).
    :param max_folds: Maximum number of folds (inclusive); evaluation starts at 2 folds.
    :param neighbors: Iterable with the numbers of neighbors to evaluate.
    :param k_fold_func: Callable invoked as ``k_fold_func(X, y, number_of_folds, seed)``.
        Its result is indexed by fold number, and every fold is indexed by the
        keys 'X_train', 'y_train', 'X_test' and 'y_test'.
    :param src_dir: Directory forwarded to ``utils.get_data``.
    :param seed: Seed forwarded to ``k_fold_func``. ``None`` (default) falls back
        to the notebook-level ``random_seed``.
    :return: pandas DataFrame with the experimental results.
    """
    if seed is None:
        # Backward-compatible default: keep using the notebook-wide seed.
        seed = random_seed

    def _summarize(fold_errors):
        # Mean error, population variance of the per-fold errors, and the
        # "bias" column defined by this notebook as sqrt(mean - variance).
        mean_error = sum(fold_errors) / len(fold_errors)
        variance = sum((x - mean_error) ** 2 for x in fold_errors) / len(fold_errors)
        bias = math.sqrt(mean_error - variance)
        return [mean_error, variance, bias, fold_errors]

    # The feature matrix returned here is not used: it is rebuilt by
    # utils.iterate_data at the start of every pass of the loop below.
    _, y, X_raw = utils.get_data(src_dir=src_dir, iterations=min_cuts - 1)

    columns = ['Number of cuts', 'Length of X', 'K fold', 'Neighbors',
                'Train error', 'Train var. error', 'Train bias error', 'Train list errors',
                'Test error', 'Test var. error', 'Test bias error', 'Test list errors',
                ]
    rows = []

    for iteration in range(min_cuts, max_cuts + 1):
        X, X_raw = utils.iterate_data(X_raw)
        X = utils.normalization(X)
        X_length = len(X[0])

        for number_of_folds in range(2, max_folds + 1):
            k_folds_data = k_fold_func(X, y, number_of_folds, seed)

            for neighbor in neighbors:
                clf = KNeighborsClassifier(n_neighbors=neighbor)

                list_of_errors_train = []
                list_of_errors_test = []

                for i in range(number_of_folds):
                    fold = k_folds_data[i]
                    clf.fit(fold['X_train'], fold['y_train'])

                    # score() is the mean accuracy, so 1 - score is the error rate.
                    error_i_train = 1 - clf.score(fold['X_train'], fold['y_train'])
                    error_i_test = 1 - clf.score(fold['X_test'], fold['y_test'])

                    list_of_errors_train.append(round(error_i_train, 6))
                    list_of_errors_test.append(round(error_i_test, 6))

                # Row layout must match `columns`: identifiers, train stats, test stats.
                rows.append(
                    [iteration, X_length, number_of_folds, neighbor]
                    + _summarize(list_of_errors_train)
                    + _summarize(list_of_errors_test)
                )

    return pd.DataFrame(data=rows, columns=columns)

### Bootstrap

In [6]:
def test_bootstrap(min_cuts, max_cuts, max_subsets, training_sample, neighbors,
                   src_dir="Data_preprocesada/", seed=None):
    """
    Run the bootstrap-validation experiments for a KNN classifier.

    For every number of cuts in [min_cuts, max_cuts], every subset count in
    [1, max_subsets] and every entry of ``neighbors``, a KNeighborsClassifier is
    fitted once per bootstrap subset and its train/test error (1 - accuracy,
    rounded to 6 decimals) is collected. Each combination produces one result row.

    :param min_cuts: Minimum number of cuts applied to the feature vector.
    :param max_cuts: Maximum number of cuts applied to the feature vector (inclusive).
    :param max_subsets: Maximum number of bootstrap subsets to evaluate (inclusive).
    :param training_sample: Fraction of the data to be used for training;
        forwarded unchanged to ``utils.get_bootstrap_subsets``.
    :param neighbors: Iterable with the numbers of neighbors to evaluate.
    :param src_dir: Directory forwarded to ``utils.get_data``.
    :param seed: Seed forwarded to ``utils.get_bootstrap_subsets``. ``None``
        (default) falls back to the notebook-level ``random_seed``.
    :return: pandas DataFrame with the experimental results.
    """
    if seed is None:
        # Backward-compatible default: keep using the notebook-wide seed.
        seed = random_seed

    def _summarize(subset_errors):
        # Mean error, population variance of the per-subset errors, and the
        # "bias" column defined by this notebook as sqrt(mean - variance).
        mean_error = sum(subset_errors) / len(subset_errors)
        variance = sum((x - mean_error) ** 2 for x in subset_errors) / len(subset_errors)
        bias = math.sqrt(mean_error - variance)
        return [mean_error, variance, bias, subset_errors]

    # The feature matrix returned here is not used: it is rebuilt by
    # utils.iterate_data at the start of every pass of the loop below.
    _, y, X_raw = utils.get_data(src_dir=src_dir, iterations=min_cuts - 1)

    columns = ['Number of cuts', 'Length of X', 'K subsets', 'Neighbors',
                'Train error', 'Train var. error', 'Train bias error', 'Train list errors',
                'Test error', 'Test var. error', 'Test bias error', 'Test list errors',
                ]
    rows = []

    for iteration in range(min_cuts, max_cuts + 1):
        X, X_raw = utils.iterate_data(X_raw)
        X = utils.normalization(X)
        X_length = len(X[0])

        for number_of_subsets in range(1, max_subsets + 1):
            k_subsets_data = utils.get_bootstrap_subsets(X, y, number_of_subsets, training_sample, seed)

            for neighbor in neighbors:
                clf = KNeighborsClassifier(n_neighbors=neighbor)

                list_of_errors_train = []
                list_of_errors_test = []

                for i in range(number_of_subsets):
                    subset = k_subsets_data[i]
                    clf.fit(subset['X_train'], subset['y_train'])

                    # score() is the mean accuracy, so 1 - score is the error rate.
                    error_i_train = 1 - clf.score(subset['X_train'], subset['y_train'])
                    error_i_test = 1 - clf.score(subset['X_test'], subset['y_test'])

                    list_of_errors_train.append(round(error_i_train, 6))
                    list_of_errors_test.append(round(error_i_test, 6))

                # Row layout must match `columns`: identifiers, train stats, test stats.
                rows.append(
                    [iteration, X_length, number_of_subsets, neighbor]
                    + _summarize(list_of_errors_train)
                    + _summarize(list_of_errors_test)
                )

    return pd.DataFrame(data=rows, columns=columns)

In [7]:
# df = test_k_fold(min_cuts=7, max_cuts=7, max_folds=3, neighbors=[1, 3], k_fold_func=utils.get_stratified_k_fold_cross_validation)
# df

# test_bootstrap(3, 3, 3, 0.7, [1, 3])

In [8]:
# Experiment grid for the stratified k-fold run.
min_cuts, max_cuts = 3, 8
max_folds = 8
neighbors = [1, 3, 5, 20, 50, 100, 200]
k_fold_func = utils.get_stratified_k_fold_cross_validation

resultados_test_stratified_k_fold = test_k_fold(
    min_cuts=min_cuts,
    max_cuts=max_cuts,
    max_folds=max_folds,
    neighbors=neighbors,
    k_fold_func=k_fold_func,
)

In [9]:
# Experiment grid for the non-stratified k-fold run (same grid, different splitter).
min_cuts, max_cuts = 3, 8
max_folds = 8
neighbors = [1, 3, 5, 20, 50, 100, 200]
k_fold_func = utils.get_non_stratified_k_fold_cross_validation

resultados_test_non_stratified_k_fold = test_k_fold(
    min_cuts=min_cuts,
    max_cuts=max_cuts,
    max_folds=max_folds,
    neighbors=neighbors,
    k_fold_func=k_fold_func,
)

In [10]:
# Experiment grid for the bootstrap-validation run.
min_cuts, max_cuts = 3, 8
max_subsets = 6
training_sample = 0.7
neighbors = [1, 3, 5, 20, 50, 100, 200]

resultados_test_bootstrap = test_bootstrap(
    min_cuts=min_cuts,
    max_cuts=max_cuts,
    max_subsets=max_subsets,
    training_sample=training_sample,
    neighbors=neighbors,
)

In [13]:
# Persist each experiment table as a ';'-separated CSV file under output_path.
output_path = 'Resultados/knn/'

# DataFrame.to_csv does not create missing folders; make sure the target
# directory exists so the (long-running) results above are not lost to an OSError.
os.makedirs(output_path, exist_ok=True)

resultados_test_stratified_k_fold.to_csv(output_path + 'stratified_k_fold.csv', sep=';')
resultados_test_non_stratified_k_fold.to_csv(output_path + 'non_stratified_k_fold.csv', sep=';')
resultados_test_bootstrap.to_csv(output_path + 'bootstrap.csv', sep=';')