In [4]:
import utilitarios as utils # Funciones propias
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import math

### Semilla

In [5]:
random_seed = 2022 # Seed passed to the k-fold / bootstrap split helpers (presumably so the splits are reproducible)

### K-fold Cross Validation

In [6]:
def test_k_fold(min_cuts, max_cuts, max_folds, neighbors, k_fold_func,
                src_dir="Data_preprocesada/", seed=None):
    """Evaluate k-NN classifiers under k-fold cross validation.

    For every combination of (number of cuts, number of folds, number of
    neighbours) a ``KNeighborsClassifier`` is fitted on each fold and its
    train/test error (``1 - clf.score(...)``) is recorded, rounded to six
    decimals.

    Parameters
    ----------
    min_cuts, max_cuts : int
        Inclusive range of "cut" iterations. ``utils.get_data`` is called with
        ``iterations=min_cuts - 1`` and ``utils.iterate_data`` is then applied
        once per value in the range (presumably each call adds one more cut
        level to the features -- confirm in ``utilitarios``).
    max_folds : int
        Largest number of folds to try; fold counts run from 2 to
        ``max_folds`` inclusive.
    neighbors : iterable of int
        Values used for ``n_neighbors``.
    k_fold_func : callable
        Called as ``k_fold_func(X, y, number_of_folds, seed)``. The result is
        indexed by fold number and every entry must provide the keys
        ``'X_train'``, ``'y_train'``, ``'X_test'`` and ``'y_test'``.
    src_dir : str, optional
        Directory forwarded to ``utils.get_data``. Defaults to the path that
        was previously hard-coded.
    seed : optional
        Seed forwarded to ``k_fold_func``. When ``None`` the notebook-level
        ``random_seed`` is used, which matches the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per (cuts, folds, neighbours) combination with the mean error,
        the population variance of the per-fold errors, the value
        ``sqrt(mean - variance)`` (stored in the "bias" columns) and the raw
        per-fold error list, for both train and test.
    """
    if seed is None:
        seed = random_seed

    def _summarize(errors):
        """Return [mean, variance, bias, errors] for a list of per-fold errors."""
        mean_error = sum(errors) / len(errors)
        variance = sum((x - mean_error) ** 2 for x in errors) / len(errors)
        # Errors are 1 - accuracy, hence within [0, 1]; for such values the
        # variance never exceeds the mean, so the sqrt argument is >= 0.
        bias = math.sqrt(mean_error - variance)
        return [mean_error, variance, bias, errors]

    # The feature matrix returned here is rebuilt inside the loop; only the
    # labels and the raw data are carried forward.
    _, y, X_raw = utils.get_data(src_dir=src_dir, iterations=min_cuts - 1)

    columns = ['Number of cuts', 'Length of X', 'K fold', 'Neighbors',
               'Train error', 'Train var. error', 'Train bias error', 'Train list errors',
               'Test error', 'Test var. error', 'Test bias error', 'Test list errors',
               ]
    rows = []

    for iteration in range(min_cuts, max_cuts + 1):
        X, X_raw = utils.iterate_data(X_raw)
        X = utils.normalization(X)
        X_length = len(X[0])

        for number_of_folds in range(2, max_folds + 1):
            # The split depends only on (X, y, folds, seed), so it is shared
            # by every neighbour value.
            k_folds_data = k_fold_func(X, y, number_of_folds, seed)

            for neighbor in neighbors:
                clf = KNeighborsClassifier(n_neighbors=neighbor)

                train_errors = []
                test_errors = []

                for i in range(number_of_folds):
                    fold = k_folds_data[i]
                    clf.fit(fold['X_train'], fold['y_train'])

                    train_errors.append(
                        round(1 - clf.score(fold['X_train'], fold['y_train']), 6))
                    test_errors.append(
                        round(1 - clf.score(fold['X_test'], fold['y_test']), 6))

                row = [iteration, X_length, number_of_folds, neighbor]
                row.extend(_summarize(train_errors))
                row.extend(_summarize(test_errors))
                rows.append(row)

    return pd.DataFrame(data=rows, columns=columns)

### Bootstrap

In [7]:
def test_bootstrap(min_cuts, max_cuts, max_subsets, training_sample, neighbors,
                   src_dir="Data_preprocesada/", seed=None):
    """Evaluate k-NN classifiers on bootstrap subsets.

    For every combination of (number of cuts, number of subsets, number of
    neighbours) a ``KNeighborsClassifier`` is fitted on each bootstrap subset
    and its train/test error (``1 - clf.score(...)``) is recorded, rounded to
    six decimals.

    Parameters
    ----------
    min_cuts, max_cuts : int
        Inclusive range of "cut" iterations. ``utils.get_data`` is called with
        ``iterations=min_cuts - 1`` and ``utils.iterate_data`` is then applied
        once per value in the range (presumably each call adds one more cut
        level to the features -- confirm in ``utilitarios``).
    max_subsets : int
        Largest number of bootstrap subsets to try; subset counts run from 1
        to ``max_subsets`` inclusive.
    training_sample
        Forwarded unchanged to ``utils.get_bootstrap_subsets`` (presumably the
        fraction of samples drawn for training -- confirm in ``utilitarios``).
    neighbors : iterable of int
        Values used for ``n_neighbors``.
    src_dir : str, optional
        Directory forwarded to ``utils.get_data``. Defaults to the path that
        was previously hard-coded.
    seed : optional
        Seed forwarded to ``utils.get_bootstrap_subsets``. When ``None`` the
        notebook-level ``random_seed`` is used, which matches the previous
        behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per (cuts, subsets, neighbours) combination with the mean
        error, the population variance of the per-subset errors, the value
        ``sqrt(mean - variance)`` (stored in the "bias" columns) and the raw
        per-subset error list, for both train and test.
    """
    if seed is None:
        seed = random_seed

    def _summarize(errors):
        """Return [mean, variance, bias, errors] for a list of per-subset errors."""
        mean_error = sum(errors) / len(errors)
        variance = sum((x - mean_error) ** 2 for x in errors) / len(errors)
        # Errors are 1 - accuracy, hence within [0, 1]; for such values the
        # variance never exceeds the mean, so the sqrt argument is >= 0.
        bias = math.sqrt(mean_error - variance)
        return [mean_error, variance, bias, errors]

    # The feature matrix returned here is rebuilt inside the loop; only the
    # labels and the raw data are carried forward.
    _, y, X_raw = utils.get_data(src_dir=src_dir, iterations=min_cuts - 1)

    columns = ['Number of cuts', 'Length of X', 'K subsets', 'Neighbors',
               'Train error', 'Train var. error', 'Train bias error', 'Train list errors',
               'Test error', 'Test var. error', 'Test bias error', 'Test list errors',
               ]
    rows = []

    for iteration in range(min_cuts, max_cuts + 1):
        X, X_raw = utils.iterate_data(X_raw)
        X = utils.normalization(X)
        X_length = len(X[0])

        for number_of_subsets in range(1, max_subsets + 1):
            # The subsets depend only on (X, y, count, training_sample, seed),
            # so they are shared by every neighbour value.
            k_subsets_data = utils.get_bootstrap_subsets(
                X, y, number_of_subsets, training_sample, seed)

            for neighbor in neighbors:
                clf = KNeighborsClassifier(n_neighbors=neighbor)

                train_errors = []
                test_errors = []

                for i in range(number_of_subsets):
                    subset = k_subsets_data[i]
                    clf.fit(subset['X_train'], subset['y_train'])

                    train_errors.append(
                        round(1 - clf.score(subset['X_train'], subset['y_train']), 6))
                    test_errors.append(
                        round(1 - clf.score(subset['X_test'], subset['y_test']), 6))

                row = [iteration, X_length, number_of_subsets, neighbor]
                row.extend(_summarize(train_errors))
                row.extend(_summarize(test_errors))
                rows.append(row)

    return pd.DataFrame(data=rows, columns=columns)

In [8]:
# Disabled example of the stratified k-fold evaluation:
# df = test_k_fold(min_cuts=7, max_cuts=7, max_folds=3, neighbors=[1, 3], k_fold_func=utils.get_stratified_k_fold_cross_validation)
# df

# Small bootstrap run; the returned DataFrame is displayed as the cell output.
test_bootstrap(
    min_cuts=3,
    max_cuts=3,
    max_subsets=3,
    training_sample=0.7,
    neighbors=[1, 3],
)

Unnamed: 0,Number of cuts,Length of X,K subsets,Neighbors,Train error,Train var. error,Train bias error,Train list errors,Test error,Test var. error,Test bias error,Test list errors
0,3,480,1,1,0.0,0.0,0.0,[0.0],0.541225,0.0,0.73568,[0.541225]
1,3,480,1,3,0.197976,0.0,0.444945,[0.197976],0.619434,0.0,0.787041,[0.619434]
2,3,480,2,1,0.0,0.0,0.0,"[0.0, 0.0]",0.542712,2.209682e-06,0.736688,"[0.541225, 0.544198]"
3,3,480,2,3,0.196657,2e-06,0.443458,"[0.197976, 0.195337]",0.618977,2.093062e-07,0.78675,"[0.619434, 0.618519]"
4,3,480,3,1,0.0,0.0,0.0,"[0.0, 0.0, 0.0]",0.542858,1.516242e-06,0.736788,"[0.541225, 0.544198, 0.543152]"
5,3,480,3,3,0.195865,2e-06,0.442563,"[0.197976, 0.195337, 0.194281]",0.619199,2.388469e-07,0.786892,"[0.619434, 0.618519, 0.619645]"
