### Imports

In [1]:
import math
import os

import cv2
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVC

import utilitarios as utils  # Project-local helper functions

### Semilla

In [2]:
random_seed = 42 # Fixed seed handed to the fold/bootstrap splitters and to NuSVC's random_state

### Cantidad de muestras por categoría

In [3]:
# Load the preprocessed dataset with a single iteration (see utils.get_data).
X, y, X_raw = utils.get_data(iterations=1, src_dir="Data_preprocesada/")

# Distinct labels in y and how many samples carry each one.
categories, counts = np.unique(y, return_counts=True)
{category: count for category, count in zip(categories, counts)}

{0: 195,
 1: 2158,
 2: 2466,
 3: 1599,
 4: 2239,
 5: 2210,
 6: 522,
 7: 1640,
 8: 1648,
 9: 1560}

### K-fold Cross Validation

In [4]:
def test_k_fold(min_cuts , max_cuts, max_folds, k_fold_func):
    """Score the NuSVC pipeline with k-fold cross validation over a grid.

    The grid covers every cut count in ``[min_cuts, max_cuts]`` combined with
    every fold count in ``[2, max_folds]``. For each combination the pipeline
    (StandardScaler + RBF NuSVC) is fitted once per fold and the train/test
    error (``1 - score``, rounded to 6 decimals) of each fold is recorded.

    Parameters
    ----------
    min_cuts, max_cuts : int
        Inclusive range of the 'Number of cuts' column. The data is loaded
        with ``iterations=min_cuts - 1`` and ``utils.iterate_data`` is applied
        once per value in the range.
    max_folds : int
        Largest number of folds to try (the smallest is 2).
    k_fold_func : callable
        Called as ``k_fold_func(X, y, number_of_folds, random_seed)``. Its
        result is indexed by fold number and each entry must expose the keys
        ``'X_train'``, ``'y_train'``, ``'X_test'`` and ``'y_test'``.

    Returns
    -------
    pandas.DataFrame
        One row per (cuts, folds) pair holding, for train and for test, the
        mean error, the population variance of the per-fold errors, the value
        ``sqrt(mean - variance)`` stored in the 'bias' columns, and the list
        of per-fold errors.

    Notes
    -----
    Relies on the module-level ``random_seed`` defined earlier in the notebook.
    """
    column_names = [
        'Number of cuts', 'Length of X', 'K fold',
        'Train error', 'Train var. error', 'Train bias error', 'Train list errors',
        'Test error', 'Test var. error', 'Test bias error', 'Test list errors',
    ]

    def _summarize(errors):
        # [mean, population variance, sqrt(mean - variance), raw list]
        mean_error = sum(errors) / len(errors)
        variance = sum((x - mean_error) ** 2 for x in errors) / len(errors)
        return [mean_error, variance, math.sqrt(mean_error - variance), errors]

    # Only the labels and the raw data are reused below; X is rebuilt per cut.
    _, y, X_raw = utils.get_data(src_dir="Data_preprocesada/", iterations=min_cuts - 1)

    rows = []
    for cuts in range(min_cuts, max_cuts + 1):
        # Each pass feeds the previous raw data back into iterate_data.
        X, X_raw = utils.iterate_data(X_raw)
        X = utils.normalization(X)
        feature_count = len(X[0])

        for fold_count in range(2, max_folds + 1):
            folds = k_fold_func(X, y, fold_count, random_seed)

            model = make_pipeline(
                StandardScaler(),
                NuSVC(nu=0.10, kernel='rbf', class_weight='balanced',
                      random_state=random_seed, decision_function_shape='ovr',
                      break_ties=True),
            )

            train_errors, test_errors = [], []
            for fold_index in range(fold_count):
                fold = folds[fold_index]
                model.fit(fold['X_train'], fold['y_train'])

                train_errors.append(
                    round(1 - model.score(fold['X_train'], fold['y_train']), 6))
                test_errors.append(
                    round(1 - model.score(fold['X_test'], fold['y_test']), 6))

            rows.append([cuts, feature_count, fold_count]
                        + _summarize(train_errors)
                        + _summarize(test_errors))

    return pd.DataFrame(data=rows, columns=column_names)

### Bootstrapping

In [5]:
def test_bootstrap(min_cuts, max_cuts, max_subsets, training_sample):
    """Score the NuSVC pipeline on bootstrap subsets over a grid.

    The grid covers every cut count in ``[min_cuts, max_cuts]`` combined with
    every subset count in ``[1, max_subsets]``. For each combination the
    pipeline (StandardScaler + RBF NuSVC) is fitted once per subset and the
    train/test error (``1 - score``, rounded to 6 decimals) is recorded.

    Parameters
    ----------
    min_cuts, max_cuts : int
        Inclusive range of the 'Number of cuts' column. The data is loaded
        with ``iterations=min_cuts - 1`` and ``utils.iterate_data`` is applied
        once per value in the range.
    max_subsets : int
        Largest number of bootstrap subsets to try (the smallest is 1).
    training_sample : float
        Forwarded unchanged to ``utils.get_bootstrap_subsets``.
        NOTE(review): presumably the share of samples used for training;
        confirm against the helper.

    Returns
    -------
    pandas.DataFrame
        One row per (cuts, subsets) pair holding, for train and for test, the
        mean error, the population variance of the per-subset errors, the
        value ``sqrt(mean - variance)`` stored in the 'bias' columns, and the
        list of per-subset errors.

    Notes
    -----
    Relies on the module-level ``random_seed`` defined earlier in the notebook.
    """
    column_names = [
        'Number of cuts', 'Length of X', 'K subsets',
        'Train error', 'Train var. error', 'Train bias error', 'Train list errors',
        'Test error', 'Test var. error', 'Test bias error', 'Test list errors',
    ]

    def _summarize(errors):
        # [mean, population variance, sqrt(mean - variance), raw list]
        mean_error = sum(errors) / len(errors)
        variance = sum((x - mean_error) ** 2 for x in errors) / len(errors)
        return [mean_error, variance, math.sqrt(mean_error - variance), errors]

    # Only the labels and the raw data are reused below; X is rebuilt per cut.
    _, y, X_raw = utils.get_data(src_dir="Data_preprocesada/", iterations=min_cuts - 1)

    rows = []
    for cuts in range(min_cuts, max_cuts + 1):
        # Each pass feeds the previous raw data back into iterate_data.
        X, X_raw = utils.iterate_data(X_raw)
        X = utils.normalization(X)
        feature_count = len(X[0])

        for subset_count in range(1, max_subsets + 1):
            subsets = utils.get_bootstrap_subsets(
                X, y, subset_count, training_sample, random_seed)

            model = make_pipeline(
                StandardScaler(),
                NuSVC(nu=0.10, kernel='rbf', class_weight='balanced',
                      random_state=random_seed, decision_function_shape='ovr',
                      break_ties=True),
            )

            train_errors, test_errors = [], []
            for subset_index in range(subset_count):
                subset = subsets[subset_index]
                model.fit(subset['X_train'], subset['y_train'])

                train_errors.append(
                    round(1 - model.score(subset['X_train'], subset['y_train']), 6))
                test_errors.append(
                    round(1 - model.score(subset['X_test'], subset['y_test']), 6))

            rows.append([cuts, feature_count, subset_count]
                        + _summarize(train_errors)
                        + _summarize(test_errors))

    return pd.DataFrame(data=rows, columns=column_names)

### Experimentación

In [6]:
# Grid for the stratified run: cuts 3..8, folds 2..8 (test_k_fold starts at 2).
min_cuts, max_cuts, max_folds = 3, 8, 8

resultados_test_stratified_k_fold = test_k_fold(
    min_cuts=min_cuts,
    max_cuts=max_cuts,
    max_folds=max_folds,
    k_fold_func=utils.get_stratified_k_fold_cross_validation,
)
resultados_test_stratified_k_fold

Unnamed: 0,Number of cuts,Length of X,K fold,Train error,Train var. error,Train bias error,Train list errors,Test error,Test var. error,Test bias error,Test list errors
0,3,480,2,0.027468,7.404603e-07,0.165734,"[0.026608, 0.028329]",0.212108,2.41474e-05,0.460526,"[0.217022, 0.207194]"
1,3,480,3,0.028454,4.770576e-07,0.168681,"[0.028271, 0.029376, 0.027714]",0.176756,5.066776e-05,0.420364,"[0.183263, 0.180155, 0.166851]"
2,3,480,4,0.029562,1.845412e-07,0.171936,"[0.028825, 0.02989, 0.029808, 0.029726]",0.169243,2.616685e-05,0.411359,"[0.175862, 0.171717, 0.167283, 0.162109]"
3,3,480,5,0.029778,8.330122e-07,0.172559,"[0.02864, 0.031026, 0.028945, 0.030562, 0.029715]",0.161606,2.625967e-05,0.401969,"[0.170567, 0.164101, 0.157376, 0.157376, 0.158..."
4,3,480,6,0.029759,5.156348e-07,0.172507,"[0.028899, 0.029931, 0.030596, 0.030301, 0.028...",0.158342,6.389788e-05,0.397841,"[0.165127, 0.153363, 0.17221, 0.150776, 0.1504..."
5,3,480,7,0.030239,8.99132e-07,0.173892,"[0.030754, 0.030466, 0.030969, 0.029173, 0.030...",0.157171,8.939856e-06,0.396437,"[0.159483, 0.159052, 0.160345, 0.159483, 0.152..."
6,3,480,8,0.030292,4.266609e-07,0.174045,"[0.031604, 0.029633, 0.030196, 0.030689, 0.030...",0.152614,3.001696e-05,0.39062,"[0.149261, 0.159606, 0.157635, 0.156158, 0.150..."
7,4,240,2,0.034859,1.224342e-06,0.186701,"[0.033752, 0.035965]",0.274189,9.379922e-07,0.523629,"[0.275157, 0.27322]"
8,4,240,3,0.0388,6.881742e-07,0.196976,"[0.038803, 0.039815, 0.037783]",0.239699,7.302089e-06,0.489583,"[0.243118, 0.239468, 0.236511]"
9,4,240,4,0.039847,1.249681e-06,0.199615,"[0.038269, 0.040729, 0.041058, 0.039333]",0.228983,9.266302e-07,0.478521,"[0.227833, 0.228381, 0.230352, 0.229367]"


In [7]:
# Same grid as the stratified run, but with the non-stratified splitter.
min_cuts, max_cuts, max_folds = 3, 8, 8

resultados_test_non_stratified_k_fold = test_k_fold(
    min_cuts=min_cuts,
    max_cuts=max_cuts,
    max_folds=max_folds,
    k_fold_func=utils.get_non_stratified_k_fold_cross_validation,
)
resultados_test_non_stratified_k_fold

Unnamed: 0,Number of cuts,Length of X,K fold,Train error,Train var. error,Train bias error,Train list errors,Test error,Test var. error,Test bias error,Test list errors
0,3,480,2,0.026175,2.010724e-06,0.161781,"[0.027593, 0.024757]",0.217096,1.068122e-06,0.465935,"[0.21813, 0.216063]"
1,3,480,3,0.027745,2.791849e-06,0.16656,"[0.029194, 0.025404, 0.028637]",0.183408,1.432144e-05,0.428245,"[0.184556, 0.187361, 0.178307]"
2,3,480,4,0.029152,8.343547e-07,0.170736,"[0.030549, 0.028905, 0.028001, 0.029151]",0.169058,1.539801e-05,0.411148,"[0.171921, 0.170239, 0.171717, 0.162355]"
3,3,480,5,0.029285,5.071606e-07,0.171127,"[0.03041, 0.029025, 0.02856, 0.028637, 0.029792]",0.164747,0.0001098312,0.405755,"[0.165948, 0.163793, 0.172467, 0.175855, 0.145..."
4,3,480,6,0.02987,6.685678e-07,0.172828,"[0.029786, 0.030744, 0.030079, 0.028158, 0.030...",0.159328,7.081481e-05,0.39907,"[0.156262, 0.162971, 0.160754, 0.17184, 0.1603..."
5,3,480,7,0.029624,3.306785e-07,0.172114,"[0.029891, 0.029604, 0.028383, 0.029748, 0.029...",0.157603,3.923455e-05,0.396943,"[0.156897, 0.156466, 0.161638, 0.159052, 0.164..."
6,3,480,8,0.029817,3.117559e-07,0.172676,"[0.030337, 0.029985, 0.030196, 0.029774, 0.028...",0.153722,0.0001069379,0.391937,"[0.14532, 0.164532, 0.150739, 0.159606, 0.1679..."
7,4,240,2,0.035474,1.23321e-06,0.188343,"[0.036585, 0.034364]",0.277145,1.96e-08,0.526446,"[0.277005, 0.277285]"
8,4,240,3,0.038308,3.110376e-07,0.195723,"[0.03908, 0.037783, 0.03806]",0.243087,8.223473e-06,0.49303,"[0.239239, 0.24612, 0.243902]"
9,4,240,4,0.039929,1.2307e-06,0.19982,"[0.039172, 0.04155, 0.038676, 0.040319]",0.227628,7.848965e-05,0.477022,"[0.22734, 0.22099, 0.242178, 0.220005]"


In [8]:
# Grid for the bootstrap run: cuts 3..8, subsets 1..6.
min_cuts, max_cuts = 3, 8
max_subsets = 6
# Forwarded to utils.get_bootstrap_subsets by test_bootstrap.
training_sample = 0.70

resultados_test_bootstrap = test_bootstrap(
    min_cuts=min_cuts,
    max_cuts=max_cuts,
    max_subsets=max_subsets,
    training_sample=training_sample,
)
resultados_test_bootstrap

Unnamed: 0,Number of cuts,Length of X,K subsets,Train error,Train var. error,Train bias error,Train list errors,Test error,Test var. error,Test bias error,Test list errors
0,3,480,1,0.026485,0.0,0.162742,[0.026485],0.213153,0.0,0.461685,[0.213153]
1,3,480,2,0.027233,5.59504e-07,0.165023,"[0.026485, 0.027981]",0.215512,6e-06,0.464226,"[0.213153, 0.21787]"
2,3,480,3,0.026544,1.323364e-06,0.162918,"[0.026485, 0.027981, 0.025165]",0.216171,5e-06,0.464937,"[0.213153, 0.21787, 0.217489]"
3,3,480,4,0.026705,1.070608e-06,0.163413,"[0.026485, 0.027981, 0.025165, 0.027189]",0.217742,1.1e-05,0.466617,"[0.213153, 0.21787, 0.217489, 0.222457]"
4,3,480,5,0.026503,1.020349e-06,0.162793,"[0.026485, 0.027981, 0.025165, 0.027189, 0.025...",0.217293,9e-06,0.466137,"[0.213153, 0.21787, 0.217489, 0.222457, 0.215495]"
5,3,480,6,0.026646,9.535876e-07,0.163234,"[0.026485, 0.027981, 0.025165, 0.027189, 0.025...",0.218679,1.8e-05,0.467613,"[0.213153, 0.21787, 0.217489, 0.222457, 0.2154..."
6,4,240,1,0.037308,0.0,0.193153,[0.037308],0.27025,0.0,0.519856,[0.27025]
7,4,240,2,0.037044,6.9696e-08,0.192468,"[0.037308, 0.03678]",0.273657,1.2e-05,0.523111,"[0.27025, 0.277064]"
8,4,240,3,0.036604,4.33664e-07,0.191321,"[0.037308, 0.03678, 0.035724]",0.275463,1.4e-05,0.524832,"[0.27025, 0.277064, 0.279076]"
9,4,240,4,0.036494,3.61548e-07,0.191033,"[0.037308, 0.03678, 0.035724, 0.036164]",0.276883,1.7e-05,0.526181,"[0.27025, 0.277064, 0.279076, 0.281141]"


### Exportar resultados

In [9]:
# Destination folder for the three result tables (semicolon-separated CSVs).
output_path = 'Resultados/radial_basis_kernel/'

# DataFrame.to_csv does not create missing folders. Make sure the target
# exists first, so a fresh checkout does not fail here after the experiments
# above have already run.
os.makedirs(output_path, exist_ok=True)

resultados_test_stratified_k_fold.to_csv(output_path + 'stratified_k_fold.csv', sep = ";")
resultados_test_non_stratified_k_fold.to_csv(output_path + 'non_stratified_k_fold.csv', sep = ";")
resultados_test_bootstrap.to_csv(output_path + 'bootstrap.csv', sep = ";")