### Imports

In [2]:
import utilitarios as utils #Funciones propias
import pandas as pd
import numpy as np
import cv2
from sklearn.svm import NuSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

### Semilla

In [3]:
random_seed = 42 # Single seed passed to the NuSVC classifiers and to the utils resampling helpers in this notebook

In [14]:
from sklearn.model_selection import train_test_split

# Load the preprocessed dataset; `iterations=5` is forwarded to the project helper.
X, y, X_raw = utils.get_data(src_dir="Data_preprocesada/", iterations=5)

# Hold out 33% of the samples. The split reuses the notebook-wide seed rather than
# a second hard-coded 42, so changing `random_seed` affects every random step.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=random_seed
)

# Standardise the features, then fit an RBF-kernel NuSVC with balanced class weights.
clf = make_pipeline(
    StandardScaler(),
    NuSVC(nu=0.10, kernel='rbf', class_weight='balanced',
          random_state=random_seed, decision_function_shape='ovr', break_ties=True),
)

clf.fit(X_train, y_train)

# NOTE: this is the accuracy on the training split; X_test / y_test are not scored here.
print(clf.score(X_train, y_train))

0.9586321014892444


### Cantidad de muestras por categoría

In [4]:
# Reload the dataset, this time asking the helper for a single iteration.
X, y, X_raw = utils.get_data(iterations=1, src_dir="Data_preprocesada/")

# Distinct labels found in y together with how often each one occurs.
categories, counts = np.unique(y, return_counts=True)
{category: count for category, count in zip(categories, counts)}

{0: 195,
 1: 2158,
 2: 2466,
 3: 1599,
 4: 2239,
 5: 2210,
 6: 522,
 7: 1640,
 8: 1648,
 9: 1560}

### K-fold Cross Validation Estratificado

In [5]:
def test_stratified_k_fold(min_cuts, max_cuts, max_folds):
    """Evaluate the RBF NuSVC pipeline with stratified k-fold cross validation.

    The data is loaded once with ``min_cuts - 1`` iterations. Then, for every cut
    count from ``min_cuts`` to ``max_cuts`` (inclusive), the raw data is advanced
    one step with ``utils.iterate_data`` and normalised with
    ``utils.normalization``. For every fold count ``k`` from 2 to ``max_folds``
    (inclusive) the pipeline is fitted on each fold and the mean test accuracy
    over the ``k`` folds is stored.

    Parameters
    ----------
    min_cuts : int
        First cut count to evaluate.
    max_cuts : int
        Last cut count to evaluate (inclusive).
    max_folds : int
        Largest number of folds to try; fold counts start at 2.

    Returns
    -------
    pandas.DataFrame
        One row per cut count with the columns ``'Number of cuts'``,
        ``'Length of X'`` (``len(X[0])``) and one ``'<k> folds Acc.(%)'`` column
        per fold count, holding the mean accuracy in percent rounded to 2 decimals.
    """
    fold_counts = range(2, max_folds + 1)

    columns = ['Number of cuts', 'Length of X']
    columns += [str(k) + ' folds Acc.(%)' for k in fold_counts]

    # Labels are loaded once; only the feature matrix is rebuilt for each cut.
    _, y, X_raw = utils.get_data(src_dir="Data_preprocesada/", iterations=min_cuts - 1)

    rows = []
    for cut in range(min_cuts, max_cuts + 1):
        X, X_raw = utils.iterate_data(X_raw)
        X = utils.normalization(X)
        row = [cut, len(X[0])]

        for k in fold_counts:
            folds = utils.get_stratified_k_fold_cross_validation(X, y, k, random_seed)

            model = make_pipeline(
                StandardScaler(),
                NuSVC(nu=0.10, kernel='rbf', class_weight='balanced',
                      random_state=random_seed, decision_function_shape='ovr',
                      break_ties=True),
            )

            # Running mean: each fold contributes score / k.
            mean_accuracy = 0
            for fold_index in range(k):
                fold = folds[fold_index]
                model.fit(fold['X_train'], fold['y_train'])
                mean_accuracy += model.score(fold['X_test'], fold['y_test']) / k

            row.append(round(mean_accuracy * 100, 2))

        rows.append(row)

    return pd.DataFrame(data=rows, columns=columns)

### K-fold Cross Validation no Estratificado

In [6]:
def test_non_stratified_k_fold(min_cuts, max_cuts, max_folds):
    """Evaluate the RBF NuSVC pipeline with non-stratified k-fold cross validation.

    The data is loaded once with ``min_cuts - 1`` iterations. Then, for every cut
    count from ``min_cuts`` to ``max_cuts`` (inclusive), the raw data is advanced
    one step with ``utils.iterate_data`` and normalised with
    ``utils.normalization``. For every fold count ``k`` from 2 to ``max_folds``
    (inclusive) the folds come from
    ``utils.get_non_stratified_k_fold_cross_validation``, the pipeline is fitted
    on each fold and the mean test accuracy over the ``k`` folds is stored.

    Parameters
    ----------
    min_cuts : int
        First cut count to evaluate.
    max_cuts : int
        Last cut count to evaluate (inclusive).
    max_folds : int
        Largest number of folds to try; fold counts start at 2.

    Returns
    -------
    pandas.DataFrame
        One row per cut count with the columns ``'Number of cuts'``,
        ``'Length of X'`` (``len(X[0])``) and one ``'<k> folds Acc.(%)'`` column
        per fold count, holding the mean accuracy in percent rounded to 2 decimals.
    """
    fold_counts = range(2, max_folds + 1)

    columns = ['Number of cuts', 'Length of X']
    columns += [str(k) + ' folds Acc.(%)' for k in fold_counts]

    # Labels are loaded once; only the feature matrix is rebuilt for each cut.
    _, y, X_raw = utils.get_data(src_dir="Data_preprocesada/", iterations=min_cuts - 1)

    rows = []
    for cut in range(min_cuts, max_cuts + 1):
        X, X_raw = utils.iterate_data(X_raw)
        X = utils.normalization(X)
        row = [cut, len(X[0])]

        for k in fold_counts:
            folds = utils.get_non_stratified_k_fold_cross_validation(X, y, k, random_seed)

            model = make_pipeline(
                StandardScaler(),
                NuSVC(nu=0.10, kernel='rbf', class_weight='balanced',
                      random_state=random_seed, decision_function_shape='ovr',
                      break_ties=True),
            )

            # Running mean: each fold contributes score / k.
            mean_accuracy = 0
            for fold_index in range(k):
                fold = folds[fold_index]
                model.fit(fold['X_train'], fold['y_train'])
                mean_accuracy += model.score(fold['X_test'], fold['y_test']) / k

            row.append(round(mean_accuracy * 100, 2))

        rows.append(row)

    return pd.DataFrame(data=rows, columns=columns)

### Bootstrapping

In [7]:
def test_bootstrap(min_cuts, max_cuts, max_subsets, training_sample):
    """Evaluate the RBF NuSVC pipeline on bootstrap subsets of the data.

    The data is loaded once with ``min_cuts - 1`` iterations. For every cut count
    from ``min_cuts`` to ``max_cuts`` (inclusive) the raw data is advanced one
    step with ``utils.iterate_data`` and normalised. For every subset count from
    1 to ``max_subsets`` (inclusive) the subsets returned by
    ``utils.get_bootstrap_subsets`` are each used to fit the pipeline and to
    measure the train and test error (``1 - score``).

    Parameters
    ----------
    min_cuts : int
        First cut count to evaluate.
    max_cuts : int
        Last cut count to evaluate (inclusive).
    max_subsets : int
        Largest number of bootstrap subsets to try; subset counts start at 1.
    training_sample
        Forwarded unchanged to ``utils.get_bootstrap_subsets``.

    Returns
    -------
    pandas.DataFrame
        One row per (cut count, subset count) pair. For both train and test it
        holds the mean error, the population variance of the per-subset errors,
        a bias column and the list of per-subset errors. No bias is computed
        here, so both bias columns are always 0.
    """
    columns = [
        'Number of cuts', 'Length of X', 'K subsets',
        'Train error', 'Train var. error', 'Train bias error', 'Train list errors',
        'Test error', 'Test var. error', 'Test bias error', 'Test list errors',
    ]

    # Labels are loaded once; only the feature matrix is rebuilt for each cut.
    _, y, X_raw = utils.get_data(src_dir="Data_preprocesada/", iterations=min_cuts - 1)

    rows = []
    for cut in range(min_cuts, max_cuts + 1):
        X, X_raw = utils.iterate_data(X_raw)
        X = utils.normalization(X)
        first_row_length = len(X[0])

        for n_subsets in range(1, max_subsets + 1):
            subsets = utils.get_bootstrap_subsets(X, y, n_subsets, training_sample, random_seed)

            model = make_pipeline(
                StandardScaler(),
                NuSVC(nu=0.10, kernel='rbf', class_weight='balanced',
                      random_state=random_seed, decision_function_shape='ovr',
                      break_ties=True),
            )

            train_errors = []
            test_errors = []
            for subset_index in range(n_subsets):
                subset = subsets[subset_index]
                model.fit(subset['X_train'], subset['y_train'])
                train_errors.append(1 - model.score(subset['X_train'], subset['y_train']))
                test_errors.append(1 - model.score(subset['X_test'], subset['y_test']))

            # Mean error over the subsets.
            train_mean = sum(train_errors) / len(train_errors)
            test_mean = sum(test_errors) / len(test_errors)

            # Population variance (divides by N, not N - 1) of the per-subset errors.
            train_variance = sum((err - train_mean) ** 2 for err in train_errors) / len(train_errors)
            test_variance = sum((err - test_mean) ** 2 for err in test_errors) / len(test_errors)

            # Placeholders: the bias columns are reported but never computed.
            train_bias = 0
            test_bias = 0

            rows.append([
                cut, first_row_length, n_subsets,
                train_mean, train_variance, train_bias, train_errors,
                test_mean, test_variance, test_bias, test_errors,
            ])

    return pd.DataFrame(data=rows, columns=columns)

### Experimentación

In [8]:
# Stratified k-fold sweep (currently disabled): cuts 3..8, fold counts 2..8.
min_cuts, max_cuts, max_folds = 3, 8, 8

#resultados_test_stratified_k_fold = test_stratified_k_fold(min_cuts,max_cuts,max_folds)

# Non-stratified k-fold sweep (currently disabled) over the same grid.
min_cuts, max_cuts, max_folds = 3, 8, 8

#resultados_test_non_stratified_k_fold = test_non_stratified_k_fold(min_cuts,max_cuts,max_folds)

# Bootstrap run: a single cut count (3) and subset counts 1..3.
min_cuts, max_cuts = 3, 3
max_subsets, training_sample = 3, 0.7

resultados_test_bootstrap = test_bootstrap(
    min_cuts=min_cuts,
    max_cuts=max_cuts,
    max_subsets=max_subsets,
    training_sample=training_sample,
)
resultados_test_bootstrap

### Exportar resultados

In [9]:
# Path prefix prepended to every exported CSV file name below.
output_path = 'Resultados/radial_basis_kernel/'

# The exports are currently disabled. DataFrame.to_csv takes the field separator
# as `sep`; it has no `delimiter` keyword, so the calls below use `sep`.
# resultados_test_stratified_k_fold.to_csv(output_path + 'stratified_k_fold.csv', sep = ";")
# resultados_test_non_stratified_k_fold.to_csv(output_path + 'non_stratified_k_fold.csv', sep = ";")
# resultados_test_bootstrap.to_csv(output_path + 'bootstrap.csv', sep = ";")