### Imports

In [1]:
import utilitarios as utils #Funciones propias
import pandas as pd
import numpy as np
import cv2
from sklearn.svm import NuSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

### Semilla

In [3]:
# Single seed forwarded to the utils split helpers and to NuSVC(random_state=...)
# in the experiment cells below, so that their results can be reproduced.
random_seed = 42

### Cantidad de muestras por categoria

In [4]:
# Load the dataset (iterations=1) and tally how many samples carry each label.
X, y, X_raw = utils.get_data(iterations=1, src_dir="Data_preprocesada/")

categories, counts = np.unique(y, return_counts=True)
# Last expression of the cell: rendered as a {label: sample count} mapping.
{category: count for category, count in zip(categories, counts)}

{0: 195,
 1: 2158,
 2: 2466,
 3: 1599,
 4: 2239,
 5: 2210,
 6: 522,
 7: 1640,
 8: 1648,
 9: 1560}

### K-fold Cross Validation Estratificado

In [5]:
# Experiment grid: cut counts min_cuts..max_cuts, fold counts 2..max_folds.
min_cuts, max_cuts, max_folds = 3, 8, 8

# Data is loaded with one iteration fewer than min_cuts; iterate_data() is then
# called once per step of the outer loop (presumably adding one cut each time --
# TODO confirm against utilitarios).
X, y, X_raw = utils.get_data(src_dir="Data_preprocesada/", iterations=min_cuts - 1)

# Two descriptive columns followed by one accuracy column per fold count.
columns = ['Number of cuts', 'Length of X'] + [
    f'{fold_count} folds Acc.(%)' for fold_count in range(2, max_folds + 1)
]
dataframe = []

for iteration in range(min_cuts, max_cuts + 1):
    X, X_raw = utils.iterate_data(X_raw)
    X = utils.normalization(X)
    X_length = len(X[0])

    # One table row: cut count, feature-vector length, then one accuracy per fold count.
    data_aux = [iteration, X_length]

    for number_of_folds in range(2, max_folds + 1):
        k_folds_data = utils.get_stratified_k_fold_cross_validation(
            X, y, number_of_folds, random_seed
        )

        svc = NuSVC(
            nu=0.10,
            kernel='rbf',
            class_weight='balanced',
            random_state=random_seed,
            decision_function_shape='ovr',
            break_ties=True,
        )
        clf = make_pipeline(StandardScaler(), svc)

        # Accumulate score / number_of_folds fold by fold, i.e. the mean test accuracy.
        accuracy = 0
        for i in range(number_of_folds):
            fold = k_folds_data[i]
            clf.fit(fold['X_train'], fold['y_train'])
            fold_score = clf.score(fold['X_test'], fold['y_test'])
            accuracy += fold_score / number_of_folds

        # Stored as a percentage rounded to two decimals.
        data_aux.append(round(accuracy * 100, 2))

    dataframe.append(data_aux)

dataframe_estratificado = pd.DataFrame(dataframe, columns=columns)
dataframe_estratificado

[3, 480, 78.57]
[3, 480, 78.57, 82.21]
[3, 480, 78.57, 82.21, 83.52]
[3, 480, 78.57, 82.21, 83.52, 84.05]


### K-fold Cross Validation no Estratificado

In [5]:
# Experiment grid: cut counts min_cuts..max_cuts, fold counts 2..max_folds.
min_cuts, max_cuts, max_folds = 3, 8, 8

# Data is loaded with one iteration fewer than min_cuts; iterate_data() is then
# called once per step of the outer loop (presumably adding one cut each time --
# TODO confirm against utilitarios).
X, y, X_raw = utils.get_data(src_dir="Data_preprocesada/", iterations=min_cuts - 1)

# Two descriptive columns followed by one accuracy column per fold count.
columns = ['Number of cuts', 'Length of X'] + [
    f'{fold_count} folds Acc.(%)' for fold_count in range(2, max_folds + 1)
]
dataframe = []

for iteration in range(min_cuts, max_cuts + 1):
    X, X_raw = utils.iterate_data(X_raw)
    X = utils.normalization(X)
    X_length = len(X[0])

    # One table row: cut count, feature-vector length, then one accuracy per fold count.
    data_aux = [iteration, X_length]

    for number_of_folds in range(2, max_folds + 1):
        k_folds_data = utils.get_non_stratified_k_fold_cross_validation(
            X, y, number_of_folds, random_seed
        )

        svc = NuSVC(
            nu=0.10,
            kernel='rbf',
            class_weight='balanced',
            random_state=random_seed,
            decision_function_shape='ovr',
            break_ties=True,
        )
        clf = make_pipeline(StandardScaler(), svc)

        # Accumulate score / number_of_folds fold by fold, i.e. the mean test accuracy.
        accuracy = 0
        for i in range(number_of_folds):
            fold = k_folds_data[i]
            clf.fit(fold['X_train'], fold['y_train'])
            fold_score = clf.score(fold['X_test'], fold['y_test'])
            accuracy += fold_score / number_of_folds

        # Stored as a percentage rounded to two decimals.
        data_aux.append(round(accuracy * 100, 2))

    dataframe.append(data_aux)

dataframe_no_estratificado = pd.DataFrame(dataframe, columns=columns)
dataframe_no_estratificado

### Bootstrapping

In [None]:
# Bootstrap evaluation: for every cut count in min_cuts..max_cuts, record the
# mean NuSVC test accuracy obtained with 1..max_subsets bootstrap subsets.
min_cuts = 3
max_cuts = 10
max_subsets = 10
# Forwarded to utils.get_bootstrap_subsets (presumably the training fraction -- TODO confirm).
training_sample = 0.70

dataframe = []
columns = ['Number of cuts', 'Length of X']

# One accuracy column per number of subsets.
for number_of_subsets in range(1, max_subsets+1):
    columns.append(str(number_of_subsets)+' subsets Acc.(%)')


for iteration in range(min_cuts, max_cuts+1):
    # Every other call site in this notebook unpacks three values (X, y, X_raw)
    # from get_data; unpacking that result into two names raises
    # "ValueError: too many values to unpack". Only X and y are needed here, so
    # keep the first two returned values and leave the notebook-level X_raw alone.
    X, y = utils.get_data(src_dir="Data_preprocesada/", iterations=iteration)[:2]
    X = utils.normalization(X)
    X_length = len(X[0])

    # One table row: cut count, feature-vector length, then one accuracy per subset count.
    data_aux = [iteration, X_length]

    for number_of_subsets in range(1, max_subsets+1):
        k_subsets_data = utils.get_bootstrap_subsets(X, y, number_of_subsets, training_sample,random_seed)

        clf = make_pipeline(StandardScaler(), 
                            NuSVC(nu=0.10,kernel='rbf', class_weight = 'balanced',
                                        random_state=random_seed, decision_function_shape='ovr', break_ties=True))

        # Accumulate score / number_of_subsets, i.e. the mean test accuracy over the subsets.
        accuracy = 0

        for i in range(number_of_subsets):
            clf.fit(k_subsets_data[i]['X_train'], k_subsets_data[i]['y_train'])
            accuracy += (clf.score(k_subsets_data[i]['X_test'], k_subsets_data[i]['y_test']))/number_of_subsets

        # Stored as a percentage rounded to two decimals.
        data_aux.append(round(accuracy*100,2))

    dataframe.append(data_aux)

dataframe_bootstrapping = pd.DataFrame(data = dataframe, columns = columns)
dataframe_bootstrapping