In [1]:
# https://plotly.com/python/t-sne-and-umap-projections/

from glob import glob
from itertools import product
from noise import add_noise, decompress_pickle
import mat73
import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from training_sktime import normalizing, format_dataframe, find_max_value
from sklearn.metrics import accuracy_score
import pickle
from  sktime.transformations.panel.reduce import Tabularizer
from sklearn.manifold import TSNE
import time
import plotly.express as px
import plotly.figure_factory as ff

MODEL_PATH = './models/new_dataset/'
INPUT_DATA_PATH = '../input-data/'

In [2]:
def open_folds(cycle, train_test, X_y, v_i):
    """
    Load the cross-validation folds of both datasets and merge them fold by fold.

    Parameters:
        cycle      : which cycle, ex.: 'cycle_1' (1, 2, 4, 8, 16, 32, 64, 128...)
        train_test : whether it is the train or test set, ex.: 'train' (train, test)
        X_y        : whether it is the X or y set, ex.: 'X' (X, y)
        v_i        : whether it is a voltage or current signal, ex.: 'i' (v, i)
    Return:
        list : one merged fold per position. X folds are concatenated with
               pd.concat and re-indexed; y folds get '_set1' (files under
               'folds/') or '_set2' (files under 'folds-robson/') appended to
               every label before being concatenated.
    """
    def _fold_stems(pattern):
        # Drop the '.pbz2' extension and order the paths by their trailing fold number.
        stems = [hit.split('.pbz2')[0] for hit in glob(pattern)]
        return sorted(stems, key=lambda stem: int(stem.split('_')[-1]))

    stems_set1 = _fold_stems(
        INPUT_DATA_PATH + f'folds/{v_i}/{cycle}/{X_y}_{train_test}_fold_[0-9]*.pbz2')
    stems_set2 = _fold_stems(
        INPUT_DATA_PATH + f'folds-robson/{v_i}/{cycle}/{X_y}_{train_test}_fold_[0-9]*.pbz2')

    merged_folds = []
    # The two datasets are paired by position; zip stops at the shorter list.
    for stem_set1, stem_set2 in zip(stems_set1, stems_set2):
        # Target slot is the fold number of the first dataset's file, minus one.
        slot = int(stem_set1.split('/')[-1].split('_')[-1]) - 1
        part_set1 = decompress_pickle(stem_set1)
        part_set2 = decompress_pickle(stem_set2)
        if X_y == 'X':
            combined = pd.concat([part_set1, part_set2]).reset_index(drop=True)
        else:
            tagged_set1 = np.array([label + '_set1' for label in part_set1])
            tagged_set2 = np.array([label + '_set2' for label in part_set2])
            combined = np.concatenate([tagged_set1, tagged_set2])
        merged_folds.insert(slot, combined)
    return merged_folds

In [3]:
def generate_title(cycle, model_name):
    """
    Build the (Portuguese) plot/report title for a model and post-fault window.

    Parameters:
        cycle      : cycle identifier; the text after the last '_' is used,
                     ex.: 'cycle_128' -> '128'
        model_name : model name; it is title-cased in the result
    Return:
        str : '<Model> e 1 ciclo pós falta' when the cycle suffix is '1',
              otherwise '<Model> e 1/<suffix> ciclo pós falta'.
    """
    fraction = cycle.split('_')[-1]
    pretty_model = model_name.title()
    if fraction == '1':
        return f'{pretty_model} e 1 ciclo pós falta'
    return f'{pretty_model} e 1/{fraction} ciclo pós falta'

In [20]:
def evaluating_model(model, transformation, X_test, y_test, cycle, scores, count, max_list, model_name='model', save=None):
    """
    Score a fitted model on one test fold and optionally persist the best one so far.

    Parameters:
        model          : fitted estimator exposing predict(X) and score(X, y)
        transformation : transformation object pickled next to the model
        X_test, y_test : test fold features and labels
        cycle          : cycle identifier used in the output file names
        scores         : list of previous fold scores; the new score is appended in place
        count          : kept for backward compatibility; no longer used
        max_list       : normalization values pickled next to the model
        model_name     : name used in the output file names
        save           : when truthy, pickle the artifacts under MODEL_PATH if this
                         fold's score is the first one or strictly beats every
                         previous score
    Return:
        list : the same `scores` list, with the new score appended.
    """
    # Evaluating model
    print('y de teste')
    print(y_test)
    y_pred = model.predict(X_test)
    print('y de predição')
    print(y_pred)
    score = model.score(X_test, y_test)

    # Compare against the best score seen so far. Comparing against only the
    # previous fold let a mediocre fold overwrite a better saved model.
    best_so_far = max(scores) if scores else None
    scores.append(score)

    if save and (best_so_far is None or score > best_so_far):
        artifacts = (
            (MODEL_PATH + f'novo_treino_{model_name}_{cycle}.pkl', transformation),
            (MODEL_PATH + f'novo_treino_{model_name}_classifier_{cycle}.pkl', model),
            (MODEL_PATH + f'novo_treino_{model_name}_{cycle}_max_values.pkl', max_list),
        )
        for target, payload in artifacts:
            # Context manager guarantees the file is flushed and closed.
            with open(target, 'wb') as handle:
                pickle.dump(payload, handle)
    return scores

def print_results(cycle, model_name, scores, end_time, start_time, save=None):
    """
    Print the per-fold accuracies and the summary statistics of a k-fold run.

    Parameters:
        cycle      : cycle identifier (kept for interface compatibility; not used
                     in the printed text)
        model_name : model name, used for the report file name
        scores     : fold accuracies in the 0-1 range (array or plain sequence)
        end_time   : timestamp taken after training
        start_time : timestamp taken before training
        save       : when truthy, the report file 'novo_treino_<model_name>_report.txt'
                     is opened in append mode (so it is created if missing). Nothing
                     is written to it, which matches the previous behavior.
    Return:
        None. All results go to stdout.
    """
    # Accept plain lists too: `list * 100` would repeat the list, not scale it.
    scores = np.asarray(scores, dtype=float)

    if save:
        # Keep the create-if-missing side effect, but close the handle right away
        # instead of leaking it.
        with open(f'novo_treino_{model_name}_report.txt', 'a'):
            pass

    percentages = np.round(scores * 100, decimals=2)
    print('\nAcurácia em cada fold:\n')
    # Label every fold that was actually run instead of assuming exactly ten.
    for fold_number, percentage in enumerate(percentages, start=1):
        label = f'- Fold {fold_number}'
        print(f'{label:<7}: {percentage:^7.2f}%')
    print('\nO resulto final obtido foi:\n')
    print(f'- Média da acurácia: {np.mean(scores) * 100:.2f}%')
    print(f'- Desvio padrão da acurácia: {np.std(scores) * 100:.2f}%')
    print(f'- Tempo necessário para treinamento: {np.round(end_time - start_time, 3)} segundos')

In [15]:
def kfold(train_X, train_y, test_X, test_y, model, cycle, max_list, model_name='',
          transformation=None, save=None):
    """
    Fit and evaluate `model` once per fold, then print the aggregated results.

    Parameters:
        train_X, train_y : per-fold training features / labels (iterated in parallel)
        test_X, test_y   : per-fold test features / labels (iterated in parallel)
        model            : estimator exposing fit(X, y); it is re-fitted on every fold
        cycle            : cycle identifier forwarded to the evaluation/report helpers
        max_list         : values forwarded to normalizing() for every fold
        model_name       : name forwarded to the evaluation/report helpers
        transformation   : optional object whose transform() is applied to the
                           normalized data; it is not fitted here
        save             : forwarded to evaluating_model() and print_results()
    Return:
        tuple : (mean fold score * 100, elapsed seconds rounded to 3 decimals).
    """
    fold_scores = []
    started = time.time()

    # evaluating_model receives a counter that starts at -1 for the first fold.
    counter = -1
    for X_fit, y_fit, X_eval, y_eval in zip(train_X, train_y, test_X, test_y):
        X_fit_scaled = normalizing(X_fit, max_list)
        X_eval_scaled = normalizing(X_eval, max_list)

        # Transforming data (or working on copies when no transformation is given)
        if not transformation:
            X_fit_ready = X_fit_scaled.copy()
            X_eval_ready = X_eval_scaled.copy()
        else:
            X_fit_ready = transformation.transform(X_fit_scaled)
            X_eval_ready = transformation.transform(X_eval_scaled)

        model.fit(X_fit_ready, y_fit)
        fold_scores = evaluating_model(model, transformation, X_eval_ready, y_eval, cycle,
                                       fold_scores, counter, max_list, model_name, save)
        counter += 1

    finished = time.time()
    print_results(cycle, model_name, np.array(fold_scores), finished, started, save)
    return np.mean(fold_scores) * 100, np.round(finished - started, 3)

In [16]:
# def open_tabular_data(signal, cycle):
#     X_train_flavio = decompress_pickle(INPUT_DATA_PATH + f'folds/{signal}/{cycle}/X_train')
#     y_train_flavio = decompress_pickle(INPUT_DATA_PATH + f'folds/{signal}/{cycle}/y_train')
#     X_val_flavio = decompress_pickle(INPUT_DATA_PATH + f'folds/{signal}/{cycle}/X_val')
#     y_val_flavio = decompress_pickle(INPUT_DATA_PATH + f'folds/{signal}/{cycle}/y_val')

#     X_train_robson = decompress_pickle(INPUT_DATA_PATH + f'folds-robson/{signal}/{cycle}/X_train')
#     y_train_robson = decompress_pickle(INPUT_DATA_PATH + f'folds-robson/{signal}/{cycle}/y_train')
#     X_val_robson = decompress_pickle(INPUT_DATA_PATH + f'folds-robson/{signal}/{cycle}/X_val')
#     y_val_robson = decompress_pickle(INPUT_DATA_PATH + f'folds-robson/{signal}/{cycle}/y_val')

#     X_train = pd.concat([X_train_flavio, X_train_robson]).reset_index(drop=True)
#     y_train = np.concatenate([y_train_flavio, y_train_robson])
#     X_val = pd.concat([X_val_flavio, X_val_robson]).reset_index(drop=True)
#     y_val = np.concatenate([y_val_flavio, y_val_robson])

#     t= Tabularizer()
#     X_train_flavio = t.fit_transform(X_train_flavio)
#     X_train_robson = t.fit_transform(X_train_robson)
#     X_train = t.fit_transform(X_train)
#     feat_cols = list(X_train_flavio.columns)
    
#     X_train_flavio['y'] = y_train_flavio
#     X_train_robson['y'] = y_train_robson
#     X_train['y'] = y_train
    
#     return X_train_flavio, X_train_robson, X_train, feat_cols

In [23]:
def validating(X_val, y_val, model_name, cycle, max_list, save=None):
    """
    Load the saved classifier for (model_name, cycle) and score it on the validation set.

    Parameters:
        X_val      : validation features, already in the form the classifier expects
        y_val      : validation labels
        model_name : model name used in the pickle / report file names
        cycle      : cycle identifier used in the pickle file name
        max_list   : kept for interface compatibility; not used here
        save       : when truthy, the report file 'novo_treino_<model_name>_report.txt'
                     is opened in append mode (so it is created if missing). Nothing
                     is written to it, which matches the previous behavior.
    Return:
        tuple : (predictions, accuracy * 100, elapsed seconds rounded to 3 decimals).
                The elapsed time includes loading the pickle.
    """
    s = time.time()
    # NOTE: pickle.load executes arbitrary code from the file; only load model
    # files produced by this project.
    with open(MODEL_PATH + f'novo_treino_{model_name}_classifier_{cycle}.pkl', 'rb') as model_file:
        best_model = pickle.load(model_file)
    val_score = best_model.score(X_val, y_val)
    y_pred = best_model.predict(X_val)
    print('y de validação')
    print(y_val)
    print('y de predição final')
    print(y_pred)
    e = time.time()
    if save:
        # Keep the create-if-missing side effect, but close the handle right away
        # instead of leaking it.
        with open(f'novo_treino_{model_name}_report.txt', 'a'):
            pass
    print(f'- Acurácia no conjunto de validação: {val_score * 100:.2f}%')
    print(f'- Tempo necessário para predição do conjunto de validação: {np.round(e - s, 3)} segundos')
    return y_pred, val_score * 100, np.round(e - s, 3)

def generate_confusion_matrix(y_val, y_pred, image_path, filename, title='', colorscale='blues',
                              width=500, height=500):
    """
    Render a confusion matrix as an annotated heatmap and save it as an SVG file.

    Parameters:
        y_val      : true labels
        y_pred     : predicted labels (same length as y_val)
        image_path : output directory prefix; concatenated directly with `filename`
        filename   : output file name without extension ('.svg' is appended)
        title      : text appended to the figure title
        colorscale : colorscale forwarded to the heatmap
        width      : figure width
        height     : figure height
    Return:
        None. The figure is written to image_path + filename + '.svg'.
    """
    df = pd.DataFrame({'Real': y_val, 'Predito': y_pred}, columns=['Real', 'Predito'])
    counts = pd.crosstab(df['Real'], df['Predito'], rownames=['Real'], colnames=['Predito'])

    # crosstab only emits rows/columns for labels that actually occur on that axis.
    # Re-index both axes over the union so the matrix is square and the tick labels
    # line up even when a class is never predicted (or is predicted but never real).
    labels = counts.index.union(counts.columns)
    cm = counts.reindex(index=labels, columns=labels, fill_value=0)

    # Invert rows because create_annotated_heatmap draws the matrix in inverted order
    c = cm.values[::-1]
    x = list(labels)
    y = x[::-1]
    c_text = [[str(cell) for cell in row] for row in c]

    fig = ff.create_annotated_heatmap(c, x=x, y=y, annotation_text=c_text, colorscale=colorscale)

    # add title
    fig.update_layout(title_text=f'<i><b>Matriz de Confusão {title}</b></i>',
                      title_x=0.5, autosize=False, width=width, height=height,)

    # add custom xaxis title
    fig.add_annotation(dict(font=dict(color="black",size=14), x=0.5, y=-0.12, showarrow=False,
                            text="Valores Preditos", xref="paper", yref="paper"))

    # add custom yaxis title
    fig.add_annotation(dict(font=dict(color="black",size=14), x=-0.2, y=0.5, textangle=270,
                            showarrow=False, text="Valores Reais", xref="paper", yref="paper"))
    fig.write_image(image_path + filename + '.svg')

In [24]:
def training(signal, cycle, model, model_name='', transformation=None, save=None):
    """
    Run the full pipeline for one signal/cycle: k-fold training, then validation.

    Parameters:
        signal         : signal folder name, ex.: 'i' (v, i)
        cycle          : cycle folder name, ex.: 'cycle_128'
        model          : estimator forwarded to kfold()
        model_name     : name forwarded to kfold() and validating()
        transformation : optional object; when given it is fitted on the normalized
                         training data and applied to the normalized validation data
        save           : forwarded to kfold() and validating()
    Return:
        tuple : (mean_acc, val_acc, train_time, val_time) as returned by kfold()
                and validating().
    """
    # Same loading order as before: all files of 'folds/' first, then 'folds-robson/'.
    dataset_dirs = (
        INPUT_DATA_PATH + f'folds/{signal}/{cycle}/',
        INPUT_DATA_PATH + f'folds-robson/{signal}/{cycle}/',
    )
    X_train_parts, X_val_parts, y_val_parts = [], [], []
    for dataset_dir in dataset_dirs:
        X_train_parts.append(decompress_pickle(dataset_dir + 'X_train'))
        X_val_parts.append(decompress_pickle(dataset_dir + 'X_val'))
        y_val_parts.append(decompress_pickle(dataset_dir + 'y_val'))

    X_train = pd.concat(X_train_parts).reset_index(drop=True)
    X_val = pd.concat(X_val_parts).reset_index(drop=True)
    # NOTE(review): open_folds() appends '_set1'/'_set2' to the fold labels, while
    # y_val is used exactly as loaded. If the stored y_val labels carry no such
    # suffix they cannot match the classifier's predictions -- confirm.
    y_val = np.concatenate(y_val_parts)

    # Normalization values come from the merged training set and are reused everywhere.
    max_list = find_max_value(X_train)
    X_train_norm = normalizing(X_train, max_list)
    X_val_norm = normalizing(X_val, max_list)

    if not transformation:
        X_val_ready = X_val_norm.copy()
    else:
        transformation.fit(X_train_norm)
        X_val_ready = transformation.transform(X_val_norm)

    # Opening the folds as lists, in the order kfold() expects:
    # train X, train y, test X, test y.
    fold_requests = (('train', 'X'), ('train', 'y'), ('test', 'X'), ('test', 'y'))
    fold_sets = [open_folds(cycle, split, kind, signal) for split, kind in fold_requests]

    mean_acc, train_time = kfold(*fold_sets, model, cycle, max_list, model_name,
                                 transformation, save)
    # validating() loads the classifier pickle from MODEL_PATH, so it depends on a
    # model having been saved for this model_name/cycle.
    y_pred, val_acc, val_time = validating(X_val_ready, y_val, model_name, cycle, max_list, save)
    return mean_acc, val_acc, train_time, val_time

In [25]:
from sklearn.linear_model import RidgeClassifierCV
from sktime.transformations.panel.rocket import Rocket, MiniRocketMultivariate

# Driver cell: train and validate a MiniRocket + ridge classifier on the current
# signal for each listed cycle.
print('\n### Treinando com 10000 features (default)', sep='')
transformation = MiniRocketMultivariate(random_state=42)
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the pinned scikit-learn version before re-running.
model = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
signal, model_name = 'i', 'minirocket'
# Only one cycle is listed here; add more identifiers to the list to run them too.
for cycle in ['cycle_128']:
    print('\n---')
    # Text after the last '_' of the cycle identifier, e.g. '128'.
    c = cycle.split('_')[-1]
    if c == '1':
        title = f'\n## {c} Ciclo Pós Falta'
    else:
        title = f'\n## 1/{c} Ciclo Pós Falta'
    # The same `model` and `transformation` objects are passed on every iteration.
    mean_acc, val_acc, train_time, val_time = training(signal, cycle, model, model_name, transformation, save=True)
    # Markdown table row; the second space-separated token of `title` is the cycle
    # fraction. `row` is built but not used in the lines visible here.
    row = f'\n|{title.split(" ")[1]}|10000|{mean_acc:.2f}|{val_acc:.2f}|{train_time}|{val_time}|'


### Treinando com 10000 features (default)

---
y de teste
['CAT_set1' 'CAT_set1' 'BCT_set1' 'ABT_set1' 'CA_set1' 'BC_set1' 'AT_set1'
 'BC_set1' 'AT_set1' 'BT_set1' 'AT_set1' 'AB_set1' 'ABC_set1' 'CT_set1'
 'BT_set1' 'BCT_set1' 'AB_set1' 'BCT_set1' 'CAT_set1' 'CT_set1' 'BC_set1'
 'BCT_set1' 'BCT_set1' 'ABT_set1' 'CT_set1' 'BT_set1' 'ABC_set1' 'AT_set1'
 'CA_set1' 'ABC_set1' 'BCT_set1' 'CT_set1' 'BC_set1' 'CAT_set1' 'AB_set1'
 'BT_set1' 'ABC_set1' 'CA_set1' 'BCT_set1' 'ABT_set1' 'AB_set1' 'CA_set1'
 'CAT_set1' 'ABC_set1' 'AT_set1' 'AB_set1' 'BCT_set1' 'CT_set1' 'CAT_set1'
 'BC_set1' 'CA_set1' 'CT_set1' 'AT_set1' 'CT_set1' 'CA_set1' 'ABT_set1'
 'ABC_set1' 'AB_set1' 'AT_set1' 'CT_set1' 'ABT_set1' 'BC_set1' 'ABT_set1'
 'ABT_set1' 'CA_set1' 'AT_set1' 'BT_set1' 'BC_set1' 'BC_set1' 'ABC_set1'
 'BT_set1' 'AB_set1' 'CAT_set1' 'CA_set1' 'CAT_set1' 'BT_set1' 'CAT_set2'
 'CAT_set2' 'BCT_set2' 'ABT_set2' 'CA_set2' 'BC_set2' 'AT_set2' 'BC_set2'
 'AT_set2' 'BT_set2' 'AT_set2' 'AB_set2' 'ABC_set2' 'C