## Imports

In [1]:
import pandas as pd
import numpy as np
import warnings
import pickle

from sklearn.model_selection import GridSearchCV

from sklearn import datasets, linear_model

from sklearn.feature_selection import RFE, RFECV

from sklearn.decomposition import PCA

from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.linear_model import LogisticRegression, SGDClassifier, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm


import xgboost as xgb
import lightgbm as lbm

from sklearn.utils import resample

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler, BorderlineSMOTE, SMOTENC, SVMSMOTE
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, NearMiss, EditedNearestNeighbours, InstanceHardnessThreshold, TomekLinks

from imblearn.pipeline import Pipeline

from sklearn.preprocessing import MinMaxScaler

from collections import Counter

import itertools

import matplotlib.pyplot as plt

#smote_sampler = SMOTE()

import warnings
warnings.filterwarnings('ignore') 

## Plot confusion matrix

In [2]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map with pyplot.

    Everything is drawn through the ``plt`` state machine, so the output
    lands on whichever axes is current when the function is called.

    Args:
        cm: 2-D array of counts (indexed as ``cm[row, col]``).
        classes: Tick labels, used for both the x and the y axis.
        normalize: When true, each row is divided by its row sum and the
            cells are annotated with two decimals instead of integers.
        title: Title placed above the heat map.
        cmap: Colormap handed to ``plt.imshow``.
    """
    if normalize:
        # Row-wise normalisation: every row is scaled by its own total.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so the annotation
    # stays readable on the darker part of the colormap.
    half_of_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            cell_value = cm[row, col]
            text_color = "white" if cell_value > half_of_max else "black"
            plt.text(col, row, format(cell_value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

    
def plot_confusion_matrixes(y_test, y_pred, classes=None):
    """
    Show the raw and the row-normalized confusion matrix side by side.

    Args:
        y_test: True labels, passed to ``confusion_matrix``.
        y_pred: Predicted labels, passed to ``confusion_matrix``.
        classes: Optional tick labels for both plots. When omitted, the
            module-level ``class_names`` is used, as before.
    """
    if classes is None:
        # Backward-compatible fallback to the global the function always used.
        classes = class_names

    cnf_matrix = confusion_matrix(y_test, y_pred)
    # Kept from the original: this changes numpy's global print precision,
    # which code running after this call may have come to rely on.
    np.set_printoptions(precision=2)

    # A single figure with two axes. The original also called plt.figure()
    # first, which left an extra, empty figure behind on every call.
    _, (raw_axis, normalized_axis) = plt.subplots(1, 2, figsize=(20, 4))

    # plot_confusion_matrix draws on the current axes, so select each one
    # before delegating.
    plt.sca(raw_axis)
    plot_confusion_matrix(cnf_matrix, classes=classes,
                          title='Confusion matrix, without normalization')

    plt.sca(normalized_axis)
    plot_confusion_matrix(cnf_matrix, classes=classes, normalize=True,
                          title='Normalized confusion matrix')

    plt.tight_layout()
    plt.show()

## Score Function

In [3]:
def get_scores(y_test, y_pred):
    """
    Compute a fixed list of evaluation metrics for binary predictions.

    The confusion matrix is unpacked into exactly four cells, so the
    labels must produce a 2x2 matrix.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        A list with, in this order: micro F1, macro F1, per-class F1
        (as returned by ``f1_score(average=None)``), sensitivity,
        specificity, positive predictive value (VPP), negative predictive
        value (VPN), positive likelihood ratio (RVP), negative likelihood
        ratio (RVN), ROC AUC, and ``[tn, fp, fn, tp]``.
    """
    f1_variants = [
        f1_score(y_test, y_pred, average=averaging)
        for averaging in ('micro', 'macro', None)
    ]

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    positive_predictive_value = tp / (tp + fp)
    negative_predictive_value = tn / (tn + fn)
    positive_likelihood_ratio = sensitivity / (1 - specificity)
    negative_likelihood_ratio = (1 - sensitivity) / specificity
    roc_auc = roc_auc_score(y_test, y_pred)

    return [
        *f1_variants,
        sensitivity,
        specificity,
        positive_predictive_value,
        negative_predictive_value,
        positive_likelihood_ratio,
        negative_likelihood_ratio,
        roc_auc,
        [tn, fp, fn, tp],
    ]

## PickleToTSV

In [None]:
def pickleToTSV(pickle_file):
    """
    Print mean and standard deviation of pickled per-fold metrics as TSV.

    The pickle is read as ``{method: {fold: [scores, ...]}}``; only the
    first element of each fold entry is used. From that score list the
    last entry and the entry at index 2 are dropped before averaging
    (for lists built by ``get_scores`` those would be the raw
    confusion-matrix counts and the per-class F1 array -- presumably the
    producer, confirm against the code that writes the pickle).

    One line is printed per method: the method name followed by nine
    ``mean(± std)`` cells, tab separated, each rounded to 3 decimals.
    A method without any fold prints only its name.

    Args:
        pickle_file: Path of the pickle file to read.
    """
    # NOTE: pickle.load can execute arbitrary code while loading; only use
    # this on result files you produced yourself.
    with open(pickle_file, 'rb') as file:
        object_file = pickle.load(file)

    metric_count = 9

    for method, folds in object_file.items():
        print(method, end="\t")

        if not folds:
            # Nothing to average; avoid mean/std of an empty row.
            print("")
            continue

        # One column per fold actually present. A fixed width of 10 would
        # pad missing folds with zeros and skew both mean and std.
        h = np.zeros((metric_count, len(folds)))

        for fold_column, fold_entry in enumerate(folds.values()):
            # Work on a copy so the loaded object is left untouched.
            z = list(fold_entry[0])
            del z[-1]
            del z[2]

            for metric_index, metric_value in enumerate(z):
                h[metric_index][fold_column] = metric_value

        for metric_row in h:
            print(str(round(np.mean(metric_row), 3)) + "(± " + str(round(np.std(metric_row), 3)) + ")", end="\t")

        print("")
            

## Print classifiers data

In [None]:
def print_classifier_data(classifiers_data):
    """
    Print one summary line per classifier entry.

    Each entry is indexed positionally: ``entry[0]`` is printed as the
    method, ``entry[1].best_score_`` as the training score and
    ``entry[3][-1]`` as the test score, followed by test minus training.
    (``entry[1]`` is presumably a fitted search object such as
    ``GridSearchCV`` -- confirm against the caller.)

    Args:
        classifiers_data: Iterable of such entries.
    """
    for entry in classifiers_data:
        method_label = entry[0]
        train_score = entry[1].best_score_
        test_score = entry[3][-1]
        print(
            f"Metodo aplicado: {method_label!s}; "
            f"ROC_Treino: {train_score!s}; "
            f"ROC_Teste: {test_score!s}; "
            f"Diferença: {test_score - train_score!s}"
        )

## Outliers counter

In [5]:
def outliers_counter(df, max_list=None, min_outlier_metrics=4):
    """
    Return the index labels of rows that exceed the limit on several metrics.

    For each of the columns CBO, CC, DIT, LCOM, LOC, NOC, RFC and WMC the
    row value is compared (strictly greater than) with the limit at the
    same position in ``max_list``. A row is reported when at least
    ``min_outlier_metrics`` of those comparisons are true.

    Args:
        df: DataFrame containing the eight metric columns.
        max_list: Sequence of eight upper limits, in the column order
            above. Defaults to the module-level ``boxplot_max_list``.
        min_outlier_metrics: Minimum number of exceeded limits for a row
            to be reported. Defaults to 4.

    Returns:
        List of index labels, in the order the rows appear in ``df``.
    """
    if max_list is None:
        # Backward-compatible fallback to the global the function always used.
        max_list = boxplot_max_list

    metric_columns = ("CBO", "CC", "DIT", "LCOM", "LOC", "NOC", "RFC", "WMC")

    # Count exceeded limits per row, one whole column at a time, instead of
    # walking the frame row by row with iterrows().
    exceeded_counts = np.zeros(len(df), dtype=int)
    for position, column in enumerate(metric_columns):
        exceeded_counts += df[column].to_numpy() > max_list[position]

    return [
        label
        for label, exceeded in zip(df.index, exceeded_counts)
        if exceeded >= min_outlier_metrics
    ]