## Classificador hierárquico para características morfométricas de núcleo de células cervicais 

In [None]:
#!pip install pyefd

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#import sys
#sys.path.insert(0,"/content/drive/MyDrive/shape_based_CRIC_Hclassifier")

In [None]:
import numpy as np
import pandas as pd 
from math import sqrt
import os
import sys
import csv
from collections import Counter
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow 
from skimage import morphology, measure
from skimage.draw import polygon, polygon_perimeter
from scipy.spatial.distance import cdist
from scipy.stats import kurtosis

import pyefd
from pyefd import elliptic_fourier_descriptors, normalize_efd

from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import feature_selection as fs
from sklearn import preprocessing

from datetime import datetime

# pay attention to capitalization below!
from spFSR import SpFSR
from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, BorderlineSMOTE

from itertools import cycle
from random import randint
from random import sample

import xgboost as xgb 

import functions, shapeFeatures

In [None]:
# Bethesda class name -> integer label.
Bethesda_classes = dict(
    zip(
        ('Normal', 'ASC-US', 'ASC-H', 'LSIL', 'HSIL', 'Invasive Carcinoma'),
        range(6),
    )
)
# Reverse lookup (integer label -> Bethesda class name), derived from the map
# above so the two can never drift apart.
Bethesda_idx_classes = {label: name for name, label in Bethesda_classes.items()}

### Features:

In [None]:
# Number of names returned by each feature-listing helper when called with 20:
# (all features, nucleus features, cytoplasm features, EFD features).
tuple(
    len(list_features(20))
    for list_features in (
        functions.list_all_features,
        functions.list_all_nucleus_features,
        functions.list_all_cyto_features,
        functions.list_all_EFD_features,
    )
)


### Lê arquivo (features):

In [None]:
# Value forwarded as `n_efd_coeffs` to shapeFeatures.normalize_dataset and as the
# argument of the functions.list_all_*_features helpers.
N_EFD_COEFFS = 20

In [None]:
# Read the '|'-separated feature table (first row holds the column names) and
# pass it through shapeFeatures.normalize_dataset.
# Google Drive location used when running on Colab:
#   '/content/drive/MyDrive/shape_based_CRIC_Hclassifier/dataCRIC.csv'
df = (
    pd.read_csv('dataCRIC.csv', sep='|', header=0)
    .pipe(shapeFeatures.normalize_dataset, n_efd_coeffs=N_EFD_COEFFS)
)
 

In [None]:
# Notebook display of the table returned by shapeFeatures.normalize_dataset.
df

In [None]:
# Split the table into one dataframe per value of the 'bethesda' column (0..5).
# Each subset is an independent frame renumbered 0..n-1. No resampling or
# balancing is performed here: the class sizes printed below are the raw counts.
data_normal = df[df['bethesda'] == 0].reset_index(drop=True)
data_ascus = df[df['bethesda'] == 1].reset_index(drop=True)
data_asch = df[df['bethesda'] == 2].reset_index(drop=True)
data_lsil = df[df['bethesda'] == 3].reset_index(drop=True)
data_hsil = df[df['bethesda'] == 4].reset_index(drop=True)
data_car = df[df['bethesda'] == 5].reset_index(drop=True)

# Per-class totals; len() counts rows without materialising the value matrix.
print("--- Totais por classe --- ")
for caption, subset in (
    ("Normal...: ", data_normal),
    ("ASC-Us...: ", data_ascus),
    ("ASC-H....: ", data_asch),
    ("LSIL.....: ", data_lsil),
    ("HSIL.....: ", data_hsil),
    ("Carcinoma: ", data_car),
):
    print(caption, len(subset))
 

#### Gera dataframes: dados (data), classes (target) e Ids (image/cell)

In [None]:
# Assemble the working set: features (data), labels (target) and image/cell ids.
# NOTE(review): the LSIL frame is passed before the ASC-H frame, which differs
# from the Bethesda label order (ASC-H = 2, LSIL = 3) - confirm it matches the
# parameter order of functions.get_database_data_targe_ids.
data, target, image_cells_ids = functions.get_database_data_targe_ids(
    data_normal,
    data_ascus,
    data_lsil,
    data_asch,
    data_hsil,
    data_car,
    functions.list_all_nucleus_features(N_EFD_COEFFS),
)
 

In [None]:
## Hyper-parameters obtained from the models' grid search (tuning.ipynb)
svm_param = dict(C=100, kernel='linear')
rf_param = dict(max_depth=7, min_samples_split=10, n_estimators=50)
xgb_param = dict(
    learning_rate=0.1,
    n_estimators=86,
    max_depth=9,
    min_child_weight=1,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.7,
    reg_alpha=0.001,
)

In [None]:
# Algorithm names paired with their tuned hyper-parameters; `classifiers` and
# `params` are parallel lists (same position = same algorithm).
_tuned_models = (('SVM', svm_param), ('RF', rf_param), ('XGBoost', xgb_param))
classifiers = [name for name, _ in _tuned_models]
params = [tuned for _, tuned in _tuned_models]

# Full list of nucleus feature names.
features = functions.list_all_nucleus_features(N_EFD_COEFFS)
print(f'Nº total de de features: {len(features)}')

In [None]:
#### Features resulting from a prior selection step (see the <features_selection>.ipynb notebooks)
# One list of feature names per classifier of the hierarchy: the suffix _1.._4
# is the classifier the list is used with (the cross-validation loop pairs
# best_features_MI_1 with classifier 1, best_features_MI_2 with classifier 2, ...).
# "MI" / "spfsr" presumably name the selection method (mutual information / SpFSR)
# - TODO confirm against the selection notebooks.

best_features_MI_1 =   ['mrdN','eN','maxAxN','periN','ardN','hAreaN','areaN','equidiaN','fdN',
                        'riN','circuN','compacN','minAxN','convexN','elonN','extentN','kN','raN',
                        'efdN72','eccenN','efdN74','efdN69','efdN76','efdN77','solidN','efdN70',
                        'efdN73','efdN71','efdN75','efdN61']
best_features_MI_2 =   ['areaN','equidiaN','fdN','hAreaN','periN','riN','ardN','minAxN','maxAxN',
                        'mrdN','eN','efdN71','efdN75','efdN70','efdN74','elonN','efdN58','efdN63',
                        'convexN','efdN55','efdN54','efdN38','efdN66','efdN51','efdN50','efdN62',
                        'efdN42','efdN46','efdN4','efdN12']
best_features_MI_3 = ['efdN76','efdN12','efdN39','extentN','mrdN','efdN15','minAxN','efdN33',
                      'efdN37','efdN25','efdN62','efdN65','efdN2','efdN55','efdN38','efdN36',
                      'efdN24','efdN1','efdN69','sdnrlN','efdN52','elonN','efdN3','efdN41',
                      'efdN26','riN','fdN','efdN74','efdN6','efdN23']
best_features_MI_4 =  ['convexN','efdN77','efdN4','efdN33','efdN19','extentN','efdN9','efdN60',
                       'efdN17','efdN20','efdN43','raN','efdN18','efdN15','efdN12','efdN63',
                       'efdN44','fdN','riN','eN','eccenN','efdN47','efdN64','circuN','compacN',
                       'efdN10','solidN','efdN29','mrdN','efdN58'] 

# Alternative selections; not referenced by the cross-validation loop in this notebook.
best_features_spfsr_1 = ['ardN','circuN','compacN','efdN75','periN','efdN4','eN','efdN71',
                         'equidiaN','mrdN','efdN66','efdN41','efdN70','areaN','efdN63','extentN',
                         'riN','efdN17','efdN6','solidN','efdN76','maxAxN','elonN','efdN16',
                         'efdN60','convexN','efdN20','efdN30','efdN59','sdnrlN']  
best_features_spfsr_2 =   ['convexN','periN','equidiaN','areaN','solidN','sdnrlN','efdN6','efdN27',
                           'eN','efdN28','efdN52','efdN68','efdN14','efdN18','efdN16','extentN',
                           'compacN','raN','efdN53','efdN13','fdN','ardN','hAreaN','kN','efdN30',
                           'minAxN','efdN59','efdN12','efdN34','efdN1']
best_features_spfsr_3 =    ['solidN','efdN8','efdN9','raN','elonN','efdN42','efdN27','efdN4','efdN76',
                            'efdN56','efdN7','circuN','efdN75','efdN71','efdN36','efdN69','efdN74',
                            'efdN40','efdN68','compacN','efdN63','efdN44','efdN2','extentN','riN',
                            'efdN57','fdN','efdN46','efdN23','efdN18']
best_features_spfsr_4 =  ['convexN','elonN','efdN1','minAxN','raN','hAreaN','efdN49','solidN',
                          'efdN75','efdN57','efdN70','efdN72','efdN68','circuN','efdN44','eccenN',
                          'extentN','efdN74','areaN','efdN4','compacN','periN','efdN23','efdN27',
                          'kN','efdN69','mrdN','efdN25','efdN73','efdN53'] 

In [None]:
# Size of each MI feature list (classifier 1, 2, 3, 4).
tuple(
    map(
        len,
        (best_features_MI_1, best_features_MI_2, best_features_MI_3, best_features_MI_4),
    )
)

## Experiment nº2:   features (only nucleus)

In [None]:
# Experiment configuration.
features_desc = "Selected features N (+EFD's) - hierarchy"
N_FEATURES = 30
N_ITER = 10

# Metric accumulators: six independent length-3 arrays, one slot per algorithm.
accs, precs, recs, specs, f1_scores, aucs = (np.zeros(3) for _ in range(6))

# ROC curves and legend labels of the binary classifier.
labels_list_bin, roc_curve_list_bin = [], []

# Entries later handed to the confusion-matrix plots, one list per labelling.
preds_to_conf_matrix_bin, preds_to_conf_matrix_ter, preds_to_conf_matrix_bet = [], [], []

# Empty result tables sharing the same column layout.
_RESULT_COLUMNS = ['Tipo', 'Model', 'Features', 'Acurácia', 'Precisão',
                   'Sensibil', 'Falso Pos', 'Especif', 'F1_measure']
results_bin = pd.DataFrame(columns=_RESULT_COLUMNS)
results_ter = pd.DataFrame(columns=_RESULT_COLUMNS)
results_bet = pd.DataFrame(columns=_RESULT_COLUMNS)

# Common false-positive-rate grid on which ROC curves are resampled.
mean_fpr = np.linspace(0, 1, 100)


In [None]:
# Buffers for the predictions of the binary, ternary and Bethesda labellings.
# Rows follow the rows of `data`; the second axis has one column per algorithm.
# Prediction buffers start at -1; probability buffers start at 0 and hold two
# values per (sample, algorithm).
_n_samples = data.shape[0]

preds_bin = np.full((_n_samples, 3), -1.0)
probs_bin = np.zeros((_n_samples, 3, 2))

preds_ter = np.full((_n_samples, 3), -1.0)
probs_ter = np.zeros((_n_samples, 3, 2))

preds_bet = np.full((_n_samples, 3), -1.0)
probs_bet = np.zeros((_n_samples, 3, 2))

In [None]:
# Label encoders used to turn the 0-based output of classifiers 2, 3 and 4 back
# into the original labels via inverse_transform:
#   le_2: 0 -> 1, 1 -> 2
#   le_3: 0 -> 1, 1 -> 3
#   le_4: 0 -> 2, 1 -> 4, 2 -> 5
# fit() returns the encoder itself, so construction and fitting are chained.
le_2 = preprocessing.LabelEncoder().fit([1, 2])
le_3 = preprocessing.LabelEncoder().fit([1, 3])
le_4 = preprocessing.LabelEncoder().fit([2, 4, 5])

In [None]:
# Main loop: stratified k-fold cross-validation of the four-stage hierarchy.
#   Classifier 1: normal / abnormal                      -> preds_bin
#   Classifier 2: low-grade / high-grade lesion          -> preds_ter
#   Classifier 3: ASC-US / LSIL                          -> preds_bet
#   Classifier 4: ASC-H / HSIL / carcinoma               -> preds_bet
# For every algorithm, each downstream classifier is tested only on the test
# samples that the same algorithm routed to it in the previous stage.
# Only classifier 1 records probabilities; stages 2-4 record labels only.
# NOTE(review): stages 2-4 select rows with `.loc[idx_...]` using the positional
# indices produced by the splitter - this assumes `data` and `target` carry a
# default 0..n-1 index; confirm.

cv = StratifiedKFold(n_splits=N_ITER, random_state=None)

# Folds are stratified on the Bethesda labels; every fold runs classifiers 1 to 4.
for it, (idx_train, idx_test) in enumerate(cv.split(data.values, target['bethesda'].values)):
    print('Iteração número: ', it)

    ##------------ X -------------
    ## Classifier 1: normal/abnormal
    # Keep only the features selected for classifier 1.
    X_train = data[best_features_MI_1].values[idx_train]
    y_train = target['binary'].values[idx_train]

    X_test = data[best_features_MI_1].values[idx_test]
    y_test = target['binary'].values[idx_test]

    for i, (clf_name, clf_params) in enumerate(zip(classifiers, params)):
        model = functions.getModel(params=clf_params, classifier=clf_name, class_type='binary')
        metr, model = functions.fit_model(X_train, y_train, model, cls_type=1)

        pred_y = model.predict(X_test)
        prob_y = model.predict_proba(X_test)

        # Record the binary predictions of this fold.
        preds_bin[idx_test, i] = pred_y
        probs_bin[idx_test, i] = prob_y

        # Samples predicted as class 0 keep that result in the ternary and
        # Bethesda buffers as well.
        idx_0 = functions.index_pred_from_class(idx_test, pred_y, cls=0)
        preds_ter[idx_0, i] = preds_bin[idx_0, i]
        probs_ter[idx_0, i] = probs_bin[idx_0, i]
        preds_bet[idx_0, i] = preds_bin[idx_0, i]
        probs_bet[idx_0, i] = probs_bin[idx_0, i]

    ##------------ X -------------
    ## Classifier 2: low-grade/high-grade lesions
    # Training: training-fold samples filtered for classifier 2, restricted to
    # the features selected for classifier 2.
    X_df_train2, y_df_train2 = functions.filter_dataXY(data[best_features_MI_2].loc[idx_train],
                                                       target.loc[idx_train], 2)
    X_train2, y_train2 = X_df_train2.values, y_df_train2.values

    for i, (clf_name, clf_params) in enumerate(zip(classifiers, params)):
        # Test: samples selected from this algorithm's classifier-1 predictions.
        idx_test2, X_df_test2, y_df_test2 = functions.filter_Xy_from_cls1_to_cls2(
            data[best_features_MI_2].loc[idx_test], target.loc[idx_test], preds_bin[:, i], idx_test)
        X_test2, y_test2 = X_df_test2.values, y_df_test2.values

        # No sample reached this stage in this fold: nothing to predict
        # (predicting on an empty matrix would raise).
        if len(idx_test2) == 0:
            continue

        model = functions.getModel(params=clf_params, classifier=clf_name, class_type='binary')
        metr, model = functions.fit_model(X_train2, y_train2, model, cls_type=2)

        # Map the encoded output back to the ternary labels 1/2 and record it.
        pred2_y = le_2.inverse_transform(model.predict(X_test2))
        preds_ter[idx_test2, i] = pred2_y

    ##------------ X -------------
    ## Classifier 3: ASC-US/LSIL
    # Training: training-fold samples filtered for classifier 3, restricted to
    # the features selected for classifier 3.
    X_df_train3, y_df_train3 = functions.filter_dataXY(data[best_features_MI_3].loc[idx_train],
                                                       target.loc[idx_train], 3)
    X_train3, y_train3 = X_df_train3.values, y_df_train3.values

    for i, (clf_name, clf_params) in enumerate(zip(classifiers, params)):
        # Test: samples selected from this algorithm's classifier-2 predictions.
        idx_test3, X_df_test3, y_df_test3 = functions.filter_Xy_from_cls1_to_cls3(
            data[best_features_MI_3].loc[idx_test], target.loc[idx_test], preds_ter[:, i], idx_test)
        X_test3, y_test3 = X_df_test3.values, y_df_test3.values

        if len(idx_test3) == 0:
            continue

        model = functions.getModel(params=clf_params, classifier=clf_name, class_type='binary')
        metr, model = functions.fit_model(X_train3, y_train3, model, cls_type=3)

        # Map the encoded output back to the Bethesda labels 1/3 and record it.
        pred3_y = le_3.inverse_transform(model.predict(X_test3))
        preds_bet[idx_test3, i] = pred3_y

    ##------------ X -------------
    ## Classifier 4: ASC-H/HSIL/Car
    # Training: training-fold samples filtered for classifier 4, restricted to
    # the features selected for classifier 4.
    X_df_train4, y_df_train4 = functions.filter_dataXY(data[best_features_MI_4].loc[idx_train],
                                                       target.loc[idx_train], 4)
    X_train4, y_train4 = X_df_train4.values, y_df_train4.values

    for i, (clf_name, clf_params) in enumerate(zip(classifiers, params)):
        # Test: samples selected from this algorithm's classifier-2 predictions.
        idx_test4, X_df_test4, y_df_test4 = functions.filter_Xy_from_cls2_to_cls4(
            data[best_features_MI_4].loc[idx_test], target.loc[idx_test], preds_ter[:, i], idx_test)
        X_test4, y_test4 = X_df_test4.values, y_df_test4.values

        if len(idx_test4) == 0:
            continue

        model = functions.getModel(params=clf_params, classifier=clf_name, class_type='ternary')
        metr, model = functions.fit_model(X_train4, y_train4, model, cls_type=4)

        # Map the encoded output back to the Bethesda labels 2/4/5 and record it.
        pred4_y = le_4.inverse_transform(model.predict(X_test4))
        preds_bet[idx_test4, i] = pred4_y
 
## Results - binary classification (normal/abnormal):
# ROC curve, AUC and metrics are computed once per algorithm over the full
# prediction buffers (all folds together), not per fold.
y_true_bin = target['binary'].values
for i, model_name in enumerate(classifiers):
    y_pred_bin = preds_bin[:, i]

    # ROC curve from the class-1 probabilities, resampled on the common FPR grid.
    prob = probs_bin[:, i, 1]
    fpr, tpr, thresholds = roc_curve(y_true_bin, prob)
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0
    aucs[i] = auc(mean_fpr, interp_tpr)
    labels_list_bin.append(r"ROC Curve (AUC %s= %0.4f)" % ((model_name+"- normal/anormal"), aucs[i]))
    roc_curve_list_bin.append((mean_fpr, interp_tpr))

    # Metrics; accuracy is the only call that passes pos_label/classes.
    accs[i] = functions.calc_metric(y_true_bin, y_pred_bin, metric_type='acc',
                                    class_type='binary', pos_label=1, classes=[0,1])
    for scores, metric_type in ((precs, 'prec'), (recs, 'rec'),
                                (specs, 'spec'), (f1_scores, 'f1_score')):
        scores[i] = functions.calc_metric(y_true_bin, y_pred_bin,
                                          metric_type=metric_type, class_type='binary')

    # Add the row to the result table.
    metrics = {'Model': model_name, 'acc': accs[i], 'prec': precs[i], 'rec': recs[i],
               'spec': specs[i], 'f1_score': f1_scores[i], 'AUC': aucs[i]}
    functions.fill_line_metrics_CV(model_name, features_desc, i, metrics, results_bin,
                                   class_type='1- Normal/Anormal')

    # Collect inputs for the confusion-matrix plots:
    # https://stackoverflow.com/questions/61016110/plot-multiple-confusion-matrices-with-plot-confusion-matrix
    preds_to_conf_matrix_bin.append((y_true_bin, y_pred_bin, "1. Normal/Anormal -"+str(model_name)))
            
  
## Results - ternary classification (normal / low grade / high grade)
# Metrics and confusion-matrix inputs, computed once per algorithm over the
# full prediction buffer (all folds together).
y_true_ter = target['ternary'].values
for i, model_name in enumerate(classifiers):
    y_pred_ter = preds_ter[:, i]

    for scores, metric_type in ((accs, 'acc'), (precs, 'prec'), (recs, 'rec'),
                                (specs, 'spec'), (f1_scores, 'f1_score')):
        scores[i] = functions.calc_metric(y_true_ter, y_pred_ter, metric_type=metric_type,
                                          class_type='ternary', classes=[0,1,2])

    # NOTE(review): 'AUC' is whatever `aucs` currently holds; this cell never
    # writes to it, so it is the value left by the binary ROC step - confirm
    # whether fill_line_metrics_CV uses it.
    metrics = {'Model': model_name, 'acc': accs[i], 'prec': precs[i], 'rec': recs[i],
               'spec': specs[i], 'f1_score': f1_scores[i], 'AUC': aucs[i]}
    functions.fill_line_metrics_CV(model_name, features_desc, i, metrics, results_ter,
                                   class_type='2- Normal/Low G./High G.')

    # Collect inputs for the confusion-matrix plots:
    # https://stackoverflow.com/questions/61016110/plot-multiple-confusion-matrices-with-plot-confusion-matrix
    preds_to_conf_matrix_ter.append((y_true_ter, y_pred_ter, "2- Normal/Low G./High G. -"+str(model_name)))

    
## Results - Bethesda classification (normal/ascus/asch/lsil/hsil/car)
# Metrics and confusion-matrix inputs, computed once per algorithm over the
# full prediction buffer (all folds together).
y_true_bet = target['bethesda'].values
for i, model_name in enumerate(classifiers):
    y_pred_bet = preds_bet[:, i]

    for scores, metric_type in ((accs, 'acc'), (precs, 'prec'), (recs, 'rec'),
                                (specs, 'spec'), (f1_scores, 'f1_score')):
        scores[i] = functions.calc_metric(y_true_bet, y_pred_bet, metric_type=metric_type,
                                          class_type='bethesda', classes=[0,1,2,3,4,5])

    # NOTE(review): 'AUC' is whatever `aucs` currently holds; this cell never
    # writes to it, so it is the value left by the binary ROC step - confirm
    # whether fill_line_metrics_CV uses it.
    metrics = {'Model': model_name, 'acc': accs[i], 'prec': precs[i], 'rec': recs[i],
               'spec': specs[i], 'f1_score': f1_scores[i], 'AUC': aucs[i]}
    functions.fill_line_metrics_CV(model_name, features_desc, i, metrics, results_bet,
                                   class_type='3- Bethesda')

    # Collect inputs for the confusion-matrix plots:
    # https://stackoverflow.com/questions/61016110/plot-multiple-confusion-matrices-with-plot-confusion-matrix
    preds_to_conf_matrix_bet.append((y_true_bet, y_pred_bet, "3- Bethesda -"+str(model_name)))
   

In [None]:
# Show ROC curves, confusion matrices and the metrics table - binary classifier:
functions.plot_roc_curve_CV(roc_curve_list_bin, labels_list_bin, title = "ROC Curve - 1.Normal/Anormal")
functions.plot_conf_matrix(preds_to_conf_matrix_bin, lbls=[0,1], disp_lbls=['normal', 'anormal'])
results_bin
    

In [None]:
# Show confusion matrices and the metrics table - ternary classifier:
functions.plot_conf_matrix(preds_to_conf_matrix_ter, lbls=[0,1,2], disp_lbls=['normal','low g.', 'high g.'])
results_ter


In [None]:
# Show confusion matrices and the metrics table - Bethesda (six-class) classifier:
functions.plot_conf_matrix(preds_to_conf_matrix_bet, lbls=[0,1,2,3,4,5], disp_lbls=['normal','ascus', 'asch', 'lsil', 'hsil', 'car'])
results_bet