In [1]:
import numpy as np
import pandas as pd
import sweetviz as sv
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
sns.set_style('darkgrid')
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, balanced_accuracy_score, matthews_corrcoef
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import resample
from sklearn.metrics import precision_recall_curve

In [6]:
## Ler o dateset
dados = pd.read_csv('dados_seg_geo.csv',sep='|', index_col=0)

In [8]:
## Excluir as colunas REGIAO, que não é utiliziada e UF_AC para evitar multicolinearidade
dados = dados.drop(columns =['REGIAO','UF__AC'])

In [9]:
## Converter os dados para formato numpy para facilitar o processamento e separação entre atributos (X) e alvo(Y):
data_numpy = dados.to_numpy()
nrow,ncol = dados.shape
y = data_numpy[:,-1]
X = data_numpy[:,0:ncol-1]

In [10]:
## Dividir entre treino e teste antes de fazer a padronização para que a informação contida no conjunto 
## de teste (que está junto com o treino antes da divisão) não influencie na transformação dos dados do conjunto de treino.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y, random_state = 0)

print('Shape X_train:', X_train.shape)
print('Shape X_test:', X_test.shape)
print('Shape y_train:', y_train.shape)
print('Shape y_test:', y_test.shape)

Shape X_train: (27637, 55)
Shape X_test: (11845, 55)
Shape y_train: (27637,)
Shape y_test: (11845,)


In [11]:
np.random.seed(42)

#### Executar o balanceamento das classes utilizando undersampling

rus = RandomUnderSampler(sampling_strategy=1)
X_train, y_train = rus.fit_sample(X_train, y_train)
#X_test, y_test = rus.fit_sample(X_test, y_test)
print('X_train_under:',X_train.shape, 'y_train_under:', y_train.shape)  
print('Soma y_train_under (Classe 1):', y_train.sum())
#print('X_test_under:',X_test.shape, 'y_test_under:', y_test.shape)  
#print('Soma y_test_under:', y_test.sum())
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)
print('Soma y_test (Classe 1):', y_test.sum())


X_train_under: (1198, 55) y_train_under: (1198,)
Soma y_train_under (Classe 1): 599.0
X_test: (11845, 55)
y_test: (11845,)
Soma y_test (Classe 1): 257.0


In [12]:
## Importar o módulo resample para realizar reamostragens utlizando a tecnica de Bootstrapping
from sklearn.utils import resample

In [13]:
## Importar o algoritimo de regressão rogística e o módulo 'metrics' do scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [14]:
### CURVA ROC PARA DETERMINAR A PROBABILIDADE DE CORTE PARA DEFINIR CLASSIFICAÇÃO ENTRE 0 E 1 COM BOOTSTRAP 999

np.random.seed(42) ## Fixar seed para reprodutibilidade de resultados

lista_prob=[]
lista_roc=[]
lista_mcc = []

## Realizar reamostragem
for i in range(999):
    X_train_b, y_train_b = resample(X_train, y_train, replace=True, stratify=y_train,  n_samples=len(y_train), random_state=None)
    
   

    
# Gerar o modelo
    model = LogisticRegression(solver='lbfgs',max_iter=10000, random_state=42, fit_intercept=True)
    model.fit(X_train_b, y_train_b)
    
# Realizar predição de probabilidades
    lr_probs = model.predict_proba(X_train_b)
    
# Monter a probailidde da classe de interesse para calcular a métrica AUC a seguir
    lr_probs = lr_probs[:, 1]
    
# Calcular a métrica AUC
    
    lr_auc = roc_auc_score(y_train_b, lr_probs)

# Calcular a curvas ROC da Regressão Logistica

    lr_fpr, lr_tpr, thresholds = roc_curve(y_train_b, lr_probs)

## Calcular média geométrica entre taxa de verdedaeiros positivos e ('1 - taxa de falsos positivos) para cada threshold 
## da curva ROC
    gmeans = np.sqrt(lr_tpr * (1-lr_fpr))

# Localizar o índice da maior média geométrica de cada curva ROC de cada modelo gerado, a prob. correspondente e a métrica AUC
    ix = np.argmax(gmeans)

    limite_prob = thresholds[ix]
    
    lista_prob.append(limite_prob)
   
    lista_roc.append(lr_auc)

 

  
   

In [15]:
## Criar uma lista em ordem decrescente em que cada elemento contém o par AUC e probabilidade de corte, gerados na célula acima,
## ordenada pela métrica AUC

lista_roc_prob = sorted(list(zip(lista_roc, lista_prob)), reverse=True)

In [16]:
### Criar um dataframe para visualizar as cinco maiores métricas AUC associadas às probabilidades de corte
data_roc_prob = pd.DataFrame(lista_roc_prob[0:5], columns=['ROC_AUC','Probabilidade'])
data_roc_prob

Unnamed: 0,ROC_AUC,Probabilidade
0,0.964386,0.364891
1,0.962142,0.321733
2,0.961906,0.434045
3,0.961718,0.420141
4,0.961693,0.453288


In [17]:
# Calcular as ponderações de cada métrica AUC e as probabilidades de corte ponderadas
data_roc_prob['Percentual_ROC'] = data_roc_prob['ROC_AUC']/data_roc_prob['ROC_AUC'].sum() 
data_roc_prob['Probabibidade_Ponderada'] = data_roc_prob['Percentual_ROC'] * data_roc_prob["Probabilidade"].sum() 
data_roc_prob

Unnamed: 0,ROC_AUC,Probabilidade,Percentual_ROC,Probabibidade_Ponderada
0,0.964386,0.364891,0.200419,0.399655
1,0.962142,0.321733,0.199953,0.398725
2,0.961906,0.434045,0.199904,0.398628
3,0.961718,0.420141,0.199865,0.39855
4,0.961693,0.453288,0.19986,0.398539


In [18]:
# Cálculo da média das cinco probabilidades ponderadas geradas na célula anterior
ROC_AUC_Ponderada = data_roc_prob['Probabibidade_Ponderada'].sum()/5
ROC_AUC_Ponderada

0.39881952939475035

In [19]:
### REGRESSÃO LOGÍSTICA ESTIMAÇÃO DE 999 MODELOS UTILIZANDO REAMOSTRAGEM BOOTSTRAP 
### E APLICAÇÃO AOS DADOS DE TESTE, UTILIZANDO A PROBABILIDADE DE CORTE CALCULADA NA CÉLULA ANTERIOR

np.random.seed(42) ## Fixar seed para reprodutibilidade de resultados

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

metricas = pd.DataFrame()


log_mcc=[]
log_bacc = []
log_sens = []
log_spec=[]
log_TP = []
log_TN = []
log_FP =[]
log_FN = []
log_acc =[]
coefs =[]
intercept=[]

weights = {0:1, 1:45}

for i in range(999):
    
    # Realizar reamostragens
    X_train_b, y_train_b = resample(X_train, y_train, replace=True,  stratify=y_train,  n_samples=len(y_train), random_state=None)
  
  
    model = LogisticRegression(solver = 'lbfgs',  multi_class = 'auto', max_iter = 10000, random_state=42, fit_intercept = True)
    
    # Gerar o modelo a cada reamostragem
    model.fit(X_train_b, y_train_b)

   # Predição utilizando dados de teste    
    prob = model.predict_proba(X_test)
    
    # Aplicação da probabilidade de corte (classe 1 se maior ou igual que prob. corte ou classe 0 se menor que prob. corte)
     
    roc_predictions_teste = [1 if p >= 0.39881952939475035 else 0 for p in prob[:,1]]
    
    
    # Cálculo de métricas (MCC, Acurácia, Acurácia Balanceada)  
       
    log_bacc.append(metrics.balanced_accuracy_score(y_test, roc_predictions_teste, sample_weight=None, adjusted=False))
   
    log_mcc.append(metrics.matthews_corrcoef(y_test, roc_predictions_teste, sample_weight=None))
    
    log_acc.append(metrics.accuracy_score(y_test, roc_predictions_teste))
    
    
    # Inserir coeficientes dos modeos em lista
    
    coefs.append(model.coef_[0].tolist())
    
    intercept.append(model.intercept_[0])
    
# Matriz de confusão para cada 
    CM = metrics.confusion_matrix(y_test, roc_predictions_teste)

    TP = CM[0][0]
    FN = CM[0][1]
    TN = CM[1][1]
    FP = CM[1][0]

# Sensibilidade
    TPR = TP/(TP+FN)
    log_sens.append(TPR)
    
# Especificifdade
    TNR = TN/(TN+FP) 
    
    log_spec.append(TNR)
    log_TP.append(TP)
    log_FN.append(FN)
    log_TN.append(TN)
    log_FP.append(FP)

    
print('Média Acurácia Balanceada:',sum(log_bacc)/len(log_bacc))
print('Média Acurácia:',sum(log_acc)/len(log_acc))
print('Média MCC:',sum(log_mcc)/len(log_mcc))
print('Média Sensibilidade:', sum(log_sens)/len(log_sens))
print('Média Especificidade:', sum(log_spec)/len(log_spec))

metricas['MCC'] = log_mcc
metricas['Bal_Acc'] = log_bacc
metricas['Accuracy'] = log_acc
metricas['Sensitivity'] = log_sens
metricas['Specificity'] = log_spec
metricas['V_Pos']  = log_TP
metricas['V_Neg'] = log_TN
metricas['F_Pos'] = log_FP
metricas['F_Neg'] = log_FN


coeficientes = pd.DataFrame(coefs, columns=dados.columns[:-1])
coeficientes['Intercept'] = intercept

Média Acurácia Balanceada: 0.8412248532949007
Média Acurácia: 0.8312266677821759
Média MCC: 0.2570787312624452
Média Sensibilidade: 0.8307731272867608
Média Especificidade: 0.8516765793030275


In [20]:
## Visualização dos coeficientes de 10 modelos gerados

coeficientes.head(10)

Unnamed: 0,UF__AL,UF__AM,UF__AP,UF__BA,UF__CE,UF__ES,UF__GO,UF__MA,UF__MG,UF__MS,...,Perc_serv_est_alto,Perc_serv_est_superior,Perc_serv_pop_medio,Perc_serv_pop_normal,Perc_serv_pop_alto,Perc_serv_pop_superior,dum_serv,dum_bem,dum_outros,Intercept
0,0.173999,2.629011,1.927273,-2.032936,0.991192,-0.675525,0.015237,-0.495297,-0.658648,-0.82527,...,-0.411395,-0.690152,0.957151,0.508229,0.316673,-0.360307,4.310875,0.9186,4.117915,-1.406616
1,0.563014,1.925399,0.838383,-1.945798,0.162417,-0.795195,-0.351428,1.063425,-1.070898,-0.449359,...,-0.072156,-0.011109,0.433801,0.718008,1.142894,-0.123209,3.956767,0.331483,3.470462,-0.798628
2,0.18032,2.417449,2.15285,-1.728652,0.413598,-0.901947,-0.157825,0.950307,-0.385895,0.167797,...,-0.367482,0.044519,1.127614,1.024658,0.816716,-0.536987,3.690509,0.685638,3.386277,-0.456229
3,-0.874377,2.276535,1.519727,-1.729615,0.783044,-1.03164,-0.566099,1.212506,-1.144101,-0.950747,...,-0.277382,-0.249462,1.480321,1.401324,0.063788,0.080704,3.467623,0.559693,3.365052,-1.46247
4,0.067675,1.502948,1.825346,-1.795141,-0.224152,-1.147785,0.14254,-0.356854,-1.001803,-0.316973,...,-0.549632,-0.458499,0.962333,1.108898,-0.028448,-0.079715,3.703086,0.345501,4.165398,-0.439773
5,1.139004,1.494781,1.563891,-2.065403,0.177581,-0.603336,-0.541206,0.322505,-0.530364,-0.700441,...,-0.629709,-0.297946,1.13206,0.657713,0.773369,-0.280853,3.698392,0.562273,3.613839,-0.434817
6,0.496385,2.394691,1.53273,-2.524363,0.186881,-0.666629,-0.42666,-0.035463,-0.976595,0.416028,...,-0.879192,-0.332407,1.125614,0.830196,0.068509,-0.030313,4.653768,0.887311,3.841447,-0.747671
7,0.661248,1.015887,1.608609,-1.784312,0.529889,-0.852152,-0.729313,0.192246,-1.639919,-0.768562,...,-1.027762,-0.326229,0.888025,0.560288,0.436131,-0.095616,3.434292,0.474294,3.871532,-0.322097
8,-0.202859,2.184015,1.669584,-1.535053,0.799242,-0.689065,-0.697487,-0.053706,-0.92954,-0.029143,...,-0.499237,-0.320473,0.876607,0.988913,-0.331642,-0.201651,4.037489,0.459065,4.142773,-0.911069
9,-0.047582,1.976874,2.176958,-1.511275,0.705637,-1.006134,-0.302493,-0.111377,-0.709899,-0.292115,...,-0.417422,-0.388858,0.772376,0.496555,0.764425,0.030222,3.513681,0.693836,3.908609,-0.8845


In [21]:
## Visuluaização de métricas de 10 modelos
metricas.head(10)

Unnamed: 0,MCC,Bal_Acc,Accuracy,Sensitivity,Specificity,V_Pos,V_Neg,F_Pos,F_Neg
0,0.265368,0.844861,0.841621,0.841474,0.848249,9751,218,39,1837
1,0.259582,0.845348,0.831406,0.830773,0.859922,9627,221,36,1961
2,0.255261,0.842931,0.826678,0.825941,0.859922,9571,221,36,2017
3,0.25092,0.844889,0.815618,0.814291,0.875486,9436,225,32,2152
4,0.246016,0.833035,0.822203,0.821712,0.844358,9522,217,40,2066
5,0.254154,0.845704,0.820937,0.819814,0.871595,9500,224,33,2088
6,0.276361,0.855232,0.847024,0.846652,0.863813,9811,222,35,1777
7,0.250071,0.836534,0.825327,0.824819,0.848249,9558,218,39,2030
8,0.275139,0.847184,0.853609,0.853901,0.840467,9895,216,41,1693
9,0.262888,0.849498,0.832081,0.831291,0.867704,9633,223,34,1955


In [22]:
# Adicionando a média geométrica entre especifcicidade e sensibilidade às métricas para posterior ordenamento decrescente
metricas['GMean_Sen_Spe'] = np.sqrt(metricas['Sensitivity']*metricas['Specificity'])

In [23]:
## Ordenação em ordem decrescente pela média geométrica calculada na célula anterior
metricas_ord = metricas.sort_values(by='GMean_Sen_Spe', ascending=False)

In [24]:
# Visualização das métricas ordenadas conforme célula anterior
metricas_ord.head(10)

Unnamed: 0,MCC,Bal_Acc,Accuracy,Sensitivity,Specificity,V_Pos,V_Neg,F_Pos,F_Neg,GMean_Sen_Spe
724,0.292472,0.862611,0.861461,0.861408,0.863813,9982,222,35,1606,0.86261
26,0.291569,0.860924,0.861883,0.861926,0.859922,9988,221,36,1600,0.860924
72,0.27787,0.8609,0.843225,0.842423,0.879377,9762,226,31,1826,0.860702
189,0.288038,0.860669,0.857661,0.857525,0.863813,9937,222,35,1651,0.860663
434,0.278782,0.86012,0.84542,0.844753,0.875486,9789,225,32,1799,0.859983
308,0.273352,0.858613,0.838751,0.837849,0.879377,9709,226,31,1879,0.858362
208,0.267894,0.858103,0.830308,0.829047,0.88716,9607,228,29,1981,0.857611
925,0.274721,0.85688,0.842803,0.842164,0.871595,9759,224,33,1829,0.856753
318,0.274378,0.856707,0.842465,0.841819,0.871595,9755,224,33,1833,0.856578
386,0.269961,0.856844,0.835289,0.834311,0.879377,9668,226,31,1920,0.856548


In [25]:
### Verificar a significância estatística dos coeficientes dos com intervalo de confiança de 95%
stats_coef = pd.DataFrame()
media = np.mean(coeficientes)
desvio = np.std(coeficientes)
stats_coef['Média Coeficientes'] = media
stats_coef['Desvio-Padrão Coeficientes'] = desvio

li_95=[]
ls_95=[]
for x in coeficientes.columns:
    li_95.append(np.percentile(coeficientes[x], 2.5))
    ls_95.append(np.percentile(coeficientes[x], 97.5))
    
stats_coef['IC_95_LInf'] =li_95
stats_coef['IC_95_LSup'] =ls_95


stats_coef

Unnamed: 0,Média Coeficientes,Desvio-Padrão Coeficientes,IC_95_LInf,IC_95_LSup
UF__AL,0.164863,0.45165,-0.702566,1.083406
UF__AM,1.943238,0.361015,1.194385,2.619564
UF__AP,1.52117,0.499929,0.451082,2.355039
UF__BA,-1.904777,0.359883,-2.59287,-1.203643
UF__CE,0.452161,0.306901,-0.125377,1.071377
UF__ES,-0.847462,0.215185,-1.256427,-0.388018
UF__GO,-0.17115,0.335678,-0.810665,0.489559
UF__MA,0.104634,0.468116,-0.877713,0.969611
UF__MG,-0.830611,0.287701,-1.393385,-0.251949
UF__MS,-0.294735,0.381881,-1.028894,0.424078


In [50]:
### Exclusão de coeficientes sem significância estatística
dados_exc = dados.drop(columns=['Sem_Emp_Rais','Socio_emp_cont', 'Cont_Doa', 'val_fpm_medio', 'val_fpm_normal', 'val_fpm_alto',
       'val_fpm_superior' ])

In [51]:
## Converter os dados para formato numpy para facilitar o processamento e separação entre atributos (X) e alvo(Y):
data_numpy_exc = dados_exc.to_numpy()
nrow,ncol = dados_exc.shape
y_exc = data_numpy_exc[:,-1]
X_exc = data_numpy_exc[:,0:ncol-1]

In [52]:
## Dividir entre treino e teste.

from sklearn.model_selection import train_test_split

X_train_exc, X_test_exc, y_train_exc, y_test_exc = train_test_split(X_exc, y_exc, test_size = 0.3, stratify = y_exc, random_state = 0)

print('Shape X_train:', X_train_exc.shape)
print('Shape X_test:', X_test_exc.shape)
print('Shape y_train:', y_train_exc.shape)
print('Shape y_test:', y_test_exc.shape)

Shape X_train: (27637, 48)
Shape X_test: (11845, 48)
Shape y_train: (27637,)
Shape y_test: (11845,)


In [53]:
np.random.seed(42)

#### Executar o balanceamento das classes utilizando undersampling após exclusão de coeficientes

rus = RandomUnderSampler(sampling_strategy=1)
X_train_exc, y_train_exc = rus.fit_sample(X_train_exc, y_train_exc)

print('X_train_under:',X_train_exc.shape, 'y_train_under:', y_train_exc.shape)  
print('Soma y_train_under (Classe 1):', y_train_exc.sum())

print('X_test:', X_test_exc.shape)
print('y_test:', y_test_exc.shape)
print('Soma y_test (Classe 1):', y_test_exc.sum())


X_train_under: (1198, 48) y_train_under: (1198,)
Soma y_train_under (Classe 1): 599.0
X_test: (11845, 48)
y_test: (11845,)
Soma y_test (Classe 1): 257.0


In [65]:
### CURVA ROC PARA DETERMINAR A PROBABILIDADE DE CORTE PARA DEFINIR CLASSIFICAÇÃO ENTRE 0 E 1 COM BOOTSTRAP 999

np.random.seed(42)

lista_probx=[]
lista_rocx=[]
lista_mccx = []


for i in range(999):
    
## Realizar  reamostragem    
    X_train_bx, y_train_bx = resample(X_train_exc, y_train_exc, replace=True, stratify=y_train_exc,  n_samples=len(y_train_exc), random_state=None)
    

# Gerar o modelo de cada reamostargem 

    model = LogisticRegression(solver='lbfgs',max_iter=10000, random_state=42, fit_intercept=False)
    model.fit(X_train_bx, y_train_bx)
    
# Predizer probabilidades

    lr_probs = model.predict_proba(X_train_bx)
    
# Monter a probailidde da classe de interesse para calcular a métrica AUC a seguir
    lr_probs = lr_probs[:, 1]

# Calcular a métrica AUC
   
    lr_auc = roc_auc_score(y_train_bx, lr_probs)


# Calcular a curvas ROC de cada modelo de regressão Logística
  
    lr_fpr, lr_tpr, thresholds = roc_curve(y_train_bx, lr_probs)

    
## Calcular média geométrica entre taxa de verdedaeiros positivos e ('1 - taxa de falsos positivos) para cada threshold 
## da curva ROC
    gmeans = np.sqrt(lr_tpr * (1-lr_fpr))

# Localizar o índice da maior média geométrica de cada curva ROC de cada modelo gerado, a prob. correspondente e a métrica AUC

    ix = np.argmax(gmeans)

    limite_prob = thresholds[ix]
    
    lista_probx.append(limite_prob)
  
    lista_rocx.append(lr_auc)


In [66]:
## Criar uma lista em ordem decrescente em que cada elemento contém o par AUC e probabilidade de corte, gerados na célula acima,
## ordenada pela métrica AUC

lista_roc_prob_x = sorted(list(zip(lista_rocx, lista_probx)), reverse=True)

In [67]:
### Criar um dataframe para visualizar as cinco maiores métricas AUC associadas às probabilidades de corte
data_roc_prob_x = pd.DataFrame(lista_roc_prob_x[0:5], columns=['ROC_AUC','Probabilidade'])
data_roc_prob_x

Unnamed: 0,ROC_AUC,Probabilidade
0,0.963431,0.37159
1,0.960878,0.402683
2,0.960786,0.406335
3,0.960567,0.376278
4,0.959988,0.398502


In [68]:
# Calcular as ponderações de cada métrica AUC e as probabilidades de corte ponderadas
data_roc_prob_x['Percentual_ROC'] = data_roc_prob_x['ROC_AUC']/data_roc_prob_x['ROC_AUC'].sum() 
data_roc_prob_x['Probabibidade_Ponderada'] = data_roc_prob_x['Percentual_ROC'] * data_roc_prob_x["Probabilidade"].sum() 
data_roc_prob_x

Unnamed: 0,ROC_AUC,Probabilidade,Percentual_ROC,Probabibidade_Ponderada
0,0.963431,0.37159,0.200479,0.392014
1,0.960878,0.402683,0.199948,0.390975
2,0.960786,0.406335,0.199928,0.390938
3,0.960567,0.376278,0.199883,0.390849
4,0.959988,0.398502,0.199762,0.390613


In [69]:
# Cálculo da média das cinco probabilidades ponderadas geradas na célula anterior
ROC_AUC_Ponderada_x = data_roc_prob_x['Probabibidade_Ponderada'].sum()/5
ROC_AUC_Ponderada_x

0.39107768815102084

In [64]:
### REGRESSÃO LOGÍSTICA ESTIMAÇÃO DE DIVERSOS MODELOS COM BOOTSTRAP E DADOS DE TESTE PARA 999 REAMOSTRAGENS

np.random.seed(42)

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

metricas_x = pd.DataFrame()


log_mccx=[]
log_baccx = []
log_sensx = []
log_specx=[]
log_TPx = []
log_TNx = []
log_FPx =[]
log_FNx = []
log_accx =[]
coefsx =[]
interceptx=[]

weights = {0:1, 1:45}

for i in range(999):
    X_train_bx, y_train_bx = resample(X_train_exc, y_train_exc, replace=True,  stratify=y_train_exc,  n_samples=len(y_train_exc), random_state=None)
  
    # X_test_b, y_test_b = resample(X_test, y_test, replace=True , stratify=y_test, n_samples=len(y_test), random_state=None)
    
    modelx = LogisticRegression(solver = 'lbfgs',  multi_class = 'auto', max_iter = 1000, random_state=42, fit_intercept=False)
    
    modelx.fit(X_train_bx, y_train_bx)

    # Predição utilizando dados de teste  
    
    probx = modelx.predict_proba(X_test_exc)

   
    # Aplicação da probabilidade de corte (classe 1 se maior ou igual que prob. corte ou classe 0 se menor que prob. corte)
    
    roc_predictions_testex = [1 if p >= 0.39107768817574284 else 0 for p in probx[:,1]]

    ## Calculo de métricas (MCC, Acurácia, Acurácia Balanceada)    
    log_baccx.append(metrics.balanced_accuracy_score(y_test_exc, roc_predictions_testex, sample_weight=None, adjusted=False))
    
    log_mccx.append(metrics.matthews_corrcoef(y_test_exc, roc_predictions_testex, sample_weight=None))
    
    log_accx.append(metrics.accuracy_score(y_test_exc, roc_predictions_testex))
    
    
    ## Inserir coeficientes dos modeos em lista
    
    coefsx.append(modelx.coef_[0].tolist())
    
    interceptx.append(modelx.intercept_[0])
    
   # Matriz de Confusão 
    
    CMx = metrics.confusion_matrix(y_test_exc, roc_predictions_testex)

    TPx = CMx[0][0]
    FNx = CMx[0][1]
    TNx = CMx[1][1]
    FPx = CMx[1][0]

# Sensibildiade
    TPRx = TPx/(TPx+FNx)
    log_sensx.append(TPRx)
    
# Especificidade
    TNRx = TNx/(TNx+FPx) 
    
    log_specx.append(TNRx)
    log_TPx.append(TPx)
    log_FNx.append(FNx)
    log_TNx.append(TNx)
    log_FPx.append(FPx)

    
print('Média Acurácia Balanceada:',sum(log_baccx)/len(log_baccx))
print('Média Acurácia:',sum(log_accx)/len(log_accx))
print('Média MCC:',sum(log_mccx)/len(log_mccx))
print('Média Sensibilidade:', sum(log_sensx)/len(log_sensx))
print('Média Especificidade:', sum(log_specx)/len(log_specx))

metricas_x['MCC'] = log_mccx
metricas_x['Bal_Acc'] = log_baccx
metricas_x['Accuracy'] = log_accx
metricas_x['Sensitivity'] = log_sensx
metricas_x['Specificity'] = log_specx
metricas_x['V_Pos']  = log_TPx
metricas_x['V_Neg'] = log_TNx
metricas_x['F_Pos'] = log_FPx
metricas_x['F_Neg'] = log_FNx


coeficientes_x = pd.DataFrame(coefsx, columns=dados_exc.columns[:-1])
coeficientes_x['Intercept'] = interceptx

Média Acurácia Balanceada: 0.8426695186070906
Média Acurácia: 0.8268473623475736
Média MCC: 0.25570111710999566
Média Sensibilidade: 0.8261296332576971
Média Especificidade: 0.8592094039564774


In [70]:
# Adicionando a média geométrica entre especifcicidade e sensibilidade às métricas para posterior ordenamento decrescente
metricas_x['GMean_Sen_Spe'] = np.sqrt(metricas_x['Sensitivity']*metricas_x['Specificity'])

In [71]:
## Ordenação em ordem decrescente pela média geométrica calculada na célula anterior
metricas_ord_x = metricas_x.sort_values(by='GMean_Sen_Spe', ascending=False)

In [72]:
# Visualização das métricas ordenadas conforme célula anterior
metricas_ord_x.head(10)

Unnamed: 0,MCC,Bal_Acc,Accuracy,Sensitivity,Specificity,V_Pos,V_Neg,F_Pos,F_Neg,GMean_Sen_Spe
26,0.286988,0.862748,0.854285,0.853901,0.871595,9895,224,33,1693,0.862703
386,0.269682,0.861434,0.829379,0.827925,0.894942,9594,230,27,1994,0.860782
755,0.278342,0.859904,0.844998,0.844322,0.875486,9784,225,32,1804,0.859763
724,0.285732,0.859634,0.855635,0.855454,0.863813,9913,222,35,1675,0.859623
705,0.271256,0.859916,0.833854,0.832672,0.88716,9649,228,29,1939,0.859484
208,0.266798,0.859837,0.826256,0.824732,0.894942,9557,230,27,2031,0.85912
255,0.281608,0.85899,0.850654,0.850276,0.867704,9853,223,34,1735,0.858946
382,0.274361,0.859131,0.839764,0.838885,0.879377,9721,226,31,1867,0.858893
189,0.283841,0.858771,0.853947,0.853728,0.863813,9893,222,35,1695,0.858756
318,0.27394,0.858916,0.839341,0.838454,0.879377,9716,226,31,1872,0.858672


In [74]:
### Verificar a significância estatística dos coeficientes dos com intervalo de confiança de 95%

stats_coef_x = pd.DataFrame()
media = np.mean(coeficientes_x)
desvio = np.std(coeficientes_x)
stats_coef_x['Média Coeficientes'] = media
stats_coef_x['Desvio-Padrão Coeficientes'] = desvio

li_95_x=[]
ls_95_x=[]
for x in coeficientes_x.columns:
    li_95_x.append(np.percentile(coeficientes_x[x], 2.5))
    ls_95_x.append(np.percentile(coeficientes_x[x], 97.5))
    
stats_coef_x['IC_95_LInf'] =li_95_x
stats_coef_x['IC_95_LSup'] =ls_95_x


stats_coef_x

Unnamed: 0,Média Coeficientes,Desvio-Padrão Coeficientes,IC_95_LInf,IC_95_LSup
UF__AL,0.057081,0.450548,-0.781525,0.968982
UF__AM,1.951879,0.366204,1.219264,2.63026
UF__AP,1.521403,0.501156,0.408618,2.361522
UF__BA,-1.940738,0.356346,-2.62425,-1.269103
UF__CE,0.441838,0.311555,-0.139233,1.050319
UF__ES,-0.872449,0.21275,-1.282783,-0.417759
UF__GO,-0.204779,0.339062,-0.842019,0.447439
UF__MA,0.107657,0.469269,-0.860282,0.965086
UF__MG,-0.84504,0.286788,-1.408311,-0.271482
UF__MS,-0.28156,0.385072,-1.014164,0.456227
