In [63]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
sns.set_style('darkgrid')
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, balanced_accuracy_score, matthews_corrcoef
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import resample
from sklearn.metrics import precision_recall_curve

In [64]:
dataset = pd.read_csv('dados_seg_geo.csv', sep ='|', encoding='utf8', index_col=0)

In [65]:
## Segmentar os dados por região geográfica do Brasil

dataset_co = dataset[dataset['REGIAO']=='CO']
dataset_ne = dataset[dataset['REGIAO']=='NE']
dataset_sul = dataset[dataset['REGIAO']=='SUL']
dataset_no = dataset[dataset['REGIAO']=='NO']
dataset_se = dataset[dataset['REGIAO']=='SE']
 
dataset_co_se_sul = dataset[(dataset['REGIAO']=='CO') | (dataset['REGIAO']=='SE') | (dataset['REGIAO']=='SUL')] 
dataset_ne_no = dataset[(dataset['REGIAO']=='NE') | (dataset['REGIAO']=='NO')]

In [66]:
# Exibir a quantidade e o percentual de registros em cada região
print("----------------------------------Regiões-----------------------------------------")
print("CO: ",dataset_co.COD_PC.sum(),'Total:', dataset_co.shape[0],'Percentual:', dataset_co.COD_PC.sum()/dataset_co.shape[0])
print("NE: ",dataset_ne.COD_PC.sum(),'Total:', dataset_ne.shape[0],'Percentual:', dataset_ne.COD_PC.sum()/dataset_ne.shape[0])
print("NO: ",dataset_no.COD_PC.sum(),'Total:', dataset_no.shape[0],'Percentual:', dataset_no.COD_PC.sum()/dataset_no.shape[0])
print("SUL: ",dataset_sul.COD_PC.sum(),'Total:', dataset_sul.shape[0],'Percentual:', dataset_sul.COD_PC.sum()/dataset_sul.shape[0])
print("SE: ",dataset_se.COD_PC.sum(),'Total:', dataset_se.shape[0],'Percentual:', dataset_se.COD_PC.sum()/dataset_se.shape[0])
print("------------------------- Grupos de Regiões ------------------------------------------")
print("Total de PC rejeitadas em CO/SE/SUL: ",dataset_co_se_sul.COD_PC.sum(),'Total de convênios:', dataset_co_se_sul.shape[0],'PC Reprovadas/Total:', dataset_co_se_sul.COD_PC.sum()/dataset_co_se_sul.shape[0])
print("Total de PC rejeitadas em NE/NO: ",dataset_ne_no.COD_PC.sum(),'Total de convênios: :', dataset_ne_no.shape[0],'PC Reprovadas/Total:', dataset_ne_no.COD_PC.sum()/dataset_ne_no.shape[0])


----------------------------------Regiões-----------------------------------------
CO:  74.0 Total: 2877 Percentual: 0.025721237400069517
NE:  358.0 Total: 5935 Percentual: 0.0603201347935973
NO:  171.0 Total: 3133 Percentual: 0.054580274497286946
SUL:  86.0 Total: 16060 Percentual: 0.005354919053549191
SE:  167.0 Total: 11477 Percentual: 0.0145508408120589
------------------------- Grupos de Regiões ------------------------------------------
Total de PC rejeitadas em CO/SE/SUL:  327.0 Total de convênios: 30414 PC Reprovadas/Total: 0.010751627539948707
Total de PC rejeitadas em NE/NO:  529.0 Total de convênios: : 9068 PC Reprovadas/Total: 0.05833700926334363


In [86]:
### Dados CO/SE/SUL - São eliminados estados das regiões Norte e Nordeste, bem como o identificador
### da região geográfica além de um dos estados do grupo CO/SE/SUL para evitar ocorrência de multicolinearidde.

dados_cosesul = dataset_co_se_sul.drop(columns=['REGIAO','UF__AC','UF__AL', 'UF__AM', 'UF__AP','UF__BA', 'UF__CE', 'UF__ES', 'UF__MA',  'UF__PA', 'UF__PB', 'UF__PE', 'UF__PI', 'UF__RN', 'UF__RO', 'UF__RR', 'UF__SE',  'UF__TO'])

In [68]:
### Dados NE/NO - São eliminados estados das regiões Centro-Oeste, Sul e Sudeste, bem como 
### o identificador da região geográfica, além de um dos estados do grupo NE/NO para evitar ocorrência de multicolinearidde.

dados_neno = dataset_ne_no.drop(columns=['REGIAO','UF__ES','UF__RJ', 'UF__MG', 'UF__SP','UF__PR', 'UF__SC', 'UF__RS', 'UF__GO', 'UF__MT', 'UF__MS', 'UF__AL'])

In [87]:
## Converter os dados para formato numpy para facilitar o processamento e separação entre atributos (X) e alvo(Y):
## Norte e Nordeste
data_numpy_neno = dados_neno.to_numpy()
nrow,ncol = dados_neno.shape
y_neno = data_numpy_neno[:,-1]
X_neno = data_numpy_neno[:,0:ncol-1]

## Centro-Oeste, Sul e Sudeste
data_numpy_cosesul = dados_cosesul.to_numpy()
nrow,ncol = dados_cosesul.shape
y_cosesul = data_numpy_cosesul[:,-1]
X_cosesul = data_numpy_cosesul[:,0:ncol-1]


In [88]:
## Dividir entre treino e teste antes de fazer a padronização para que a informação contida no conjunto 
## de teste (que está junto com o treino antes da divisão) não influencie na transformação dos dados do conjunto de treino.

from sklearn.model_selection import train_test_split


# Norte e Nordeste
X_train_neno, X_test_neno, y_train_neno, y_test_neno = train_test_split(X_neno, y_neno, test_size = 0.3, stratify = y_neno, random_state = 0)
print('Norte e Nordeste')
print('Shape X_train:', X_train_neno.shape)
print('Shape X_test:', X_test_neno.shape)
print('Shape y_train:', y_train_neno.shape)
print('Shape y_test:', y_test_neno.shape)
print()

# Centro-Oeste, Sul e Sudeste
X_train_cosesul, X_test_cosesul, y_train_cosesul, y_test_cosesul = train_test_split(X_cosesul, y_cosesul, test_size = 0.3, stratify = y_cosesul, random_state = 0)
print('Centro-Oeste, Sul e Sudeste')
print('Shape X_train:', X_train_cosesul.shape)
print('Shape X_test:', X_test_cosesul.shape)
print('Shape y_train:', y_train_cosesul.shape)
print('Shape y_test:', y_test_cosesul.shape)
print()




Norte e Nordeste
Shape X_train: (6347, 45)
Shape X_test: (2721, 45)
Shape y_train: (6347,)
Shape y_test: (2721,)

Centro-Oeste, Sul e Sudeste
Shape X_train: (21289, 39)
Shape X_test: (9125, 39)
Shape y_train: (21289,)
Shape y_test: (9125,)



In [71]:
np.random.seed(42)

#### Executar o balanceamento das classes utilizando undersampling
# Norte e Nordeste

rus = RandomUnderSampler(sampling_strategy=1)


X_train_neno, y_train_neno = rus.fit_sample(X_train_neno, y_train_neno)
#X_test, y_test = rus.fit_sample(X_test, y_test)
print('X_train_under:',X_train_neno.shape, 'y_train_under:', y_train_neno.shape)  
print('Soma y_train_under (Classe 1):', y_train_neno.sum())
#print('X_test_under:',X_test.shape, 'y_test_under:', y_test.shape)  
#print('Soma y_test_under:', y_test.sum())
print('X_test:', X_test_neno.shape)
print('y_test:', y_test_neno.shape)
print('Soma y_test (Classe 1):', y_test_neno.sum())

X_train_under: (740, 45) y_train_under: (740,)
Soma y_train_under (Classe 1): 370.0
X_test: (2721, 45)
y_test: (2721,)
Soma y_test (Classe 1): 159.0


In [89]:
np.random.seed(42)

#### Executar o balanceamento das classes utilizando undersampling
# Centro-Oeste, Sul e Sudeste

rus1 = RandomUnderSampler(sampling_strategy=1)

X_train_cosesul, y_train_cosesul = rus1.fit_sample(X_train_cosesul, y_train_cosesul)

print('X_train_under:',X_train_cosesul.shape, 'y_train_under:', y_train_cosesul.shape)  
print('Soma y_train_under (Classe 1):', y_train_cosesul.sum())
print('X_test:', X_test_cosesul.shape)
print('y_test:', y_test_cosesul.shape)
print('Soma y_test (Classe 1):', y_test_cosesul.sum())

X_train_under: (458, 39) y_train_under: (458,)
Soma y_train_under (Classe 1): 229.0
X_test: (9125, 39)
y_test: (9125,)
Soma y_test (Classe 1): 98.0


In [72]:
## Importar o módulo resample para realizar reamostragens utlizando a tecnica de Bootstrapping
from sklearn.utils import resample

In [73]:
## Importar o algoritimo de regressão rogística e o módulo 'metrics' do scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### MODELOS PARA NORTE E NORDESTE

In [74]:
### CURVA ROC PARA DETERMINAR A PROBABILIDADE DE CORTE PARA DEFINIR CLASSIFICAÇÃO ENTRE 0 E 1 COM BOOTSTRAP 999
### NORTE E NORDESTE

np.random.seed(42) ## Fixar seed para reprodutibilidade de resultados

lista_prob=[]
lista_roc=[]

## Realizar reamostragem
for i in range(999):
    X_train_b, y_train_b = resample(X_train_neno, y_train_neno, replace=True, stratify = y_train_neno,  n_samples=len(y_train_neno), random_state=None)
    
   
    
# Gerar o modelo
    model_1 = LogisticRegression(solver ='lbfgs', max_iter = 10000, random_state = 42, fit_intercept = True)
    
    model_1.fit(X_train_b, y_train_b)
    
# Realizar predição de probabilidades
    lr_probs = model_1.predict_proba(X_train_b)
    
# Monter a probailidde da classe de interesse para calcular a métrica AUC a seguir
    lr_probs = lr_probs[:, 1]
    
# Calcular a métrica AUC
    
    lr_auc = roc_auc_score(y_train_b, lr_probs)

# Calcular a curva ROC da Regressão Logistica

    lr_fpr, lr_tpr, thresholds = roc_curve(y_train_b, lr_probs)

## Calcular média geométrica entre taxa de verdedaeiros positivos e ('1 - taxa de falsos positivos) para cada threshold 
## da curva ROC
    gmeans = np.sqrt(lr_tpr * (1-lr_fpr))

# Localizar o índice da maior média geométrica de cada curva ROC de cada modelo gerado, a prob. correspondente e a métrica AUC
    ix = np.argmax(gmeans)

    limite_prob = thresholds[ix]
    
    lista_prob.append(limite_prob)
   
    lista_roc.append(lr_auc)

 

In [75]:
## Criar uma lista em ordem decrescente em que cada elemento contém o par AUC e probabilidade de corte, gerados na célula acima,
## ordenada pela métrica AUC

lista_roc_prob_neno = sorted(list(zip(lista_roc, lista_prob)), reverse=True)

In [76]:
### Criar um dataframe para visualizar as cinco maiores métricas AUC associadas às probabilidades de corte
data_roc_prob_neno = pd.DataFrame(lista_roc_prob_neno[0:5], columns=['ROC_AUC','Probabilidade'])
data_roc_prob_neno

Unnamed: 0,ROC_AUC,Probabilidade
0,0.958419,0.485296
1,0.956041,0.581323
2,0.954503,0.586442
3,0.953663,0.570554
4,0.951991,0.495941


In [77]:
# Calcular as ponderações de cada métrica AUC e as probabilidades de corte ponderadas
data_roc_prob_neno['Percentual_ROC'] = data_roc_prob_neno['ROC_AUC']/data_roc_prob_neno['ROC_AUC'].sum() 
data_roc_prob_neno['Probabibidade_Ponderada'] = data_roc_prob_neno['Percentual_ROC'] * data_roc_prob_neno["Probabilidade"].sum() 
data_roc_prob_neno

Unnamed: 0,ROC_AUC,Probabilidade,Percentual_ROC,Probabibidade_Ponderada
0,0.958419,0.485296,0.200732,0.545902
1,0.956041,0.581323,0.200234,0.544548
2,0.954503,0.586442,0.199912,0.543672
3,0.953663,0.570554,0.199736,0.543194
4,0.951991,0.495941,0.199386,0.542241


In [78]:
# Cálculo da média das cinco probabilidades ponderadas geradas na célula anterior
ROC_AUC_Ponderada_NENO = data_roc_prob_neno['Probabibidade_Ponderada'].sum()/5
ROC_AUC_Ponderada_NENO

0.5439114536834198

In [79]:
### REGRESSÃO LOGÍSTICA ESTIMAÇÃO DE 999 MODELOS UTILIZANDO REAMOSTRAGEM BOOTSTRAP 
### E APLICAÇÃO AOS DADOS DE TESTE, UTILIZANDO A PROBABILIDADE DE CORTE CALCULADA NA CÉLULA ANTERIOR
### DADOS DAS REGIÕES NORTE E NORDESTE

np.random.seed(42) ## Fixar seed para reprodutibilidade de resultados

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

metricas_neno = pd.DataFrame()


log_mcc=[]
log_bacc = []
log_sens = []
log_spec=[]
log_TP = []
log_TN = []
log_FP =[]
log_FN = []
log_acc =[]
coefs =[]
intercept=[]


for i in range(999):
    
    # Realizar reamostragens
    X_train_b, y_train_b = resample(X_train_neno, y_train_neno, replace=True,  stratify=y_train_neno,  n_samples=len(y_train_neno), random_state=None)
  
  
    model_2 = LogisticRegression(solver = 'lbfgs',  multi_class = 'auto', max_iter = 10000, random_state=42, fit_intercept = True)
    
    # Gerar o modelo a cada reamostragem
    model_2.fit(X_train_b, y_train_b)

   # Predição utilizando dados de teste    
    prob = model_2.predict_proba(X_test_neno)
    
    # Aplicação da probabilidade de corte (classe 1 se maior ou igual que prob. corte ou classe 0 se menor que prob. corte)
     
    roc_predictions_teste = [1 if p >= 0.5439114536834198 else 0 for p in prob[:,1]]
    
    
    # Cálculo de métricas (MCC, Acurácia, Acurácia Balanceada)  
       
    log_bacc.append(metrics.balanced_accuracy_score(y_test_neno, roc_predictions_teste, sample_weight=None, adjusted=False))
   
    log_mcc.append(metrics.matthews_corrcoef(y_test_neno, roc_predictions_teste, sample_weight=None))
    
    log_acc.append(metrics.accuracy_score(y_test_neno, roc_predictions_teste))
    
    
    # Inserir coeficientes dos modeos em lista
    
    coefs.append(model_2.coef_[0].tolist())
    
    intercept.append(model_2.intercept_[0])
    
# Matriz de confusão para cada 
    CM = metrics.confusion_matrix(y_test_neno, roc_predictions_teste)

    TP = CM[0][0]
    FN = CM[0][1]
    TN = CM[1][1]
    FP = CM[1][0]

# Sensibilidade
    TPR = TP/(TP+FN)
    log_sens.append(TPR)
    
# Especificifdade
    TNR = TN/(TN+FP) 
    
    log_spec.append(TNR)
    log_TP.append(TP)
    log_FN.append(FN)
    log_TN.append(TN)
    log_FP.append(FP)

    
print('Média Acurácia Balanceada:',sum(log_bacc)/len(log_bacc))
print('Média Acurácia:',sum(log_acc)/len(log_acc))
print('Média MCC:',sum(log_mcc)/len(log_mcc))
print('Média Sensibilidade:', sum(log_sens)/len(log_sens))
print('Média Especificidade:', sum(log_spec)/len(log_spec))

metricas_neno['MCC'] = log_mcc
metricas_neno['Bal_Acc'] = log_bacc
metricas_neno['Accuracy'] = log_acc
metricas_neno['Sensitivity'] = log_sens
metricas_neno['Specificity'] = log_spec
metricas_neno['V_Pos']  = log_TP
metricas_neno['V_Neg'] = log_TN
metricas_neno['F_Pos'] = log_FP
metricas_neno['F_Neg'] = log_FN


coeficientes_neno = pd.DataFrame(coefs, columns=dados_neno.columns[:-1])
coeficientes_neno['Intercept'] = intercept

Média Acurácia Balanceada: 0.7816167433396011
Média Acurácia: 0.7883374002447884
Média MCC: 0.3079978087613389
Média Sensibilidade: 0.7892267755655741
Média Especificidade: 0.7740067111136294


In [80]:
## Visualização dos coeficientes de 10 modelos gerados

coeficientes_neno.head(10)

Unnamed: 0,UF__AC,UF__AM,UF__AP,UF__BA,UF__CE,UF__MA,UF__PA,UF__PB,UF__PE,UF__PI,...,Perc_serv_est_alto,Perc_serv_est_superior,Perc_serv_pop_medio,Perc_serv_pop_normal,Perc_serv_pop_alto,Perc_serv_pop_superior,dum_serv,dum_bem,dum_outros,Intercept
0,0.067624,1.544957,1.578884,-2.409142,0.588999,0.631612,0.464962,0.611206,0.465696,-1.134152,...,-0.499858,0.299611,1.100928,1.115329,-0.119373,0.204751,2.992474,-0.429984,2.428432,-1.344507
1,0.936714,0.929725,1.669188,-1.473414,0.424214,0.366607,0.038775,-0.059219,-0.049726,-1.461658,...,-0.326477,0.012292,1.433008,1.075673,-0.370847,0.495987,3.036213,0.222054,2.638493,-1.291943
2,0.392426,0.489878,1.719601,-1.990402,0.717143,-0.200518,0.160736,0.995791,0.078491,-1.422119,...,-0.14857,0.144802,0.968128,0.722796,0.432526,0.684881,3.062805,-0.302702,2.268417,-0.76226
3,1.108697,0.735845,1.218886,-1.840333,0.723116,-0.541286,-0.278204,0.474754,0.495762,-1.347107,...,-0.50385,-0.223577,0.724963,0.379417,0.636591,0.344314,3.577645,-0.103309,2.182969,0.315944
4,0.416506,0.691596,1.498033,-1.869164,0.484103,0.157986,-0.547202,0.495542,0.367741,-0.742386,...,-0.318269,-0.046151,0.934853,1.296564,-0.425347,0.861515,2.932935,-0.473518,2.514168,-0.841876
5,0.700481,1.286022,1.123577,-1.62656,0.493495,0.328365,0.152063,0.624142,0.211697,-0.848453,...,-0.296067,-0.206248,0.400105,0.65333,-0.230266,0.620692,2.826067,-0.449328,2.598035,0.204669
6,0.654874,1.743498,0.39372,-1.910623,0.799779,0.573963,0.03275,0.661964,0.268343,-1.15967,...,0.149415,0.49211,1.245741,1.144113,0.360204,0.462631,2.954886,0.193586,2.441233,-1.822267
7,0.571795,0.12373,1.246886,-2.559125,0.736707,0.31762,0.087874,-0.224268,0.084466,-2.150647,...,-0.315178,0.012582,0.516099,0.691281,0.040787,0.459751,2.222707,-0.352523,2.188139,0.481922
8,1.579385,0.155048,0.976028,-1.749304,0.833466,0.425556,-0.162392,0.738879,0.258965,-1.267453,...,-0.478411,-0.005598,1.503188,1.423181,0.451656,0.760051,3.140979,0.043367,2.462153,-0.828202
9,0.386267,1.18485,2.044104,-1.892082,0.62007,0.018522,-0.149208,0.006062,-0.072659,-1.462757,...,-0.51534,-0.15369,0.219515,0.528481,-0.075454,0.645641,2.761268,0.03501,2.695435,0.278854


In [81]:
# Adicionando a média geométrica entre especifcicidade e sensibilidade às métricas para posterior ordenamento decrescente
metricas_neno['GMean_Sen_Spe'] = np.sqrt(metricas_neno['Sensitivity']*metricas_neno['Specificity'])

In [82]:
## Ordenação em ordem decrescente pela média geométrica calculada na célula anterior
metricas_ord_neno = metricas_neno.sort_values(by='GMean_Sen_Spe', ascending=False)

In [83]:
# Visualização das métricas ordenadas conforme célula anterior
metricas_ord_neno.head(10)

Unnamed: 0,MCC,Bal_Acc,Accuracy,Sensitivity,Specificity,V_Pos,V_Neg,F_Pos,F_Neg,GMean_Sen_Spe
216,0.352331,0.815736,0.808526,0.807572,0.823899,2069,131,28,493,0.815695
933,0.344796,0.815194,0.796398,0.793911,0.836478,2034,133,26,528,0.814917
502,0.349849,0.814565,0.806321,0.80523,0.823899,2063,131,28,499,0.814511
498,0.340369,0.814436,0.789416,0.786105,0.842767,2014,134,25,548,0.813943
550,0.34862,0.813979,0.805219,0.804059,0.823899,2060,131,28,502,0.813919
211,0.339992,0.814241,0.789048,0.785714,0.842767,2013,134,25,549,0.813741
662,0.346992,0.813199,0.803749,0.802498,0.823899,2056,131,28,506,0.813128
538,0.346954,0.811615,0.806321,0.805621,0.81761,2064,130,29,498,0.811593
542,0.340754,0.81166,0.795296,0.79313,0.830189,2032,132,27,530,0.811448
160,0.347781,0.810422,0.809629,0.809524,0.811321,2074,129,30,488,0.810422


In [84]:
### Verificar a significância estatística dos coeficientes dos com intervalo de confiança de 95%
## Dados do grupo Norte e Nordeste

stats_coef_neno = pd.DataFrame()
media = np.mean(coeficientes_neno)
desvio = np.std(coeficientes_neno)
stats_coef_neno['Média Coeficientes'] = media
stats_coef_neno['Desvio-Padrão Coeficientes'] = desvio

li_95=[]
ls_95=[]
for x in coeficientes_neno.columns:
    li_95.append(np.percentile(coeficientes_neno[x], 2.5))
    ls_95.append(np.percentile(coeficientes_neno[x], 97.5))
    
stats_coef_neno['IC_95_LInf'] =li_95
stats_coef_neno['IC_95_LSup'] =ls_95


stats_coef_neno

Unnamed: 0,Média Coeficientes,Desvio-Padrão Coeficientes,IC_95_LInf,IC_95_LSup
UF__AC,0.769605,0.304739,0.215439,1.388641
UF__AM,0.942498,0.408831,0.163529,1.741399
UF__AP,1.332051,0.61095,0.039212,2.407966
UF__BA,-1.918012,0.260582,-2.440123,-1.422185
UF__CE,0.465797,0.269031,-0.061536,0.976021
UF__MA,0.437703,0.376755,-0.279387,1.174598
UF__PA,-0.077109,0.363232,-0.796289,0.634191
UF__PB,0.480682,0.313582,-0.122716,1.136801
UF__PE,0.20945,0.309012,-0.395556,0.806387
UF__PI,-1.342434,0.378141,-2.080464,-0.585381


### MODELOS PARA CENTRO-OESTE, SUL E SUDESTE

In [90]:
### CURVA ROC PARA DETERMINAR A PROBABILIDADE DE CORTE PARA DEFINIR CLASSIFICAÇÃO ENTRE 0 E 1 COM BOOTSTRAP 999
### CENTRO-OESTE, SUL E SUDESTE

np.random.seed(42) ## Fixar seed para reprodutibilidade de resultados

lista_prob_1=[]
lista_roc_1=[]


## Realizar reamostragem
for i in range(999):
    X_train_c, y_train_c = resample(X_train_cosesul, y_train_cosesul, replace=True, stratify=y_train_cosesul,  n_samples=len(y_train_cosesul), random_state=None)
    
   
    
# Gerar o modelo
    model_3 = LogisticRegression(solver='lbfgs',max_iter=10000, random_state=42, fit_intercept=True)
    model_3.fit(X_train_c, y_train_c)
    
# Realizar predição de probabilidades
    lr_probs = model_3.predict_proba(X_train_c)
    
# Monter a probabilidade da classe de interesse para calcular a métrica AUC a seguir
    lr_probs = lr_probs[:, 1]
    
# Calcular a métrica AUC
    
    lr_auc = roc_auc_score(y_train_c, lr_probs)

# Calcular a curva ROC da Regressão Logistica


    lr_fpr, lr_tpr, thresholds = roc_curve(y_train_c, lr_probs)

## Calcular média geométrica entre taxa de verdedaeiros positivos e ('1 - taxa de falsos positivos) para cada threshold 
## da curva ROC
    gmeans = np.sqrt(lr_tpr * (1-lr_fpr))

# Localizar o índice da maior média geométrica de cada curva ROC de cada modelo gerado, a prob. correspondente e a métrica AUC
    ix = np.argmax(gmeans)

    limite_prob = thresholds[ix]
    
    lista_prob_1.append(limite_prob)
   
    lista_roc_1.append(lr_auc)

 

In [91]:
## Criar uma lista em ordem decrescente em que cada elemento contém o par AUC e probabilidade de corte, gerados na célula acima,
## ordenada pela métrica AUC

lista_roc_prob_cosesul = sorted(list(zip(lista_roc_1, lista_prob_1)), reverse=True)
lista_roc_prob_cosesul[0:5]

[(0.972111515798707, 0.46421046302350877),
 (0.9694609179840201, 0.44177814925766673),
 (0.9689269846112775, 0.43878461115940987),
 (0.9679163250128716, 0.40957384032012356),
 (0.967172632100837, 0.514847677136351)]

In [92]:
### Criar um dataframe para visualizar as cinco maiores métricas AUC associadas às probabilidades de corte
data_roc_prob_cosesul = pd.DataFrame(lista_roc_prob_cosesul[0:5], columns=['ROC_AUC','Probabilidade'])
data_roc_prob_cosesul

Unnamed: 0,ROC_AUC,Probabilidade
0,0.972112,0.46421
1,0.969461,0.441778
2,0.968927,0.438785
3,0.967916,0.409574
4,0.967173,0.514848


In [93]:
# Calcular as ponderações de cada métrica AUC e as probabilidades de corte ponderadas
data_roc_prob_cosesul['Percentual_ROC'] = data_roc_prob_cosesul['ROC_AUC']/data_roc_prob_cosesul['ROC_AUC'].sum() 
data_roc_prob_cosesul['Probabibidade_Ponderada'] = data_roc_prob_cosesul['Percentual_ROC'] * data_roc_prob_cosesul["Probabilidade"].sum() 
data_roc_prob_cosesul

Unnamed: 0,ROC_AUC,Probabilidade,Percentual_ROC,Probabibidade_Ponderada
0,0.972112,0.46421,0.200618,0.455241
1,0.969461,0.441778,0.200071,0.454
2,0.968927,0.438785,0.199961,0.45375
3,0.967916,0.409574,0.199752,0.453276
4,0.967173,0.514848,0.199599,0.452928


In [94]:
# Cálculo da média das cinco probabilidades ponderadas geradas na célula anterior
ROC_AUC_Ponderada_COSESUL = data_roc_prob_cosesul['Probabibidade_Ponderada'].sum()/5
ROC_AUC_Ponderada_COSESUL

0.45383894817941195

In [95]:
### REGRESSÃO LOGÍSTICA ESTIMAÇÃO DE 999 MODELOS UTILIZANDO REAMOSTRAGEM BOOTSTRAP 
### E APLICAÇÃO AOS DADOS DE TESTE, UTILIZANDO A PROBABILIDADE DE CORTE CALCULADA NA CÉLULA ANTERIOR
### DADOS DAS REGIÕES SUL, SUDESTE E CENTRO-OESTE

np.random.seed(42) ## Fixar seed para reprodutibilidade de resultados

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

metricas_cosesul = pd.DataFrame()


log_mcc=[]
log_bacc = []
log_sens = []
log_spec=[]
log_TP = []
log_TN = []
log_FP =[]
log_FN = []
log_acc =[]
coefs =[]
intercept=[]

#weights = {0:1, 1:45}

for i in range(999):
    
    # Realizar reamostragens
    X_train_c, y_train_c = resample(X_train_cosesul, y_train_cosesul, replace=True,  stratify=y_train_cosesul,  n_samples=len(y_train_cosesul), random_state=None)
  
  
    model_4 = LogisticRegression(solver = 'lbfgs',  multi_class = 'auto', max_iter = 10000, random_state=42, fit_intercept = True)
    
    # Gerar o modelo a cada reamostragem
    model_4.fit(X_train_c, y_train_c)

   # Predição utilizando dados de teste    
    prob = model_4.predict_proba(X_test_cosesul)
    
    # Aplicação da probabilidade de corte (classe 1 se maior ou igual que prob. corte ou classe 0 se menor que prob. corte)
     
    roc_predictions_teste = [1 if p >= 0.45383894817941195 else 0 for p in prob[:,1]]
    
    
    # Cálculo de métricas (MCC, Acurácia, Acurácia Balanceada)  
       
    log_bacc.append(metrics.balanced_accuracy_score(y_test_cosesul, roc_predictions_teste, sample_weight=None, adjusted=False))
   
    log_mcc.append(metrics.matthews_corrcoef(y_test_cosesul, roc_predictions_teste, sample_weight=None))
    
    log_acc.append(metrics.accuracy_score(y_test_cosesul, roc_predictions_teste))
    
    
    # Inserir coeficientes dos modeos em lista
    
    coefs.append(model_4.coef_[0].tolist())
    
    intercept.append(model_4.intercept_[0])
    
# Matriz de confusão para cada 
    CM = metrics.confusion_matrix(y_test_cosesul, roc_predictions_teste)

    TP = CM[0][0]
    FN = CM[0][1]
    TN = CM[1][1]
    FP = CM[1][0]

# Sensibilidade
    TPR = TP/(TP+FN)
    log_sens.append(TPR)
    
# Especificifdade
    TNR = TN/(TN+FP) 
    
    log_spec.append(TNR)
    log_TP.append(TP)
    log_FN.append(FN)
    log_TN.append(TN)
    log_FP.append(FP)

    
print('Média Acurácia Balanceada:',sum(log_bacc)/len(log_bacc))
print('Média Acurácia:',sum(log_acc)/len(log_acc))
print('Média MCC:',sum(log_mcc)/len(log_mcc))
print('Média Sensibilidade:', sum(log_sens)/len(log_sens))
print('Média Especificidade:', sum(log_spec)/len(log_spec))

metricas_cosesul['MCC'] = log_mcc
metricas_cosesul['Bal_Acc'] = log_bacc
metricas_cosesul['Accuracy'] = log_acc
metricas_cosesul['Sensitivity'] = log_sens
metricas_cosesul['Specificity'] = log_spec
metricas_cosesul['V_Pos']  = log_TP
metricas_cosesul['V_Neg'] = log_TN
metricas_cosesul['F_Pos'] = log_FP
metricas_cosesul['F_Neg'] = log_FN


coeficientes_cosesul = pd.DataFrame(coefs, columns=dados_cosesul.columns[:-1])
coeficientes_cosesul['Intercept'] = intercept

Média Acurácia Balanceada: 0.8429842379145273
Média Acurácia: 0.862962469318633
Média MCC: 0.20363179466745335
Média Sensibilidade: 0.8634010104044442
Média Especificidade: 0.8225674654246187


In [96]:
## Visualização dos coeficientes de 10 modelos gerados

coeficientes_cosesul.head(10)

Unnamed: 0,UF__GO,UF__MG,UF__MS,UF__MT,UF__PR,UF__RJ,UF__RS,UF__SC,UF__SP,Socio_emp_cont,...,Perc_serv_est_alto,Perc_serv_est_superior,Perc_serv_pop_medio,Perc_serv_pop_normal,Perc_serv_pop_alto,Perc_serv_pop_superior,dum_serv,dum_bem,dum_outros,Intercept
0,0.316999,-0.113123,0.291326,0.767362,0.556554,-0.518496,-0.506117,-1.035114,1.534934,0.019035,...,-0.029642,0.242424,-0.279996,-0.082288,0.0,0.0,2.827352,1.0863,4.112047,-2.723005
1,0.606445,0.600373,0.95747,-0.056773,0.469173,-0.648239,-0.747908,-1.182194,1.39221,-0.200181,...,0.163579,-0.289288,0.235206,0.034762,0.0,0.0,3.009743,1.198095,4.372065,-3.062548
2,-0.225384,-0.290923,0.196282,0.014039,1.278717,0.021909,-0.331187,-1.188359,1.608937,0.101893,...,0.868173,0.546286,0.01145,0.139225,0.0,0.0,3.120858,0.989048,4.001275,-3.008303
3,-0.43972,-0.515548,1.456859,0.148218,0.651079,-0.10574,-0.634682,-0.943888,1.517444,-0.138296,...,0.396876,0.581309,-0.477777,0.200637,0.205832,0.0,2.529598,0.754083,3.477116,-2.747476
4,0.249995,-0.354794,0.771252,0.664335,0.093723,0.319289,-0.299831,-1.049082,1.170852,-0.262014,...,0.119457,-0.030779,0.19988,0.31025,0.104542,0.0,2.66923,0.876316,4.491496,-2.370631
5,0.300741,0.384613,0.404034,0.025107,0.452836,0.613577,-0.136909,-1.016055,0.82588,-0.159692,...,-0.184734,0.222756,0.17373,0.52514,0.198932,0.0,2.650462,1.054187,4.119035,-3.005059
6,-0.58469,-0.063636,0.544196,0.234687,-0.182929,0.199037,-0.785871,-0.842733,1.372906,-0.448787,...,0.156888,0.366742,0.206326,-0.226515,0.083958,0.0,2.996941,1.012508,3.531905,-1.923889
7,0.026026,-0.155635,0.892119,0.242725,0.31955,0.106009,-0.635517,-1.118013,1.418002,0.002505,...,0.165824,0.067189,0.191256,0.085534,0.054498,0.0,3.190517,1.015194,4.400163,-3.088645
8,0.439789,-0.28113,0.456107,-0.001043,0.595589,0.525635,-0.479436,-0.844016,1.129498,-0.398252,...,-0.225803,-0.357568,0.734445,0.565008,0.0,0.0,2.851135,0.953013,3.981233,-2.773956
9,0.668576,-0.000612,1.073946,0.280177,0.467857,-0.367236,-0.64946,-1.025564,0.944632,-0.007436,...,0.186618,-0.069623,-0.185505,-0.296042,0.037936,0.0,2.692138,0.759608,3.745178,-2.358196


In [97]:
# Adicionando a média geométrica entre especifcicidade e sensibilidade às métricas para posterior ordenamento decrescente
metricas_cosesul['GMean_Sen_Spe'] = np.sqrt(metricas_cosesul['Sensitivity']*metricas_cosesul['Specificity'])

In [98]:
## Ordenação em ordem decrescente pela média geométrica calculada na célula anterior
metricas_ord_cosesul = metricas_cosesul.sort_values(by='GMean_Sen_Spe', ascending=False)

In [131]:
# Visualização das métricas ordenadas conforme célula anterior
metricas_ord_cosesul.head(10).round(4)

Unnamed: 0,MCC,Bal_Acc,Accuracy,Sensitivity,Specificity,V_Pos,V_Neg,F_Pos,F_Neg,GMean_Sen_Spe
26,0.2291,0.8838,0.87,0.8697,0.898,7851,88,10,1176,0.8837
111,0.2224,0.8838,0.8601,0.8595,0.9082,7759,89,9,1268,0.8835
887,0.2354,0.8833,0.8789,0.8788,0.8878,7933,87,11,1094,0.8833
581,0.2272,0.8829,0.8682,0.8678,0.898,7834,88,10,1193,0.8828
127,0.2263,0.8824,0.8672,0.8668,0.898,7825,88,10,1202,0.8823
852,0.2095,0.8828,0.838,0.837,0.9286,7556,91,7,1471,0.8816
845,0.219,0.8819,0.8562,0.8557,0.9082,7724,89,9,1303,0.8815
738,0.2313,0.8813,0.8751,0.8749,0.8878,7898,87,11,1129,0.8813
368,0.2132,0.8819,0.8461,0.8454,0.9184,7631,90,8,1396,0.8811
776,0.2384,0.8809,0.8843,0.8843,0.8776,7983,86,12,1044,0.8809


In [100]:
### Verificar a significância estatística dos coeficientes dos com intervalo de confiança de 95%
## Dados do grupo Norte e Nordeste

stats_coef_cosesul = pd.DataFrame()
media = np.mean(coeficientes_cosesul)
desvio = np.std(coeficientes_cosesul)
stats_coef_cosesul['Média Coeficientes'] = media
stats_coef_cosesul['Desvio-Padrão Coeficientes'] = desvio

li_95=[]
ls_95=[]
for x in coeficientes_cosesul.columns:
    li_95.append(np.percentile(coeficientes_cosesul[x], 2.5))
    ls_95.append(np.percentile(coeficientes_cosesul[x], 97.5))
    
stats_coef_cosesul['IC_95_LInf'] =li_95
stats_coef_cosesul['IC_95_LSup'] =ls_95


stats_coef_cosesul

Unnamed: 0,Média Coeficientes,Desvio-Padrão Coeficientes,IC_95_LInf,IC_95_LSup
UF__GO,0.070573,0.375924,-0.67023,0.780425
UF__MG,-0.020985,0.308992,-0.664173,0.560403
UF__MS,0.725104,0.423549,-0.096337,1.574903
UF__MT,0.239099,0.452978,-0.515395,1.128444
UF__PR,0.526124,0.316717,-0.070556,1.195918
UF__RJ,-0.043057,0.440768,-0.904025,0.7371
UF__RS,-0.581151,0.268512,-1.126971,-0.079191
UF__SC,-1.135203,0.359518,-1.831957,-0.443866
UF__SP,1.203071,0.313808,0.593732,1.786215
Socio_emp_cont,-0.099596,0.243461,-0.65361,0.281958


## Exclusão de variávies sem significância estatística

### a) Grupo Centro-Oeste, Sul e Sudeste

In [101]:
### Exclusão de variáveis sem significância estatística para o grupo CO/SE/SUL
dados_exc_cosesul = dados_cosesul.drop(columns=['Socio_emp_cont', 'Cont_Doa','Sem_Emp_Rais','val_des_medio', 'val_des_normal', 'val_des_alto', 'val_des_superior','Perc_dom_1sm_pc_medio','Perc_dom_1sm_pc_normal', 'Perc_dom_1sm_pc_alto', 'Perc_dom_1sm_pc_superior','Perc_serv_est_medio','Perc_serv_est_normal','Perc_serv_est_alto','Perc_serv_est_superior','Perc_serv_pop_medio','Perc_serv_pop_normal','Perc_serv_pop_alto','Perc_serv_pop_superior' ])


In [102]:
## Converter os dados para formato numpy para facilitar o processamento e separação entre atributos (X) e alvo(Y):

# Sul/Sudeste/Centro-Oeste
data_numpy_exc_cosesul = dados_exc_cosesul.to_numpy()
nrow,ncol = data_numpy_exc_cosesul.shape
y_exc_cosesul = data_numpy_exc_cosesul[:,-1]
X_exc_cosesul = data_numpy_exc_cosesul[:,0:ncol-1]

In [103]:
## Dividir entre treino e teste.

from sklearn.model_selection import train_test_split

# Sul/Sudeste/Centro-Oeste
X_train_exc_cosesul, X_test_exc_cosesul, y_train_exc_cosesul, y_test_exc_cosesul = train_test_split(X_exc_cosesul, y_exc_cosesul, test_size = 0.3, stratify = y_exc_cosesul, random_state = 0)
print('Centro-Oeste, Sul e Sudeste')
print('Shape X_train:', X_train_exc_cosesul.shape)
print('Shape X_test:', X_test_exc_cosesul.shape)
print('Shape y_train:', y_train_exc_cosesul.shape)
print('Shape y_test:', y_test_exc_cosesul.shape)



Centro-Oeste, Sul e Sudeste
Shape X_train: (21289, 20)
Shape X_test: (9125, 20)
Shape y_train: (21289,)
Shape y_test: (9125,)


In [104]:
np.random.seed(42)

#### Executar o balanceamento das classes utilizando undersampling após exclusão de coeficientes
# Sul/Sudeste/Centro-Oeste

rus2 = RandomUnderSampler(sampling_strategy=1)
X_train_exc_cosesul, y_train_exc_cosesul = rus2.fit_sample(X_train_exc_cosesul, y_train_exc_cosesul)

print('X_train_under:',X_train_exc_cosesul.shape, 'y_train_under:', y_train_exc_cosesul.shape)  
print('Soma y_train_under (Classe 1):', y_train_exc_cosesul.sum())

print('X_test:', X_test_exc_cosesul.shape)
print('y_test:', y_test_exc_cosesul.shape)
print('Soma y_test (Classe 1):', y_test_exc_cosesul.sum())


X_train_under: (458, 20) y_train_under: (458,)
Soma y_train_under (Classe 1): 229.0
X_test: (9125, 20)
y_test: (9125,)
Soma y_test (Classe 1): 98.0


In [105]:
### CURVA ROC PARA DETERMINAR A PROBABILIDADE DE CORTE PARA DEFINIR CLASSIFICAÇÃO ENTRE 0 E 1 COM BOOTSTRAP 999

np.random.seed(42)

lista_probx=[]
lista_rocx=[]



for i in range(999):
    
## Realizar  reamostragem    
    X_train_d, y_train_d = resample(X_train_exc_cosesul, y_train_exc_cosesul, replace=True, stratify=y_train_exc_cosesul,  n_samples=len(y_train_exc_cosesul), random_state=None)
    

# Gerar o modelo de cada reamostargem 

    model_5 = LogisticRegression(solver='lbfgs',max_iter=10000, random_state=42, fit_intercept=True)
    model_5.fit(X_train_d, y_train_d)
    
# Predizer probabilidades

    lr_probs = model_5.predict_proba(X_train_d)
    
# Monter a probailidde da classe de interesse para calcular a métrica AUC a seguir
    lr_probs = lr_probs[:, 1]

# Calcular a métrica AUC
   
    lr_auc = roc_auc_score(y_train_d, lr_probs)


# Calcular a curvas ROC de cada modelo de regressão Logística
  
    lr_fpr, lr_tpr, thresholds = roc_curve(y_train_d, lr_probs)

    
## Calcular média geométrica entre taxa de verdedaeiros positivos e ('1 - taxa de falsos positivos) para cada threshold 
## da curva ROC
    gmeans = np.sqrt(lr_tpr * (1-lr_fpr))

# Localizar o índice da maior média geométrica de cada curva ROC de cada modelo gerado, a prob. correspondente e a métrica AUC

    ix = np.argmax(gmeans)

    limite_prob = thresholds[ix]
    
    lista_probx.append(limite_prob)
  
    lista_rocx.append(lr_auc)

In [106]:
## Criar uma lista em ordem decrescente em que cada elemento contém o par AUC e probabilidade de corte, gerados na célula acima,
## ordenada pela métrica AUC

lista_roc_prob_x = sorted(list(zip(lista_rocx, lista_probx)), reverse=True)

In [107]:
### Criar um dataframe para visualizar as cinco maiores métricas AUC associadas às probabilidades de corte
data_roc_prob_x = pd.DataFrame(lista_roc_prob_x[0:5], columns=['ROC_AUC','Probabilidade'])
data_roc_prob_x

Unnamed: 0,ROC_AUC,Probabilidade
0,0.966791,0.447539
1,0.965342,0.496286
2,0.965142,0.542499
3,0.964675,0.456233
4,0.961481,0.47439


In [108]:
# Calcular as ponderações de cada métrica AUC e as probabilidades de corte ponderadas
data_roc_prob_x['Percentual_ROC'] = data_roc_prob_x['ROC_AUC']/data_roc_prob_x['ROC_AUC'].sum() 
data_roc_prob_x['Probabibidade_Ponderada'] = data_roc_prob_x['Percentual_ROC'] * data_roc_prob_x["Probabilidade"].sum() 
data_roc_prob_x

Unnamed: 0,ROC_AUC,Probabilidade,Percentual_ROC,Probabibidade_Ponderada
0,0.966791,0.447539,0.200436,0.484444
1,0.965342,0.496286,0.200136,0.483718
2,0.965142,0.542499,0.200094,0.483618
3,0.964675,0.456233,0.199998,0.483384
4,0.961481,0.47439,0.199335,0.481783


In [133]:
# Cálculo da média das cinco probabilidades ponderadas geradas na célula anterior
ROC_AUC_Ponderada_x = data_roc_prob_x['Probabibidade_Ponderada'].sum()/5
ROC_AUC_Ponderada_x

0.4833892961160215

In [110]:
### REGRESSÃO LOGÍSTICA ESTIMAÇÃO DE 999 MODELOS UTILIZANDO REAMOSTRAGEM BOOTSTRAP 
### E APLICAÇÃO AOS DADOS DE TESTE, UTILIZANDO A PROBABILIDADE DE CORTE CALCULADA NA CÉLULA ANTERIOR
### DADOS DAS REGIÕES SUL, SUDESTE E CENTRO-OESTE

np.random.seed(42) ## Fixar seed para reprodutibilidade de resultados

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

metricas_cosesul_x = pd.DataFrame()


log_mcc_x=[]
log_bacc_x = []
log_sens_x = []
log_spec_x=[]
log_TP_x = []
log_TN_x = []
log_FP_x =[]
log_FN_x = []
log_acc_x =[]
coefs_x =[]
intercept_x=[]

#weights = {0:1, 1:45}

for i in range(999):
    
    # Realizar reamostragens
    
    X_train_e, y_train_e = resample(X_train_exc_cosesul, y_train_exc_cosesul, replace=True, stratify=y_train_exc_cosesul,  n_samples=len(y_train_exc_cosesul), random_state=None)
  
    model_6 = LogisticRegression(solver = 'lbfgs',  multi_class = 'auto', max_iter = 10000, random_state=42, fit_intercept = True)
    
    # Gerar o modelo a cada reamostragem
    model_6.fit(X_train_e, y_train_e)

   # Predição utilizando dados de teste    
    prob = model_6.predict_proba(X_test_exc_cosesul)
    
    # Aplicação da probabilidade de corte (classe 1 se maior ou igual que prob. corte ou classe 0 se menor que prob. corte)
     
    roc_predictions_teste = [1 if p >= 0.4833892961160215 else 0 for p in prob[:,1]]
    
    
    # Cálculo de métricas (MCC, Acurácia, Acurácia Balanceada)  
       
    log_bacc_x.append(metrics.balanced_accuracy_score(y_test_exc_cosesul, roc_predictions_teste, sample_weight=None, adjusted=False))
   
    log_mcc_x.append(metrics.matthews_corrcoef(y_test_exc_cosesul, roc_predictions_teste, sample_weight=None))
    
    log_acc_x.append(metrics.accuracy_score(y_test_exc_cosesul, roc_predictions_teste))
    
    
    # Inserir coeficientes dos modelos em lista
    
    coefs_x.append(model_6.coef_[0].tolist())
    
    intercept_x.append(model_6.intercept_[0])
    
# Matriz de confusão para cada 
    CM_x = metrics.confusion_matrix(y_test_exc_cosesul, roc_predictions_teste)

    TP = CM_x[0][0]
    FN = CM_x[0][1]
    TN = CM_x[1][1]
    FP = CM_x[1][0]

# Sensibilidade
    TPR = TP/(TP+FN)
    log_sens_x.append(TPR)
    
# Especificifdade
    TNR = TN/(TN+FP) 
    
    log_spec_x.append(TNR)
    log_TP_x.append(TP)
    log_FN_x.append(FN)
    log_TN_x.append(TN)
    log_FP_x.append(FP)

    
print('Média Acurácia Balanceada:',sum(log_bacc)/len(log_bacc))
print('Média Acurácia:',sum(log_acc)/len(log_acc))
print('Média MCC:',sum(log_mcc)/len(log_mcc))
print('Média Sensibilidade:', sum(log_sens)/len(log_sens))
print('Média Especificidade:', sum(log_spec)/len(log_spec))

metricas_cosesul_x['MCC'] = log_mcc_x
metricas_cosesul_x['Bal_Acc'] = log_bacc_x
metricas_cosesul_x['Accuracy'] = log_acc_x
metricas_cosesul_x['Sensitivity'] = log_sens_x
metricas_cosesul_x['Specificity'] = log_spec_x
metricas_cosesul_x['V_Pos']  = log_TP_x
metricas_cosesul_x['V_Neg'] = log_TN_x
metricas_cosesul_x['F_Pos'] = log_FP_x
metricas_cosesul_x['F_Neg'] = log_FN_x


coeficientes_cosesul_x = pd.DataFrame(coefs_x, columns=dados_exc_cosesul.columns[:-1])
coeficientes_cosesul_x['Intercept'] = intercept_x

Média Acurácia Balanceada: 0.8429842379145273
Média Acurácia: 0.862962469318633
Média MCC: 0.20363179466745335
Média Sensibilidade: 0.8634010104044442
Média Especificidade: 0.8225674654246187


In [111]:
# Adicionando a média geométrica entre especifcicidade e sensibilidade às métricas para posterior ordenamento decrescente
metricas_cosesul_x['GMean_Sen_Spe'] = np.sqrt(metricas_cosesul_x['Sensitivity']*metricas_cosesul_x['Specificity'])

In [112]:
## Ordenação em ordem decrescente pela média geométrica calculada na célula anterior
metricas_ord_cosesul_x = metricas_cosesul_x.sort_values(by='GMean_Sen_Spe', ascending=False)

In [113]:
# Visualização das métricas ordenadas conforme célula anterior
metricas_ord_cosesul_x.head(10)

Unnamed: 0,MCC,Bal_Acc,Accuracy,Sensitivity,Specificity,V_Pos,V_Neg,F_Pos,F_Neg,GMean_Sen_Spe
113,0.217433,0.887848,0.848,0.847125,0.928571,7647,91,7,1380,0.886914
474,0.22762,0.886673,0.865644,0.865182,0.908163,7810,89,9,1217,0.886412
840,0.220767,0.886402,0.855123,0.854437,0.918367,7713,90,8,1314,0.885825
111,0.22647,0.886063,0.864438,0.863964,0.908163,7799,89,9,1228,0.885788
769,0.231381,0.885005,0.872329,0.872051,0.897959,7872,88,10,1155,0.88491
866,0.238529,0.884722,0.881753,0.881688,0.887755,7959,87,11,1068,0.884716
410,0.229719,0.884174,0.870685,0.870389,0.897959,7857,88,10,1170,0.884067
26,0.236345,0.883725,0.879781,0.879694,0.887755,7941,87,11,1086,0.883715
553,0.221603,0.883405,0.859178,0.858646,0.908163,7751,89,9,1276,0.883058
12,0.226266,0.882402,0.867178,0.866844,0.897959,7825,88,10,1202,0.882264


In [114]:
### Verificar a significância estatística dos coeficientes dos com intervalo de confiança de 95%
## Dados do grupo Sul, Sudeste e Centro-Oeste

stats_coef_cosesul_x = pd.DataFrame()
media = np.mean(coeficientes_cosesul_x)
desvio = np.std(coeficientes_cosesul_x)
stats_coef_cosesul_x['Média Coeficientes'] = media
stats_coef_cosesul_x['Desvio-Padrão Coeficientes'] = desvio

li_95=[]
ls_95=[]
for x in coeficientes_cosesul_x.columns:
    li_95.append(np.percentile(coeficientes_cosesul_x[x], 2.5))
    ls_95.append(np.percentile(coeficientes_cosesul_x[x], 97.5))
    
stats_coef_cosesul_x['IC_95_LInf'] =li_95
stats_coef_cosesul_x['IC_95_LSup'] =ls_95


stats_coef_cosesul_x

Unnamed: 0,Média Coeficientes,Desvio-Padrão Coeficientes,IC_95_LInf,IC_95_LSup
UF__GO,0.274216,0.355229,-0.420025,0.945226
UF__MG,0.192637,0.282789,-0.388748,0.703149
UF__MS,0.745375,0.474983,-0.164681,1.6985
UF__MT,0.134016,0.456515,-0.63921,1.032731
UF__PR,0.301614,0.30096,-0.290847,0.92794
UF__RJ,-0.107859,0.48557,-1.062451,0.761277
UF__RS,-0.621902,0.278237,-1.191678,-0.09328
UF__SC,-1.013592,0.341074,-1.675891,-0.354908
UF__SP,1.079275,0.298245,0.479915,1.645126
val_fpm_medio,-0.137823,0.319935,-0.776801,0.49023


### b) Grupo Norte e Nordeste

In [115]:
### Exclusão de variáveis sem significância estatística para o grupo NE/NO
dados_exc_neno = dados_neno.drop(columns=['Socio_emp_cont', 'Cont_Doa', 'val_fpm_medio', 'val_fpm_normal', 'val_fpm_alto', 'val_fpm_superior','Perc_dom_1sm_pc_medio','Perc_dom_1sm_pc_normal', 'Perc_dom_1sm_pc_alto', 'Perc_dom_1sm_pc_superior'])


In [116]:
## Converter os dados para formato numpy para facilitar o processamento e separação entre atributos (X) e alvo(Y):

# Norte/Nordeste
data_numpy_exc_neno = dados_exc_neno.to_numpy()
nrow,ncol = data_numpy_exc_neno.shape
y_exc_neno = data_numpy_exc_neno[:,-1]
X_exc_neno = data_numpy_exc_neno[:,0:ncol-1]


In [117]:
## Dividir entre treino e teste.
# Norte/Nordeste

from sklearn.model_selection import train_test_split


X_train_exc_neno, X_test_exc_neno, y_train_exc_neno, y_test_exc_neno = train_test_split(X_exc_neno, y_exc_neno, test_size = 0.3, stratify = y_exc_neno, random_state = 0)
print('Norte e Nordeste')
print('Shape X_train:', X_train_exc_neno.shape)
print('Shape X_test:', X_test_exc_neno.shape)
print('Shape y_train:', y_train_exc_neno.shape)
print('Shape y_test:', y_test_exc_neno.shape)


Norte e Nordeste
Shape X_train: (6347, 35)
Shape X_test: (2721, 35)
Shape y_train: (6347,)
Shape y_test: (2721,)


In [118]:
np.random.seed(42)

#### Executar o balanceamento das classes utilizando undersampling após exclusão de coeficientes
### Norte e Nordeste

rus3 = RandomUnderSampler(sampling_strategy=1)
X_train_exc_neno, y_train_exc_neno = rus3.fit_sample(X_train_exc_neno, y_train_exc_neno)

print('X_train_under:',X_train_exc_neno.shape, 'y_train_under:', y_train_exc_neno.shape)  
print('Soma y_train_under (Classe 1):', y_train_exc_neno.sum())

print('X_test:', X_test_exc_neno.shape)
print('y_test:', y_test_exc_neno.shape)
print('Soma y_test (Classe 1):', y_test_exc_neno.sum())


X_train_under: (740, 35) y_train_under: (740,)
Soma y_train_under (Classe 1): 370.0
X_test: (2721, 35)
y_test: (2721,)
Soma y_test (Classe 1): 159.0


In [119]:
### CURVA ROC PARA DETERMINAR A PROBABILIDADE DE CORTE PARA DEFINIR CLASSIFICAÇÃO ENTRE 0 E 1 COM BOOTSTRAP 999

np.random.seed(42)

lista_probx2=[]
lista_rocx2=[]



for i in range(999):
    
## Realizar  reamostragem    
    X_train_f, y_train_f = resample(X_train_exc_neno, y_train_exc_neno, replace=True, stratify=y_train_exc_neno,  n_samples=len(y_train_exc_neno), random_state=None)
    

# Gerar o modelo de cada reamostargem 

    model_7 = LogisticRegression(solver='lbfgs', max_iter = 10000,  random_state = 42, fit_intercept = False)
    model_7.fit(X_train_f, y_train_f)
    
# Predizer probabilidades

    lr_probs = model_7.predict_proba(X_train_f)
    
# Monter a probailidde da classe de interesse para calcular a métrica AUC a seguir
    lr_probs = lr_probs[:, 1]

# Calcular a métrica AUC
   
    lr_auc = roc_auc_score(y_train_f, lr_probs)


# Calcular a curvas ROC de cada modelo de regressão Logística
  
    lr_fpr, lr_tpr, thresholds = roc_curve(y_train_f, lr_probs)

    
## Calcular média geométrica entre taxa de verdedaeiros positivos e ('1 - taxa de falsos positivos) para cada threshold 
## da curva ROC
    gmeans = np.sqrt(lr_tpr * (1-lr_fpr))

# Localizar o índice da maior média geométrica de cada curva ROC de cada modelo gerado, a prob. correspondente e a métrica AUC

    ix = np.argmax(gmeans)

    limite_prob = thresholds[ix]
    
    lista_probx2.append(limite_prob)
  
    lista_rocx2.append(lr_auc)

In [120]:
## Criar uma lista em ordem decrescente em que cada elemento contém o par AUC e probabilidade de corte, gerados na célula acima,
## ordenada pela métrica AUC

lista_roc_prob_x2 = sorted(list(zip(lista_rocx2, lista_probx2)), reverse=True)


In [121]:
### Criar um dataframe para visualizar as cinco maiores métricas AUC associadas às probabilidades de corte
data_roc_prob_x2 = pd.DataFrame(lista_roc_prob_x2[0:5], columns=['ROC_AUC','Probabilidade'])
data_roc_prob_x2

Unnamed: 0,ROC_AUC,Probabilidade
0,0.95565,0.549835
1,0.954496,0.587973
2,0.95256,0.633287
3,0.949821,0.509413
4,0.948346,0.458633


In [122]:
# Calcular as ponderações de cada métrica AUC e as probabilidades de corte ponderadas
data_roc_prob_x2['Percentual_ROC'] = data_roc_prob_x2['ROC_AUC']/data_roc_prob_x2['ROC_AUC'].sum() 
data_roc_prob_x2['Probabibidade_Ponderada'] = data_roc_prob_x2['Percentual_ROC'] * data_roc_prob_x2["Probabilidade"].sum() 
data_roc_prob_x2

Unnamed: 0,ROC_AUC,Probabilidade,Percentual_ROC,Probabibidade_Ponderada
0,0.95565,0.549835,0.20073,0.549828
1,0.954496,0.587973,0.200488,0.549164
2,0.95256,0.633287,0.200081,0.54805
3,0.949821,0.509413,0.199506,0.546474
4,0.948346,0.458633,0.199196,0.545625


In [123]:
# Cálculo da média das cinco probabilidades ponderadas geradas na célula anterior
ROC_AUC_Ponderada_x2 = data_roc_prob_x2['Probabibidade_Ponderada'].sum()/5
ROC_AUC_Ponderada_x2

0.5478282113196264

In [124]:
### REGRESSÃO LOGÍSTICA ESTIMAÇÃO DE 999 MODELOS UTILIZANDO REAMOSTRAGEM BOOTSTRAP 
### E APLICAÇÃO AOS DADOS DE TESTE, UTILIZANDO A PROBABILIDADE DE CORTE CALCULADA NA CÉLULA ANTERIOR
### DADOS DAS REGIÕES NORTE E NORDESTE

np.random.seed(42) ## Fixar seed para reprodutibilidade de resultados

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

metricas_neno_x2 = pd.DataFrame()


log_mcc_x2=[]
log_bacc_x2 = []
log_sens_x2 = []
log_spec_x2=[]
log_TP_x2 = []
log_TN_x2 = []
log_FP_x2 =[]
log_FN_x2 = []
log_acc_x2 =[]
coefs_x2 =[]
intercept_x2=[]

#weights = {0:1, 1:45}

for i in range(999):
    
    # Realizar reamostragens
    
    X_train_g, y_train_g = resample(X_train_exc_neno, y_train_exc_neno, replace=True, stratify=y_train_exc_neno,  n_samples=len(y_train_exc_neno), random_state=None)
  
    model_8 = LogisticRegression(solver = 'lbfgs',  multi_class = 'auto', max_iter = 10000, random_state=42, fit_intercept = False)
    
    # Gerar o modelo a cada reamostragem
    model_8.fit(X_train_g, y_train_g)

   # Predição utilizando dados de teste    
    prob = model_8.predict_proba(X_test_exc_neno)
    
    # Aplicação da probabilidade de corte (classe 1 se maior ou igual que prob. corte ou classe 0 se menor que prob. corte)
     
    roc_predictions_teste = [1 if p >= 0.5478282113196264 else 0 for p in prob[:,1]]
    
    
    # Cálculo de métricas (MCC, Acurácia, Acurácia Balanceada)  
       
    log_bacc_x2.append(metrics.balanced_accuracy_score(y_test_exc_neno, roc_predictions_teste, sample_weight=None, adjusted=False))
   
    log_mcc_x2.append(metrics.matthews_corrcoef(y_test_exc_neno, roc_predictions_teste, sample_weight=None))
    
    log_acc_x2.append(metrics.accuracy_score(y_test_exc_neno, roc_predictions_teste))
    
    
    # Inserir coeficientes dos modelos em lista
    
    coefs_x2.append(model_8.coef_[0].tolist())
    
    intercept_x2.append(model_8.intercept_[0])
    
# Matriz de confusão para cada 
    CM_x2 = metrics.confusion_matrix(y_test_exc_neno, roc_predictions_teste)

    TP = CM_x2[0][0]
    FN = CM_x2[0][1]
    TN = CM_x2[1][1]
    FP = CM_x2[1][0]

# Sensibilidade
    TPR = TP/(TP+FN)
    log_sens_x2.append(TPR)
    
# Especificifdade
    TNR = TN/(TN+FP) 
    
    log_spec_x2.append(TNR)
    log_TP_x2.append(TP)
    log_FN_x2.append(FN)
    log_TN_x2.append(TN)
    log_FP_x2.append(FP)

    
print('Média Acurácia Balanceada:',sum(log_bacc_x2)/len(log_bacc_x2))
print('Média Acurácia:',sum(log_acc_x2)/len(log_acc_x2))
print('Média MCC:',sum(log_mcc_x2)/len(log_mcc_x2))
print('Média Sensibilidade:', sum(log_sens_x2)/len(log_sens_x2))
print('Média Especificidade:', sum(log_spec)/len(log_spec_x2))

metricas_neno_x2['MCC'] = log_mcc_x2
metricas_neno_x2['Bal_Acc'] = log_bacc_x2
metricas_neno_x2['Accuracy'] = log_acc_x2
metricas_neno_x2['Sensitivity'] = log_sens_x2
metricas_neno_x2['Specificity'] = log_spec_x2
metricas_neno_x2['V_Pos']  = log_TP_x2
metricas_neno_x2['V_Neg'] = log_TN_x2
metricas_neno_x2['F_Pos'] = log_FP_x2
metricas_neno_x2['F_Neg'] = log_FN_x2


coeficientes_neno_x = pd.DataFrame(coefs_x2, columns=dados_exc_neno.columns[:-1])
coeficientes_neno_x['Intercept'] = intercept_x2

Média Acurácia Balanceada: 0.7877406297346494
Média Acurácia: 0.7914018391783917
Média MCC: 0.31555673277991664
Média Sensibilidade: 0.7918863437989134
Média Especificidade: 0.8225674654246187


In [125]:
# Adicionando a média geométrica entre especifcicidade e sensibilidade às métricas para posterior ordenamento decrescente
metricas_neno_x2['GMean_Sen_Spe'] = np.sqrt(metricas_neno_x2['Sensitivity']*metricas_neno_x2['Specificity'])

In [126]:
## Ordenação em ordem decrescente pela média geométrica calculada na célula anterior
metricas_ord_neno_x = metricas_neno_x2.sort_values(by='GMean_Sen_Spe', ascending=False)

In [127]:
# Visualização das métricas ordenadas conforme célula anterior
metricas_ord_neno_x.head(10)

Unnamed: 0,MCC,Bal_Acc,Accuracy,Sensitivity,Specificity,V_Pos,V_Neg,F_Pos,F_Neg,GMean_Sen_Spe
20,0.343973,0.822136,0.781698,0.776347,0.867925,1989,138,21,573,0.820859
897,0.349361,0.820508,0.795296,0.791959,0.849057,2029,135,24,533,0.820011
498,0.349361,0.820508,0.795296,0.791959,0.849057,2029,135,24,533,0.820011
216,0.357316,0.819661,0.810364,0.809133,0.830189,2073,132,27,489,0.819593
162,0.343452,0.818969,0.786843,0.782592,0.855346,2005,136,23,557,0.818161
440,0.35611,0.817492,0.811834,0.811085,0.823899,2078,131,28,484,0.817467
892,0.349538,0.817536,0.800809,0.798595,0.836478,2046,133,26,516,0.817317
103,0.347153,0.816365,0.798603,0.796253,0.836478,2040,133,26,522,0.816118
794,0.355766,0.815714,0.814039,0.813817,0.81761,2085,130,29,477,0.815711
368,0.345971,0.81578,0.797501,0.795082,0.836478,2037,133,26,525,0.815517


In [128]:
### Verificar a significância estatística dos coeficientes dos com intervalo de confiança de 95%
## Dados do grupo Norte e Nordeste
stats_coef_neno_x = pd.DataFrame()
media = np.mean(coeficientes_neno_x)
desvio = np.std(coeficientes_neno_x)
stats_coef_neno_x['Média Coeficientes'] = media
stats_coef_neno_x['Desvio-Padrão Coeficientes'] = desvio

li_95=[]
ls_95=[]
for x in coeficientes_neno_x.columns:
    li_95.append(np.percentile(coeficientes_neno_x[x], 2.5))
    ls_95.append(np.percentile(coeficientes_neno_x[x], 97.5))
    
stats_coef_neno_x['IC_95_LInf'] =li_95
stats_coef_neno_x['IC_95_LSup'] =ls_95


stats_coef_neno_x

Unnamed: 0,Média Coeficientes,Desvio-Padrão Coeficientes,IC_95_LInf,IC_95_LSup
UF__AC,0.674009,0.284297,0.151968,1.227315
UF__AM,0.928214,0.395409,0.157414,1.685216
UF__AP,1.375739,0.607653,0.011097,2.400405
UF__BA,-1.936795,0.256331,-2.45223,-1.453252
UF__CE,0.423462,0.245583,-0.074304,0.879111
UF__MA,0.373283,0.368786,-0.349517,1.111007
UF__PA,-0.177405,0.343755,-0.866464,0.509777
UF__PB,0.295445,0.283198,-0.230652,0.888417
UF__PE,0.139207,0.286551,-0.430663,0.68473
UF__PI,-1.450621,0.359555,-2.141414,-0.74125
