In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from math import log
import matplotlib.pyplot as plt
from scipy.special import rel_entr

from sklearn import datasets
from sklearn.datasets import make_regression

# Pre-processamento
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier

# Classificadores
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Modelos
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression

# Metricas
# Outras métricas https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
#
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,fbeta_score, roc_auc_score, roc_curve
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.metrics import ConfusionMatrixDisplay

# [Exemplo](https://acervolima.com/divergencia-kullback-leibler/)

Imagine duas urnas que contêm 4 tipos de bolas (verde, azul, vermelha, amarela). Uma bola é retirada aleatoriamente de uma das urnas com as probabilidades fornecidas. O objetivo é calcular a diferença (divergência de Kullback-Leibler) entre as distribuições de probabilidade das duas urnas.

In [None]:
# Each urn is a discrete distribution: [P(green), P(blue), P(red), P(yellow)]
# Built directly as NumPy arrays. The original tried to convert the undefined
# names `box_1` / `box_2`, which raised NameError on a fresh kernel.
urna_1 = np.array([0.25, 0.33, 0.23, 0.19])
urna_2 = np.array([0.21, 0.21, 0.32, 0.26])

#----------------------------------------------
# KL divergence using scipy.special.rel_entr
#----------------------------------------------
# rel_entr returns the element-wise terms; summing them gives the divergence.
# The measure is not symmetric, and the third line shows that a distribution
# compared with itself gives zero.
print('Divergência_KL(urna_1 || urna_2): %.3f' % sum(rel_entr(urna_1,urna_2)))
print('Divergência_KL(urna_2 || urna_1): %.3f' % sum(rel_entr(urna_2,urna_1)))
print('Divergência_KL(urna_1 || urna_1): %.3f' % sum(rel_entr(urna_1,urna_1)))


In [None]:
# import sys
# !{sys.executable} -m pip install matplotlib

In [None]:
# ---------------------------------
# Exemplo com dados artificiais
# ---------------------------------

# calculando o AIC
def calculate_aic(n, mse, num_params):
    """Return the AIC score computed from a model's mean squared error.

    The value is ``n * log(mse) + 2 * num_params`` (natural logarithm).

    Parameters
    ----------
    n : int
        Number of observations.
    mse : float
        Mean squared error of the fitted model; must be positive because
        its logarithm is taken.
    num_params : int
        Number of parameters in the model.

    Returns
    -------
    float
        The AIC value.
    """
    fit_term = n * log(mse)
    penalty = 2 * num_params
    return fit_term + penalty

# calculando o BIC
def calculate_bic(n, mse, num_params):
    """Return the BIC score computed from a model's mean squared error.

    The value is ``n * log(mse) + num_params * log(n)`` (natural logarithm).

    Parameters
    ----------
    n : int
        Number of observations; must be positive because its logarithm
        is taken.
    mse : float
        Mean squared error of the fitted model; must be positive.
    num_params : int
        Number of parameters in the model.

    Returns
    -------
    float
        The BIC value.
    """
    fit_term = n * log(mse)
    penalty = num_params * log(n)
    return fit_term + penalty

# Generate synthetic data for a regression problem.
# A fixed random_state (the same seed used by the other stochastic calls in
# this notebook) makes the MSE / AIC / BIC values below reproducible on
# Restart & Run All; the original call drew new data on every run.
X, y = make_regression(n_samples=100, n_features=2, noise=0.1, random_state=42)

In [None]:
# Define the model and fit it on ALL the data (no train/test split here):
# the in-sample fit is what the MSE, AIC and BIC cells below are computed from.
model = LinearRegression()
model.fit(X, y)

In [None]:
# Number of parameters: one per fitted coefficient plus one extra term
# (presumably the intercept — confirm if fit_intercept is ever changed).
n_coefficients = len(model.coef_)
num_params = n_coefficients + 1
print('Número de parametros: %d' % num_params)

In [None]:
# Fitted values: predictions on the same X the model was trained on
yhat = model.predict(X)

# In-sample mean squared error between observed and fitted values
mse = mean_squared_error(y_true=y, y_pred=yhat)
print('MSE: %.3f' % mse)

In [None]:
# AIC of the fitted model, from the sample size, the in-sample MSE and the
# parameter count computed in the previous cells
n_obs = len(y)
aic = calculate_aic(n_obs, mse, num_params)
print('AIC: %.3f' % aic)

In [None]:
# BIC of the fitted model, from the sample size, the in-sample MSE and the
# parameter count computed in the previous cells
n_obs = len(y)
bic = calculate_bic(n_obs, mse, num_params)
print('BIC: %.3f' % bic)

In [None]:
# Example
# German Credit Data is a dataset for bank credit-risk analysis: it is used to
# classify whether a person is a high or low risk based on a set of attributes.

# Categorical attributes to be one-hot encoded
categorical_columns = ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment',
                       'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing',
                       'job', 'own_telephone', 'foreign_worker']

dataset = pd.read_csv('dataset/credit-g.csv')
# drop_first=True drops the first level of every categorical column
dataset_dummies = pd.get_dummies(dataset, columns=categorical_columns, drop_first=True)

# Replace the target labels with integer codes
label_encoder = LabelEncoder()
dataset_dummies['class'] = label_encoder.fit_transform(dataset_dummies['class'])

# Features: every column except the target, min-max scaled.
# NOTE(review): the scaler is fitted on all rows, before any train/test split.
scaler = MinMaxScaler()
X = scaler.fit_transform(dataset_dummies.drop(columns=['class']))

# Target as a flat 1-D array
y = dataset_dummies[['class']].values.ravel()

model = MLPClassifier(hidden_layer_sizes=(25, 25), max_iter=200, random_state=42)
model.fit(X, y)
# The score is computed on the same data the model was fitted on
print("Acurácia: ", model.score(X, y))

In [None]:
# Hold-out evaluation: reserve 33% of the samples as a test set (fixed seed).
# NOTE(review): X was min-max scaled on all rows in the previous cell, so the
# test rows took part in fitting the scaler — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Same MLP configuration as the previous cell, now fitted on the training split only
model = MLPClassifier(hidden_layer_sizes=(25, 25), max_iter=200, random_state=42)
model.fit(X_train, y_train)
# Score on the held-out split; as the last expression it is shown as the cell output
model.score(X_test, y_test)

### Outras métricas https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

The common metrics provided by sklearn can be passed as a string to the `scoring` parameter (e.g. of `cross_val_score` or `cross_validate`). Some typical choices are:

    ‘accuracy’
    ‘balanced_accuracy’
    ‘roc_auc’
    ‘f1’
    ‘neg_mean_absolute_error’
    ‘neg_root_mean_squared_error’
    ‘r2’

In [None]:
# 10-fold cross-validated accuracy of the current `model` on the full (X, y)
scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=10)
print(scores)

In [None]:
# Hard class predictions on the held-out split and their confusion matrix
y_pred = model.predict(X_test)
mc = confusion_matrix(y_test, y_pred)

# Labels are given in encoded-class order (0, 1)
# NOTE(review): assumes the LabelEncoder mapped the "bad" class to 0 — confirm.
mc_display = ConfusionMatrixDisplay(confusion_matrix=mc, display_labels=['Crédito Ruim', 'Crédito Bom'])
mc_display.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Predicted probability of the positive class (column 1) for each test sample
probs = model.predict_proba(X_test)[:, 1]
rfp, rvp, lim = roc_curve(y_test, probs)

# ROC curve plot.
# Fix: the model evaluated here is the MLPClassifier fitted above, so the
# legend must not say 'Random Forest'.
plt.plot(rfp, rvp, marker='.', label='MLP', color="red")
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')

# axis labels
plt.xlabel('1 - Especificidade')
plt.ylabel('Sensibilidade')
plt.legend()
plt.show()

# Fix: compute the AUC from the same probability scores used to draw the curve.
# The original passed the hard 0/1 predictions (`y_pred`), which collapses the
# curve to a single threshold, reports a different AUC than the one plotted,
# and depends on a variable from another cell.
print('ROC AUC: %.6f' % roc_auc_score(y_test, probs))

In [None]:
# ----------------------------
# Breast cancer data
# ----------------------------
bc = datasets.load_breast_cancer()
X, y = bc.data, bc.target

# ----------------------------------------
# Data splitting: training and test sets (30% held out, fixed seed)
# ----------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Standardise both splits with a scaler fitted on the training split only
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Fit a random forest on the training split and report hold-out accuracy.
# Note: this uses the raw X_train / X_test, not the standardised copies.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
print('Score hold-out:', model.score(X_test, y_test))

# Confusion matrix of the hard predictions on the test split
y_pred = model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)

# Labels are given in target order (0, 1)
rf_display = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['Maligno', 'Benigno'])
rf_display.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Predicted probability of the positive class (column 1) for each test sample
rf_prob = model.predict_proba(X_test)
probs = rf_prob[:, 1]
rfp, rvp, lim = roc_curve(y_test, probs)

# ROC curve plot
plt.plot(rfp, rvp, marker='.', label='Random Forest', color="red")
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')

# axis labels
plt.xlabel('1 - Especificidade')
plt.ylabel('Sensibilidade')
plt.legend()
plt.show()

# Fix: compute the AUC from the same probability scores used to draw the curve.
# The original passed the hard 0/1 predictions (`y_pred`), which collapses the
# curve to a single threshold and reports a different AUC than the one plotted.
print('ROC AUC: %.6f' % roc_auc_score(y_test, probs))

In [None]:
# k-fold cross-validation: a fresh random forest scored with 10 folds on the
# full (X, y), using the estimator's default scorer
model = RandomForestClassifier(random_state=42)
scores = cross_val_score(estimator=model, X=X, y=y, cv=10)

# Per-fold scores, then their mean and standard deviation
print('Scores 10-fold cross-validation:', scores)
print('\nMédia 10-fold cross-validation:', scores.mean())
print('\nDesvio padrão 10-fold cross-validation:', scores.std())

In [None]:
def cross_validation(model, _X, _y, _cv=10):
    """Run k-fold cross-validation and summarise train/validation metrics.

    Parameters
    ----------
    model : estimator
        The machine learning algorithm handed to ``cross_validate``.
    _X : array
        Covariate (feature) matrix.
    _y : array
        Dependent variable.
    _cv : int, default=10
        Number of folds; forwarded as ``cv`` to ``cross_validate``.

    Returns
    -------
    dict
        For accuracy, precision, recall and f1, on both the training and the
        validation folds: the per-fold score array and its mean. The two mean
        accuracies are multiplied by 100; the other means are returned
        unscaled.
    """
    cv_results = cross_validate(
        estimator=model,
        X=_X,
        y=_y,
        cv=_cv,
        scoring=['accuracy', 'precision', 'recall', 'f1'],
        return_train_score=True,
    )

    # Per-fold scores on the training folds
    train_acc = cv_results['train_accuracy']
    train_prec = cv_results['train_precision']
    train_rec = cv_results['train_recall']
    train_f1 = cv_results['train_f1']

    # Per-fold scores on the held-out (validation) folds
    val_acc = cv_results['test_accuracy']
    val_prec = cv_results['test_precision']
    val_rec = cv_results['test_recall']
    val_f1 = cv_results['test_f1']

    summary = [
        ("Training Accuracy scores", train_acc),
        ("Mean Training Accuracy", train_acc.mean()*100),
        ("Training Precision scores", train_prec),
        ("Mean Training Precision", train_prec.mean()),
        ("Training Recall scores", train_rec),
        ("Mean Training Recall", train_rec.mean()),
        ("Training F1 scores", train_f1),
        ("Mean Training F1 Score", train_f1.mean()),
        ("Validation Accuracy scores", val_acc),
        ("Mean Validation Accuracy", val_acc.mean()*100),
        ("Validation Precision scores", val_prec),
        ("Mean Validation Precision", val_prec.mean()),
        ("Validation Recall scores", val_rec),
        ("Mean Validation Recall", val_rec.mean()),
        ("Validation F1 scores", val_f1),
        ("Mean Validation F1 Score", val_f1.mean()),
    ]
    return dict(summary)

In [None]:
# Evaluate the current `model` on (X, y) with the cross_validation helper
# defined in this notebook, using its default number of folds
result = cross_validation(model, X, y)

# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top so all dependencies are declared in one place.
from pprint import pprint
# Pretty-print the metrics dictionary
pprint(result)

In [None]:
# Outro exemplo

def KFold_vc(x_axis, y_axis, n_splits=10, random_state=42):
    """Compare four linear regressors by k-fold cross-validated RMSE.

    LinearRegression, ElasticNet, Ridge and Lasso (all with default
    settings) are each scored with ``neg_root_mean_squared_error``. The mean
    score of every model and the name of the best one are printed; nothing
    is returned.

    Parameters
    ----------
    x_axis : array-like
        Feature matrix.
    y_axis : array-like
        Target values.
    n_splits : int, default=10
        Number of folds.
    random_state : int or None, default=42
        Seed for the fold shuffling, so repeated runs print the same numbers.
        Pass ``None`` to draw a different shuffle on every call.
    """
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Materialise the folds once and reuse them for every model. The original
    # handed the unseeded, shuffling splitter to each cross_val_score call, so
    # every model was scored on a different random partition and the
    # comparison was neither fair nor reproducible.
    folds = list(kfold.split(x_axis, y_axis))

    #----------------------------------
    # Models included in the analysis
    #----------------------------------
    models = {
        "Regressão Linear": LinearRegression(),
        "ElasticNet": ElasticNet(),
        "Regressão Ridge": Ridge(),
        "Lasso": Lasso(),
    }

    # Mean negated RMSE per model (closer to zero is better)
    dic_models = {
        name: cross_val_score(estimator, x_axis, y_axis, cv=folds,
                              scoring="neg_root_mean_squared_error").mean()
        for name, estimator in models.items()
    }

    #----------------------------------
    # Model selection
    #----------------------------------
    # Scores are negated errors, so the maximum is the smallest RMSE
    bestModel = max(dic_models, key=dic_models.get)

    print("Regressão Linear: {0}\nElastic Net: {1}\nRegressão Ridge: {2}\nLasso: {3}".format(
        round(dic_models["Regressão Linear"], 4),
        round(dic_models["ElasticNet"], 4),
        round(dic_models["Regressão Ridge"], 4),
        round(dic_models["Lasso"], 4)))
    print("\nModelo com melhor desempenho: {0}".format(bestModel))


# Load the admissions data and discard the row-identifier column
df = pd.read_csv("dataset/Admission_Predict.csv").drop(columns='Serial No.')

# The trailing space is part of the target column's name in the CSV header
target_column = 'Chance of Admit '
X = df.drop(columns=target_column)
y = df[target_column]

# Compare the regression models on this dataset
KFold_vc(X, y)