# Eye Color Prediction Parte 3: Criação do Modelo

In [1]:
import pickle

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

## Setup Inicial

Nesta parte não usaremos dados em que os dados para a snp rs12913832 estejam faltando.

E aplicação do nosso modelo terá essa restrição. 

In [2]:
df = pd.read_csv('datasets/df_ml_clean_2021.csv')
# Mudança
df = df[df['rs12913832'] != "missing"]

In [3]:
# Conjunto com as SNPs
snp_list_6 = ['rs12913832',
    'rs1800407',
    'rs12896399',
    'rs16891982',
    'rs1393350',
    'rs12203592']

snp_list_13 = [
    'rs1129038', 
    'rs11636232', 
    'rs12203592',
    'rs12896399',
    'rs12913832', 
    'rs1393350',
    'rs1667394',
    'rs16891982',
    'rs1800407',
    'rs4778232',
    'rs4778241',
    'rs7183877',
    'rs8024968']
snp_list_remove = list(set(snp_list_13) - set(snp_list_6))

In [4]:
df = df.drop(columns=snp_list_remove)

In [5]:
df.head(2)

Unnamed: 0,rs12203592,rs12896399,rs12913832,rs1393350,rs16891982,rs1800407,color_cat
2,CC,GG,GG,GG,missing,CC,Brown
3,CC,TG,GG,GG,GG,CC,BGG


In [6]:
df.shape

(1081, 7)

In [7]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [8]:
X_train.nunique()

rs12203592    3
rs12896399    4
rs12913832    3
rs1393350     3
rs16891982    4
rs1800407     4
dtype: int64

In [9]:
# encoding
ohe = OneHotEncoder(sparse=False)
ohe.fit(X_train)

X_train_encoded = ohe.transform(X_train)
X_test_encoded = ohe.transform(X_test)



### Experimento 01: Regressão Logística

In [10]:
space01 = {"C":np.logspace(-4, 4, 50),
            "fit_intercept":[True,False]
            }

model01 = LogisticRegression(multi_class='multinomial', max_iter=1000 )
cv01 = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
search01 = GridSearchCV(model01, space01, cv=cv01, n_jobs=-1, verbose=10)
result01 = search01.fit(X_train_encoded, y_train)

Fitting 15 folds for each of 100 candidates, totalling 1500 fits
[CV 7/15; 1/100] START C=0.0001, fit_intercept=True.............................
[CV 4/15; 1/100] START C=0.0001, fit_intercept=True.............................
[CV 1/15; 1/100] START C=0.0001, fit_intercept=True.............................
[CV 2/15; 1/100] START C=0.0001, fit_intercept=True.............................
[CV 3/15; 1/100] START C=0.0001, fit_intercept=True.............................
[CV 6/15; 1/100] START C=0.0001, fit_intercept=True.............................
[CV 5/15; 1/100] START C=0.0001, fit_intercept=True.............................
[CV 1/15; 1/100] END C=0.0001, fit_intercept=True;, score=0.370 total time=   0.0s
[CV 3/15; 1/100] END C=0.0001, fit_intercept=True;, score=0.370 total time=   0.0s
[CV 5/15; 1/100] END C=0.0001, fit_intercept=True;, score=0.372 total time=   0.0s
[CV 7/15; 1/100] END C=0.0001, fit_intercept=True;, score=0.370 total time=   0.0s
[CV 6/15; 1/100] END C=0.0001, fit_i

In [11]:
print(f"Os melhores parâmetros encontrados foram: {result01.best_params_}")
best_model01 = result01.best_estimator_

y_pred = best_model01.predict(X_test_encoded)
y_pred_probs = best_model01.predict_proba(X_test_encoded)
y_pred_train = best_model01.predict(X_train_encoded)

print(f"Roc auc score do modelo: {roc_auc_score(y_test, y_pred_probs, average='weighted', multi_class='ovr') }")
print(f"Acurácia de teste: {accuracy_score(y_test, y_pred):.2%}.")
print(f"Acurácia de treino: { accuracy_score(y_pred_train, y_train):.2%}")
print(" \n Classification Report: \n ")
print(classification_report(y_test, y_pred, target_names=y.unique()))

Os melhores parâmetros encontrados foram: {'C': 0.18420699693267145, 'fit_intercept': True}
Roc auc score do modelo: 0.8586474750865453
Acurácia de teste: 72.81%.
Acurácia de treino: 75.35%
 
 Classification Report: 
 
              precision    recall  f1-score   support

       Brown       0.73      0.92      0.81        83
         BGG       0.75      0.93      0.83        74
         Int       0.62      0.22      0.32        60

    accuracy                           0.73       217
   macro avg       0.70      0.69      0.66       217
weighted avg       0.71      0.73      0.68       217



### Experimento 02: Random Forest

In [12]:
n_estimators = [120, 300, 500, 800]
max_features = ['auto', 'sqrt', 'log2']
max_depth = [5, 8, 15, 25, None]
min_samples_split = [1, 2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

space02 = {'n_estimators': n_estimators,
           'max_features': max_features,
           'max_depth': max_depth,
           'min_samples_split': min_samples_split,
           'min_samples_leaf': min_samples_leaf,
           } 

model02 = RandomForestClassifier(random_state=42)
cv02 = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
search02 = RandomizedSearchCV(model02, space02, cv=cv02, n_jobs=-1, n_iter=30, verbose=10)
result02 = search02.fit(X_train_encoded, y_train)

Fitting 15 folds for each of 30 candidates, totalling 450 fits
[CV 2/15; 1/30] START max_depth=None, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=800
[CV 3/15; 1/30] START max_depth=None, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=800
[CV 1/15; 1/30] START max_depth=None, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=800
[CV 4/15; 1/30] START max_depth=None, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=800
[CV 5/15; 1/30] START max_depth=None, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=800
[CV 6/15; 1/30] START max_depth=None, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=800
[CV 7/15; 1/30] START max_depth=None, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=800
[CV 8/15; 1/30] START max_depth=None, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=800
[

  warn(


[CV 1/15; 2/30] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500;, score=0.728 total time=   1.4s
[CV 2/15; 2/30] START max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500


  warn(


[CV 9/15; 1/30] END max_depth=None, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=800;, score=0.763 total time=   2.1s
[CV 3/15; 2/30] START max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500
[CV 11/15; 1/30] END max_depth=None, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=800;, score=0.746 total time=   2.2s
[CV 4/15; 2/30] START max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500
[CV 10/15; 1/30] END max_depth=None, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=800;, score=0.733 total time=   2.2s
[CV 5/15; 2/30] START max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500
[CV 13/15; 1/30] END max_depth=None, max_features=log2, min_samples_leaf=10, min_samples_split=15, n_estimators=800;, score=0.699 total time=   2.1s
[CV 12/15; 1/30] END max_depth=None, max_features=log2, min_sam

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV 2/15; 2/30] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500;, score=0.751 total time=   1.3s
[CV 10/15; 2/30] START max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500


  warn(


[CV 3/15; 2/30] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500;, score=0.705 total time=   1.3s
[CV 11/15; 2/30] START max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500
[CV 4/15; 2/30] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500;, score=0.728 total time=   1.3s
[CV 12/15; 2/30] START max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500
[CV 5/15; 2/30] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500;, score=0.715 total time=   1.4s
[CV 13/15; 2/30] START max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500
[CV 6/15; 2/30] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500;, score=0.751 total time=   1.4s
[CV 14/15; 2/30] START max_depth=5, max_features=auto, min_samples_leaf=1, 

  warn(
  warn(
  warn(
  warn(
  warn(


[CV 10/15; 2/30] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500;, score=0.727 total time=   1.5s
[CV 3/15; 3/30] START max_depth=15, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 11/15; 2/30] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500;, score=0.723 total time=   1.5s
[CV 4/15; 3/30] START max_depth=15, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 12/15; 2/30] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500;, score=0.751 total time=   1.5s
[CV 5/15; 3/30] START max_depth=15, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 14/15; 2/30] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=100, n_estimators=500;, score=0.717 total time=   1.5s
[CV 6/15; 3/30] START max_depth=15, max_features=sqrt, min_samples_leaf=10

  warn(


[CV 9/15; 7/30] END max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=300;, score=0.763 total time=   0.8s
[CV 2/15; 8/30] START max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500
[CV 10/15; 7/30] END max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=300;, score=0.727 total time=   0.8s
[CV 3/15; 8/30] START max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500
[CV 11/15; 7/30] END max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=300;, score=0.751 total time=   0.8s
[CV 4/15; 8/30] START max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500
[CV 12/15; 7/30] END max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=300;, score=0.763 total time=   0.8s
[CV 5/15; 8/30] START max_depth=5, max_features=auto, min_samples_leaf=2, min_samples

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV 1/15; 8/30] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500;, score=0.734 total time=   1.5s
[CV 9/15; 8/30] START max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500


  warn(


[CV 3/15; 8/30] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500;, score=0.717 total time=   1.4s
[CV 10/15; 8/30] START max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500
[CV 2/15; 8/30] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500;, score=0.763 total time=   1.4s
[CV 11/15; 8/30] START max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500
[CV 4/15; 8/30] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500;, score=0.734 total time=   1.4s
[CV 12/15; 8/30] START max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500
[CV 5/15; 8/30] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500;, score=0.733 total time=   1.4s
[CV 13/15; 8/30] START max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_sp

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV 1/15; 9/30] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120;, score=0.728 total time=   0.4s
[CV 2/15; 9/30] START max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120


  warn(


[CV 9/15; 8/30] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500;, score=0.769 total time=   1.5s
[CV 3/15; 9/30] START max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120
[CV 2/15; 9/30] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120;, score=0.746 total time=   0.4s
[CV 4/15; 9/30] START max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120


  warn(
  warn(


[CV 3/15; 9/30] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120;, score=0.734 total time=   0.4s
[CV 5/15; 9/30] START max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120
[CV 4/15; 9/30] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120;, score=0.740 total time=   0.4s
[CV 6/15; 9/30] START max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120
[CV 10/15; 8/30] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500;, score=0.744 total time=   1.5s


  warn(
  warn(
  warn(
  warn(


[CV 7/15; 9/30] START max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120
[CV 11/15; 8/30] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500;, score=0.751 total time=   1.5s
[CV 8/15; 9/30] START max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120
[CV 13/15; 8/30] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500;, score=0.694 total time=   1.5s
[CV 12/15; 8/30] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimators=500;, score=0.775 total time=   1.5s
[CV 9/15; 9/30] START max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120
[CV 10/15; 9/30] START max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120
[CV 14/15; 8/30] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=1, n_estimat

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV 6/15; 9/30] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120;, score=0.757 total time=   0.4s
[CV 14/15; 9/30] START max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120
[CV 7/15; 9/30] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120;, score=0.734 total time=   0.5s
[CV 15/15; 9/30] START max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120
[CV 8/15; 9/30] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120;, score=0.717 total time=   0.5s
[CV 1/15; 10/30] START max_depth=8, max_features=sqrt, min_samples_leaf=5, min_samples_split=1, n_estimators=300
[CV 9/15; 9/30] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120;, score=0.769 total time=   0.5s
[CV 2/15; 10/30] START max_depth=8, max_features=sqrt, min_sample

  warn(


[CV 15/15; 9/30] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=120;, score=0.709 total time=   0.4s
[CV 8/15; 10/30] START max_depth=8, max_features=sqrt, min_samples_leaf=5, min_samples_split=1, n_estimators=300
[CV 1/15; 10/30] END max_depth=8, max_features=sqrt, min_samples_leaf=5, min_samples_split=1, n_estimators=300;, score=0.723 total time=   0.9s
[CV 9/15; 10/30] START max_depth=8, max_features=sqrt, min_samples_leaf=5, min_samples_split=1, n_estimators=300
[CV 2/15; 10/30] END max_depth=8, max_features=sqrt, min_samples_leaf=5, min_samples_split=1, n_estimators=300;, score=0.757 total time=   0.8s
[CV 10/15; 10/30] START max_depth=8, max_features=sqrt, min_samples_leaf=5, min_samples_split=1, n_estimators=300
[CV 3/15; 10/30] END max_depth=8, max_features=sqrt, min_samples_leaf=5, min_samples_split=1, n_estimators=300;, score=0.728 total time=   0.9s
[CV 11/15; 10/30] START max_depth=8, max_features=sqrt, min_samples_leaf=5, min_

  warn(
  warn(


[CV 9/15; 11/30] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=1, n_estimators=300;, score=0.763 total time=   0.9s
[CV 2/15; 12/30] START max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300
[CV 10/15; 11/30] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=1, n_estimators=300;, score=0.709 total time=   0.9s
[CV 3/15; 12/30] START max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300
[CV 11/15; 11/30] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=1, n_estimators=300;, score=0.763 total time=   0.9s
[CV 4/15; 12/30] START max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300
[CV 12/15; 11/30] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=1, n_estimators=300;, score=0.746 total time=   0.9s
[CV 5/15; 12/30] START max_depth=None, max_features=auto, min_samp

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV 2/15; 12/30] END max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300;, score=0.757 total time=   0.8s
[CV 10/15; 12/30] START max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300


  warn(


[CV 3/15; 12/30] END max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300;, score=0.723 total time=   0.9s
[CV 11/15; 12/30] START max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300
[CV 4/15; 12/30] END max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300;, score=0.728 total time=   0.9s
[CV 12/15; 12/30] START max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300
[CV 5/15; 12/30] END max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300;, score=0.738 total time=   0.9s
[CV 13/15; 12/30] START max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300
[CV 6/15; 12/30] END max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300;, score=0.751 total time=   0.9s
[CV 14/15; 12/30] START max_depth=None, max_features=a

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV 10/15; 12/30] END max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300;, score=0.727 total time=   0.9s
[CV 3/15; 13/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800


  warn(


[CV 11/15; 12/30] END max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300;, score=0.751 total time=   0.9s
[CV 4/15; 13/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800
[CV 12/15; 12/30] END max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300;, score=0.763 total time=   0.9s
[CV 5/15; 13/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800
[CV 13/15; 12/30] END max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300;, score=0.694 total time=   0.9s
[CV 6/15; 13/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800
[CV 14/15; 12/30] END max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=300;, score=0.746 total time=   0.9s
[CV 7/15; 13/30] START max_depth=5, max_features=auto, min_

  warn(
  warn(
  warn(
  warn(
  warn(


[CV 2/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.757 total time=   2.3s
[CV 9/15; 13/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800
[CV 1/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.740 total time=   2.3s
[CV 10/15; 13/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800
[CV 3/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.711 total time=   2.3s
[CV 11/15; 13/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800


  warn(
  warn(
  warn(


[CV 4/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.728 total time=   2.3s
[CV 12/15; 13/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800
[CV 5/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.733 total time=   2.3s
[CV 13/15; 13/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800
[CV 6/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.751 total time=   2.3s
[CV 14/15; 13/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800
[CV 7/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.711 total time=   2.2s
[CV 15/15; 13/30] START max_depth=5, max_features=auto, min_samples_

  warn(
  warn(
  warn(
  warn(
  warn(


[CV 9/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.763 total time=   2.4s
[CV 2/15; 14/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800
[CV 10/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.733 total time=   2.5s
[CV 3/15; 14/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800


  warn(
  warn(


[CV 11/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.734 total time=   2.5s
[CV 4/15; 14/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800


  warn(


[CV 12/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.763 total time=   2.5s
[CV 5/15; 14/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800
[CV 13/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.699 total time=   2.4s
[CV 6/15; 14/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800
[CV 15/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.709 total time=   2.4s
[CV 7/15; 14/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800
[CV 14/15; 13/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=800;, score=0.746 total time=   2.5s
[CV 8/15; 14/30] START max_depth=15, max_features=auto, min_s

  warn(
  warn(
  warn(
  warn(
  warn(


[CV 2/15; 14/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800;, score=0.740 total time=   2.4s
[CV 10/15; 14/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800
[CV 3/15; 14/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800;, score=0.705 total time=   2.4s
[CV 11/15; 14/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800


  warn(
  warn(


[CV 4/15; 14/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800;, score=0.723 total time=   2.4s
[CV 12/15; 14/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800


  warn(


[CV 6/15; 14/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800;, score=0.751 total time=   2.4s
[CV 13/15; 14/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800
[CV 5/15; 14/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800;, score=0.721 total time=   2.4s
[CV 14/15; 14/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800
[CV 8/15; 14/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800;, score=0.717 total time=   2.4s
[CV 15/15; 14/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800
[CV 7/15; 14/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800;, score=0.711 total time=   2.5s
[CV 1/15; 15/30] START max_depth=5, max_features=log2,

  warn(
  warn(
  warn(


[CV 9/15; 14/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800;, score=0.740 total time=   2.5s
[CV 2/15; 15/30] START max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=5, n_estimators=500
[CV 1/15; 15/30] END max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=5, n_estimators=500;, score=0.740 total time=   1.6s
[CV 3/15; 15/30] START max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=5, n_estimators=500
[CV 10/15; 14/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=800;, score=0.727 total time=   2.5s
[CV 4/15; 15/30] START max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=5, n_estimators=500
[CV 2/15; 15/30] END max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=5, n_estimators=500;, score=0.757 total time=   1.5s
[CV 5/15; 15/30] START max_depth=5, max_features=log2, min_samples_leaf=5, m

  warn(


[CV 9/15; 20/30] END max_depth=25, max_features=log2, min_samples_leaf=1, min_samples_split=15, n_estimators=300;, score=0.769 total time=   1.1s
[CV 2/15; 21/30] START max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120
[CV 1/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.751 total time=   0.5s
[CV 3/15; 21/30] START max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120
[CV 10/15; 20/30] END max_depth=25, max_features=log2, min_samples_leaf=1, min_samples_split=15, n_estimators=300;, score=0.715 total time=   1.1s
[CV 4/15; 21/30] START max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120
[CV 11/15; 20/30] END max_depth=25, max_features=log2, min_samples_leaf=1, min_samples_split=15, n_estimators=300;, score=0.780 total time=   1.1s
[CV 5/15; 21/30] START max_depth=25, max_features=auto, min_sampl

  warn(
  warn(
  warn(
  warn(
  warn(


[CV 12/15; 20/30] END max_depth=25, max_features=log2, min_samples_leaf=1, min_samples_split=15, n_estimators=300;, score=0.757 total time=   1.1s
[CV 6/15; 21/30] START max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120
[CV 2/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.757 total time=   0.6s
[CV 7/15; 21/30] START max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120
[CV 13/15; 20/30] END max_depth=25, max_features=log2, min_samples_leaf=1, min_samples_split=15, n_estimators=300;, score=0.688 total time=   1.2s
[CV 8/15; 21/30] START max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120
[CV 3/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.728 total time=   0.6s
[CV 9/15; 21/30] START max_depth=25, max_features=auto, min_samp

  warn(
  warn(
  warn(


[CV 4/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.728 total time=   0.7s
[CV 10/15; 21/30] START max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120
[CV 5/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.738 total time=   0.7s
[CV 11/15; 21/30] START max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120
[CV 6/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.751 total time=   0.7s
[CV 12/15; 21/30] START max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120
[CV 14/15; 20/30] END max_depth=25, max_features=log2, min_samples_leaf=1, min_samples_split=15, n_estimators=300;, score=0.751 total time=   1.3s
[CV 13/15; 21/30] START max_depth=25, max_features=auto, min_

  warn(
  warn(
  warn(
  warn(


[CV 15/15; 20/30] END max_depth=25, max_features=log2, min_samples_leaf=1, min_samples_split=15, n_estimators=300;, score=0.709 total time=   1.4s
[CV 14/15; 21/30] START max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120
[CV 7/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.717 total time=   0.6s
[CV 15/15; 21/30] START max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120
[CV 8/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.728 total time=   0.6s
[CV 1/15; 22/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500


  warn(
  warn(
  warn(


[CV 9/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.763 total time=   0.6s
[CV 2/15; 22/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 10/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.733 total time=   0.5s
[CV 3/15; 22/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 11/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.717 total time=   0.5s
[CV 4/15; 22/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 12/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.763 total time=   0.5s
[CV 5/15; 22/30] START max_depth=5, max_features=auto, min_samples_

  warn(
  warn(
  warn(
  warn(


[CV 13/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.699 total time=   0.5s
[CV 6/15; 22/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 14/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.746 total time=   0.5s
[CV 7/15; 22/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 15/15; 21/30] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=120;, score=0.709 total time=   0.5s
[CV 8/15; 22/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500


  warn(
  warn(
  warn(


[CV 1/15; 22/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.746 total time=   1.8s
[CV 9/15; 22/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 2/15; 22/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.757 total time=   1.8s
[CV 10/15; 22/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 3/15; 22/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.711 total time=   1.7s
[CV 11/15; 22/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 4/15; 22/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.728 total time=   1.7s
[CV 12/15; 22/30] START max_depth=5, max_features=auto, min_samples_leaf=10,

  warn(
  warn(
  warn(
  warn(
  warn(


[CV 6/15; 22/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.751 total time=   1.8s
[CV 14/15; 22/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 7/15; 22/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.711 total time=   1.8s
[CV 15/15; 22/30] START max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 8/15; 22/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.728 total time=   1.7s
[CV 1/15; 23/30] START max_depth=8, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=120


  warn(
  warn(


[CV 1/15; 23/30] END max_depth=8, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=120;, score=0.711 total time=   0.4s
[CV 2/15; 23/30] START max_depth=8, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=120
[CV 2/15; 23/30] END max_depth=8, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=120;, score=0.746 total time=   0.4s
[CV 3/15; 23/30] START max_depth=8, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=120
[CV 3/15; 23/30] END max_depth=8, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=120;, score=0.728 total time=   0.4s
[CV 4/15; 23/30] START max_depth=8, max_features=log2, min_samples_leaf=1, min_samples_split=1, n_estimators=120
[CV 9/15; 22/30] END max_depth=5, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.763 total time=   1.6s
[CV 5/15; 23/30] START max_depth=8, max_features=log2, min_samples_leaf=1, min_sampl

  warn(
  warn(


[CV 10/15; 29/30] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=800;, score=0.727 total time=   2.6s
[CV 3/15; 30/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500


  warn(


[CV 1/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.746 total time=   1.7s
[CV 4/15; 30/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 2/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.757 total time=   1.7s
[CV 5/15; 30/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 11/15; 29/30] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=800;, score=0.717 total time=   2.5s
[CV 6/15; 30/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 12/15; 29/30] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=800;, score=0.751 total time=   2.5s
[CV 7/15; 30/30] START max_depth=15, max_features=auto, min_samples_l

  warn(
  warn(
  warn(
  warn(


[CV 13/15; 29/30] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=800;, score=0.711 total time=   2.5s
[CV 8/15; 30/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 14/15; 29/30] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=800;, score=0.746 total time=   2.5s
[CV 9/15; 30/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500


  warn(
  warn(


[CV 15/15; 29/30] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=800;, score=0.703 total time=   2.5s
[CV 10/15; 30/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500


  warn(


[CV 3/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.711 total time=   1.7s
[CV 11/15; 30/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500


  warn(


[CV 4/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.728 total time=   1.7s
[CV 12/15; 30/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 5/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.738 total time=   1.7s
[CV 13/15; 30/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 6/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.751 total time=   1.7s
[CV 14/15; 30/30] START max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500
[CV 7/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.711 total time=   1.7s
[CV 15/15; 30/30] START max_depth=15, max_features=auto, min_samples

  warn(
  warn(
  warn(
  warn(


[CV 8/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.728 total time=   1.8s
[CV 9/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.763 total time=   1.7s
[CV 10/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.727 total time=   1.7s
[CV 11/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.751 total time=   1.6s
[CV 12/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.763 total time=   1.5s
[CV 13/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, score=0.699 total time=   1.4s
[CV 14/15; 30/30] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=500;, scor

In [13]:
print(f"Os melhores parâmetros encontrados foram: {result02.best_params_}")
best_model02 = result02.best_estimator_

y_pred = best_model02.predict(X_test_encoded)
y_pred_probs = best_model02.predict_proba(X_test_encoded)
y_pred_train = best_model02.predict(X_train_encoded)

print(f"Roc auc score do modelo: {roc_auc_score(y_test, y_pred_probs, average='weighted', multi_class='ovr') }")
print(f"Acurácia de teste: {accuracy_score(y_test, y_pred):.2%}.")
print(f"Acurácia de treino: { accuracy_score(y_pred_train, y_train):.2%}")
print(" \n Classification Report: \n ")
print(classification_report(y_test, y_pred, target_names=y.unique()))

Os melhores parâmetros encontrados foram: {'n_estimators': 120, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 5}
Roc auc score do modelo: 0.8661341050934076
Acurácia de teste: 71.89%.
Acurácia de treino: 75.58%
 
 Classification Report: 
 
              precision    recall  f1-score   support

       Brown       0.73      0.92      0.81        83
         BGG       0.74      0.92      0.82        74
         Int       0.57      0.20      0.30        60

    accuracy                           0.72       217
   macro avg       0.68      0.68      0.64       217
weighted avg       0.69      0.72      0.67       217



### Experimento 03: XGBoosting

In [12]:
y_train_map = y_train.replace({"BGG":0, "Brown":1, "Int":2})
y_test_map = y_test.replace({"BGG":0, "Brown":1, "Int":2})


space03 = {'eta': [0.01, 0.015, 0.025, 0.05, 0.1],
           'gamma': [0.05,0.01,0.3,0.5,0.7,0.9,1.0],
           'max_depth': [3,5,7,9,12,15,17,25],
           'min_child_weight': [1,3,5,7],
           'subsample': [0.6,0.7,0.8,0.9,1.0],
           'colsample_bytree':[0.6,0.7,0.8,0.9,1.0],
           'lambda':[0.01,0.1,1.0],
           'alpha':[0,0.1,0.5,1.0],
           } 

model03 = xgb.XGBClassifier(objective="multi:softproba", num_class=3, random_state=42, verbosity=3)
cv03 = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
search03 = RandomizedSearchCV(model03, space03, cv=cv03, n_jobs=-1, n_iter=500, verbose=10)
result03 = search03.fit(X_train_encoded, y_train_map)

Fitting 15 folds for each of 500 candidates, totalling 7500 fits
[15:05:48] DEBUG: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/gbm/gbtree.cc:155: Using tree method: 2
[15:05:48] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/tree/updater_prune.cc:101: tree pruning end, 8 extra nodes, 6 pruned nodes, max_depth=3
[15:05:48] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/tree/updater_prune.cc:101: tree pruning end, 10 extra nodes, 4 pruned nodes, max_depth=4
[15:05:48] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/tree/updater_prune.cc:101: tree pruning end, 14 extra nodes, 14 pruned nodes, max_depth=4
[15:05:48] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/tree/updater_prune.cc:101: tree pruning end, 8 extra nodes, 6 pruned nodes, max_depth=3
[15:05:48] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/tree/updater_prune.cc:101: tree pruning end, 12

In [13]:
print(f"Os melhores parâmetros encontrados foram: {result03.best_params_} \n")
best_model03 = result03.best_estimator_

y_pred = best_model03.predict(X_test_encoded)
y_pred_probs = best_model03.predict_proba(X_test_encoded)
y_pred_train = best_model03.predict(X_train_encoded)

print(f"Roc auc score do modelo: {roc_auc_score(y_test_map, y_pred_probs, average='weighted', multi_class='ovr') }")
print(f"Acurácia de teste: {accuracy_score(y_test_map, y_pred):.2%}.")
print(f"Acurácia de treino: { accuracy_score(y_pred_train, y_train_map):.2%}")
print(" \n Classification Report: \n ")
print(classification_report(y_test_map, y_pred, target_names=y.unique()))

Os melhores parâmetros encontrados foram: {'subsample': 0.7, 'min_child_weight': 7, 'max_depth': 7, 'lambda': 1.0, 'gamma': 1.0, 'eta': 0.05, 'colsample_bytree': 1.0, 'alpha': 0} 

Roc auc score do modelo: 0.8948228691843373
Acurácia de teste: 78.03%.
Acurácia de treino: 81.69%
 
 Classification Report: 
 
              precision    recall  f1-score   support

       Brown       0.87      0.89      0.88        97
         BGG       0.76      0.92      0.83        86
         Int       0.45      0.23      0.30        40

    accuracy                           0.78       223
   macro avg       0.69      0.68      0.67       223
weighted avg       0.75      0.78      0.76       223



## Salvando o Modelo

In [14]:
best_model = best_model01

In [18]:
pipe = Pipeline([
    ('ohe', ohe), 
    ('model', best_model)
    ])

In [19]:
filename = 'app_streamlit/best_model.sav'
pickle.dump(pipe, open(filename, 'wb'))

In [21]:
pipe.classes_

array(['BGG', 'Brown', 'Int'], dtype=object)