In [1]:
import numpy as np

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split    
from sklearn.metrics import confusion_matrix
from sklearn import datasets

from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.ensemble import RandomForestClassifier


In [2]:
# Carregar o dataset MNIST
digits = datasets.load_digits(n_class=10)

images=digits.images
targets=digits.target
images=images.reshape(len(images),8*8)

X = digits.data
y = digits.target

# Dividir o dataset em treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(images, targets, test_size=0.2)

In [3]:
# Definir o grid de parâmetros para busca
param_grid = {
    'n_estimators': [100, 200, 300],        # Número de árvores na floresta
    'max_depth': [None, 10, 20, 30],        # Profundidade máxima das árvores
    'min_samples_split': [2, 5, 10],        # Número mínimo de amostras necessárias para dividir um nó
    'min_samples_leaf': [1, 2, 4],          # Número mínimo de amostras necessárias para estar em uma folha
    'max_features': ['auto', 'sqrt', 'log2'],  # Número de recursos considerados para encontrar a melhor divisão
    'criterion': ['gini', 'entropy']        # Função para medir a qualidade de uma divisão
}

# Configurar o classificador
rf = RandomForestClassifier(random_state=42)

# Configurar o GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=0)

# Treinar o modelo usando GridSearchCV
grid_search.fit(X_train, y_train)

# Prever as classes para o conjunto de teste com o melhor modelo encontrado
y_pred = grid_search.best_estimator_.predict(X_test)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
[CV] END criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estim

1080 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1080 fits failed with the following error:
Traceback (most recent call last):
  File "/home/codespace/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/codespace/.local/lib/python3.10/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/home/codespace/.local/lib/python3.10/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/codespace/.local/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parame

In [4]:
# Melhor combinação de parâmetros
print("Melhores parâmetros encontrados:", grid_search.best_params_)

# Imprimir a precisão do RandomForestClassifier com os melhores parâmetros
print("Precisão com os melhores parâmetros:", grid_search.best_score_)

acuracia = accuracy_score(y_test, y_pred)
print("Acurácia: %.2f%%" % (acuracia * 100.0))

confusion = confusion_matrix(y_test, y_pred)
print('Matriz de Confusão\n')
print(confusion)

Melhores parâmetros encontrados: {'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Precisão com os melhores parâmetros: 0.9763380758807589
Acurácia: 98.33%
Matriz de Confusão

[[36  0  0  0  0  0  0  0  0  0]
 [ 0 43  0  0  0  0  0  0  0  0]
 [ 0  0 31  0  0  0  0  0  0  0]
 [ 1  0  0 28  0  0  0  0  0  0]
 [ 0  0  0  0 43  0  0  0  1  0]
 [ 0  0  0  0  1 32  0  0  0  0]
 [ 0  0  0  0  0  0 29  0  0  0]
 [ 0  0  0  0  0  0  0 39  0  1]
 [ 0  0  0  0  0  1  0  0 35  0]
 [ 0  0  0  1  0  0  0  0  0 38]]
