# Model Tuning no XGBoost

## Projeto de Mapeamento Marinho Apresentado Nas Jornadas de Engenharia Hidrográfica

## Diogo Ceddia Porto Silva

#### Importação de bibliotecas

In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import time
import numpy as np

#### Carregando dado

In [2]:
dado_linhas = pd.read_csv("C:\DCPS\GitHub\Dados_MapeamentoIA_JEH\dado_linhas.txt")

#### Separando target/feature e treino/validação

In [3]:
features = ['z','bs','sl','as','dist_lc','dist_ds']
target = ['classe']

X = dado_linhas[features]
y = dado_linhas[target]

X = StandardScaler().fit_transform(X)
X = pd.DataFrame(X,columns=features)

train_size=0.2
test_size=0.8
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0,
                                                        train_size=train_size, 
                                                        test_size=test_size, 
                                                        stratify=y.classe)

#### Definição do modelo

In [4]:
model = XGBClassifier()

#### Treinamento Preliminar

In [5]:
start = time.time()
model.fit(X_train, y_train.values.ravel(), eval_metric='merror')
y_pred = model.predict(X_valid)
end=time.time()
y_pred = pd.DataFrame(y_pred,columns=['classe'])

print('Tempo de treinamento',end-start,'segundos')
print(f'{train_size*100}% para treinar')
print(f'{test_size*100}% para validar')

# Métrica de acurácia considerando o dado desbalanceado
precisao = metrics.balanced_accuracy_score(y_valid, y_pred)
print(f'\nA precisão do modelo é de {(precisao*100).round(3)}%')

Tempo de treinamento 3.1112594604492188 segundos
20.0% para treinar
80.0% para validar

A precisão do modelo é de 98.876%


###### Todos os parâmetros possíveis

#### Por tentativa e erro...

In [6]:
learning_rate = np.arange(0.61,  0.63, 0.01).tolist()   #aka eta
max_depth     = np.arange(6   ,    9,    1).tolist()
n_estimators  = np.arange(1300  ,  1330,  10).tolist()

In [7]:
print('learning_rate:',learning_rate)
print('max_depth:',max_depth)
print('n_estimators:',n_estimators)

learning_rate: [0.61, 0.62, 0.63]
max_depth: [6, 7, 8]
n_estimators: [1300, 1310, 1320]


In [8]:
qtd_fits = len(learning_rate)*len(max_depth)*len(n_estimators)
print(qtd_fits)

27


#### Tuning

In [9]:
params = {
            'learning_rate': learning_rate,   # aka 'eta' 
            'max_depth': max_depth, 
            'n_estimators': n_estimators,
            #'max_depth': [8],
            #'learning_rate': [0.61],
            #'n_estimators': [1310],
            #'subsample': np.arange(0.8, 1.2, 0.1).tolist(),  
            #'colsample_bytree': np.arange(0.4, 1.6, 0.2).tolist(),
        }

In [10]:
grid_search = GridSearchCV(model, param_grid=params,cv=5, n_jobs=-1, scoring='balanced_accuracy')
grid_search.fit(X_train, y_train, eval_metric='merror')

print(grid_search.best_params_)
print(np.round(100*grid_search.score(X, y),5),'%')

{'learning_rate': 0.62, 'max_depth': 7, 'n_estimators': 1310}
99.47097 %
