# XGBoost

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, accuracy_score

#### Read data

In [None]:
# Load the prepared training set. NA detection is turned off so no cell is
# converted to NaN on read -- presumably so the literal label 'None' used by
# the `injection` column stays a string (TODO confirm against the CSV).
df = pd.read_csv(
    '../../../datasets/parte2/treino/dataset_prepared.csv',
    na_filter=False,
)

In [None]:
# Integer code for each text label of the target, assigned in list order (0-4).
replace_map = {
    label: code
    for code, label in enumerate(['None', 'Low', 'Medium', 'High', 'Very High'])
}

# Swap each label for its code, then cast the column to a plain integer dtype.
injection_codes = df['injection'].replace(replace_map)
df['injection'] = injection_codes.astype(int)

#### X and y arrays

In [None]:
# Target: the integer-encoded `injection` column.
y = df['injection']
# Features: every other column of the prepared dataset.
X = df.drop(columns=['injection'])

#### Train Test Split

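Now let's split the data into a training set and a testing set. We will train our model on the training set and then use the test set to evaluate the model. (Note: the split in the next cell is currently commented out, so the grid search below is fitted on the full dataset and evaluated through cross-validation only.)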

In [None]:
# NOTE: the hold-out split is disabled; the grid search fits on the full X, y
# and relies on its own cross-validation folds instead.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2023)

#### Training 

Using GridSearchCV to find the best hyperparameters

In [None]:
# Base estimator with a fixed seed for reproducibility.
model = XGBClassifier(random_state=2023)

# Candidate values per hyperparameter: 3 * 5 * 3 * 3 * 3 = 405 combinations.
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2],
}

# Exhaustive search with 5-fold cross-validation, using all available cores.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# `score` method; refit=True retrains the best candidate on the full X, y.
# (scoring='f1' was considered; note that the plain 'f1' scorer is binary-only,
# so a multiclass target would need an averaged variant such as 'f1_macro'.)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    refit=True,
    verbose=1,
)
grid_search.fit(X, y)

Inspect the best parameters

In [None]:
# Hyperparameter combination that obtained the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Mean cross-validated score of the best combination (default scorer, since
# no `scoring` argument was passed to GridSearchCV).
grid_search.best_score_

Get the best estimator

In [None]:
# With refit=True the search has already retrained this estimator on the full
# X, y using the best parameters, so it is ready for prediction.
model = grid_search.best_estimator_

Get the predictions using the trained model

In [None]:
# Disabled: X_test only exists when the train/test split cell is enabled.
#predictions = model.predict(X_test)

#### Model Evaluation

Classification report

In [None]:
# Disabled: requires y_test and predictions from the hold-out evaluation path.
#print(classification_report(y_test, predictions))

Accuracy Score

In [None]:
# Disabled: requires y_test and predictions from the hold-out evaluation path.
#accuracy_score(y_test, predictions)

Confusion Matrix

In [None]:
# Disabled: requires y_test and predictions from the hold-out evaluation path.
#ConfusionMatrixDisplay.from_predictions(y_test, predictions)
#plt.show()

In [None]:
# Load the prepared test set used for the submission.
df_test = pd.read_csv('../../../datasets/parte2/teste/dataset_prepared.csv')

# Fail early, with a readable message, if the test file lacks any feature the
# model was trained on.
missing_features = X.columns.difference(df_test.columns)
if len(missing_features) > 0:
    raise ValueError(
        f'Test set is missing training features: {list(missing_features)}'
    )

# Predict on exactly the training feature columns, in training order, so the
# input matches what the model was fitted on even if the test CSV carries
# extra columns or a different column order.
predictions_teste = model.predict(df_test[X.columns])


In [None]:
# Inverse of the target encoding: integer class code -> original text label.
replace_map = {0: 'None', 1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}

# Submission layout: a 1-based RowId followed by the predicted class.
df_predictions = pd.DataFrame({
    'RowId': range(1, len(predictions_teste) + 1),
    'Result': predictions_teste,
})

# Turn the predicted codes back into their labels before exporting.
df_predictions['Result'] = df_predictions['Result'].replace(replace_map)

# Write the submission file without the DataFrame index column.
df_predictions.to_csv('../../../datasets/parte2/teste/kaggle-xgboost.csv', index=False)