# XGBoost

#### Import libraries

In [479]:
import matplotlib.pyplot as plt
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, accuracy_score, make_scorer, f1_score

#### Read data

In [480]:
# Load the prepared training set.
# na_filter=False switches off pandas' missing-value detection, so every cell is
# kept exactly as written in the CSV (presumably so that the 'None' label of the
# 'injection' column is not parsed as NaN — TODO confirm).
train_csv = '../../../datasets/parte2/train_prepared.csv'
df = pd.read_csv(train_csv, na_filter=False)

#### Encode the categorical target (`injection`) as ordinal integers

In [481]:
# Ordinal encoding of the 'injection' labels, from the lowest to the highest level.
replace_map = {'None':0, 'Low':1, 'Medium':2, 'High':3, 'Very High':4}

# Look every label up explicitly instead of calling Series.replace: replacing
# strings with ints depends on an implicit object -> int downcast that pandas 2.2
# deprecated (it emits a FutureWarning). Values that are not keys of replace_map
# pass through unchanged, so re-running this cell on an already-encoded column
# is harmless.
injection_codes = df['injection'].map(lambda label: replace_map.get(label, label))

# Fail early, naming the offending values, if anything other than the five
# known classes is left after the lookup.
unexpected_labels = injection_codes[~injection_codes.isin(set(replace_map.values()))].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'injection': {unexpected_labels.tolist()}; "
        f"expected one of {list(replace_map)}"
    )

df['injection'] = injection_codes.astype(int)

#### X and y arrays

In [482]:
# Target vector: the integer-encoded 'injection' column.
y = df['injection']

# Feature matrix: every column of df except the target.
X = df.drop(columns=['injection'])

#### Training 

##### Grid Search

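GridSearchCV was used to find the best hyperparameters. The search cell below is kept for reference only: it is wrapped in a string literal, so it does not run.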

In [483]:
# Disabled hyperparameter search: the whole cell is a single string literal, so
# none of the code inside it is executed. Because the string is the last
# expression of the cell, the notebook simply echoes it back as the cell output.
# NOTE(review): the quoted code fits on X_train / y_train, which are not defined
# anywhere in the visible part of this notebook (only X and y are) — create a
# train/test split or switch to X / y before re-enabling it.
"""
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4],
    #'min_child_weight': [5, 6, 7],
    #'colsample_bytree': [0.7, 0.8, 0.9], # 0 ou 0,2 ou 0.7
    #'subsample': [0.7, 0.8, 0.9],
    #'gamma': [0.5, 1, 1.5, 2, 5],
}

model = XGBClassifier(random_state=2023)

#f1_scorer = make_scorer(f1_score, average='weighted')

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, refit=True, cv=15, verbose=0, n_jobs=-1, error_score="raise") # scoring='f1'
grid_search.fit(X_train, y_train)
"""

'\nparam_grid = {\n    \'learning_rate\': [0.01, 0.1, 0.2],\n    \'n_estimators\': [100, 200, 300],\n    \'max_depth\': [3, 4],\n    #\'min_child_weight\': [5, 6, 7],\n    #\'colsample_bytree\': [0.7, 0.8, 0.9], # 0 ou 0,2 ou 0.7\n    #\'subsample\': [0.7, 0.8, 0.9],\n    #\'gamma\': [0.5, 1, 1.5, 2, 5],\n}\n\nmodel = XGBClassifier(random_state=2023)\n\n#f1_scorer = make_scorer(f1_score, average=\'weighted\')\n\ngrid_search = GridSearchCV(estimator=model, param_grid=param_grid, refit=True, cv=15, verbose=0, n_jobs=-1, error_score="raise") # scoring=\'f1\'\ngrid_search.fit(X_train, y_train)\n'

Inspect the best parameters

In [484]:
#grid_search.best_params_ # Best: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}

In [485]:
#grid_search.best_score_ # 0.8936697222898247 // 0.8936689393880897

Get the best estimator

In [486]:
#model = grid_search.best_estimator_

Get the predictions using the trained model

In [487]:
#predictions = model.predict(X)

##### Model

In [488]:
# Hyperparameters of the final classifier, gathered in one place for readability.
# NOTE(review): learning_rate and n_estimators differ from the grid-search
# optimum recorded in the commented-out cell earlier in this notebook
# (learning_rate=0.1, n_estimators=200) — confirm this is intentional.
xgb_params = {
    'random_state': 2023,
    'learning_rate': 0.01,
    'colsample_bytree': 0.9,
    'max_depth': 4,
    'n_estimators': 300,
    'subsample': 0.7,
}
model = XGBClassifier(**xgb_params)

In [489]:
# Fit the classifier on the complete training set (all rows of X / y, no
# hold-out); this fitted model is the one used further down to predict the
# test set.
model.fit(X, y)

#### Model Evaluation

In [490]:
# 10-fold cross-validation of the configured model on the training data.
# No `scoring` is given, so the estimator's default score is used; n_jobs=-1
# runs the folds in parallel on all available processors.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    cv=10,
    n_jobs=-1,
)

In [491]:
# Show the individual score obtained on each of the cross-validation folds.
print(scores)

[0.83393829 0.85753176 0.88203267 0.88203267 0.89201452 0.86842105
 0.8773842  0.90190736 0.86103542 0.88192552]


In [492]:
# Collapse the per-fold scores into a single summary figure and report it.
mean_accuracy = scores.mean()
print("Mean accuracy:", mean_accuracy)

Mean accuracy: 0.8738223459616814


In [493]:
# Load the prepared test set and label it with the model fitted on the full
# training data.
# NOTE(review): unlike the training CSV, this file is read with pandas' default
# missing-value detection (no na_filter=False) — confirm the difference is
# intentional and that both files yield the same column dtypes.
test_csv = '../../../datasets/parte2/test_prepared.csv'
df_test = pd.read_csv(test_csv)
predictions_test = model.predict(df_test)

In [494]:
# Inverse of the target encoding: integer class codes back to their text labels.
replace_map = {0: 'None', 1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}

# Build the submission table directly in its final column order:
# a 1-based row identifier followed by the predicted label.
df_predictions = pd.DataFrame({
    'RowId': range(1, len(predictions_test) + 1),
    'Result': pd.Series(predictions_test).replace(replace_map),
})

# Write the submission file without the DataFrame index.
submission_csv = '../../../datasets/parte2/kaggle-xgboost-sem4-semmonth-semwind_speed-ohe-configmodel.csv'
df_predictions.to_csv(submission_csv, index=False)