# XGBoost

#### Import libraries

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, accuracy_score, make_scorer, f1_score

#### Read data

In [4]:
# Load the prepared training set.
# NOTE(review): na_filter=False turns off pandas' NA detection; presumably this keeps the
# literal class label 'None' as a string instead of having it parsed as missing - confirm.
df = pd.read_csv(
    filepath_or_buffer='../../../datasets/parte2/train_prepared.csv',
    na_filter=False,
)

#### Turn the categorical target into numeric

In [5]:
# Ordinal encoding of the 'injection' labels: the position in this list is the numeric code
# ('None' -> 0 ... 'Very High' -> 4).
replace_map = {
    label: level
    for level, label in enumerate(['None', 'Low', 'Medium', 'High', 'Very High'])
}

# Swap each label for its code, then force an integer dtype.
df['injection'] = (
    df['injection']
    .replace(replace_map)
    .astype(int)
)

#### X and y arrays

In [6]:
# Target vector: the 'injection' column.
y = df['injection']
# Feature matrix: every column except the target.
X = df.drop(columns=['injection'])

#### Training 

##### Grid Search

Using GridSearchCV to find the best hyperparameters

In [7]:

# Hyperparameter search space: 2 * 3 * 3 * 2 * 2 = 72 candidate combinations.
# Disabled candidates, kept for reference:
#   'min_child_weight': [5, 6, 7]
#   'gamma': [0.5, 1, 1.5, 2, 5]
param_grid = dict(
    learning_rate=[0.01, 0.1],
    n_estimators=[100, 200, 300],
    max_depth=[4, 5, 7],
    colsample_bytree=[0.8, 0.9],  # other values noted: 0 or 0.2 or 0.7
    subsample=[0.7, 0.8],
)

# Base estimator with a fixed seed.
model = XGBClassifier(random_state=2023)

# Exhaustive search with 10-fold cross-validation (72 candidates x 10 folds = 720 fits,
# plus one final refit of the best candidate because refit=True).
# No `scoring` is passed, so the estimator's own default score is used.
# Alternatives that were tried and disabled:
#   f1_scorer = make_scorer(f1_score, average='weighted')
#   scoring='f1'
# error_score="raise" makes any failing fit abort the search instead of being scored as NaN.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=10,
    refit=True,
    n_jobs=-1,
    verbose=0,
    error_score="raise",
)
grid_search.fit(X, y)


KeyboardInterrupt: 

Inspect the best parameters

In [None]:
grid_search.best_params_ # Best (noted from a previous run): {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}

In [None]:
grid_search.best_score_ # Scores noted from previous runs: 0.8936697222898247 // 0.8936689393880897

Get the best estimator

In [None]:
# The search was created with refit=True, so best_estimator_ is the best configuration
# retrained on the full (X, y); reuse it as the working model.
model = grid_search.best_estimator_

Get the predictions on the training data using the trained model

In [None]:
# NOTE: X is the same data the grid search was fitted on, so these are training-set
# predictions and do not estimate performance on unseen data.
predictions = model.predict(X)

##### Model

In [None]:
#model = XGBClassifier(random_state=2023, learning_rate=0.01, colsample_bytree=0.9, max_depth=4, n_estimators=300, subsample=0.7)

In [None]:
#model.fit(X, y)

#### Model Evaluation

In [None]:
#scores = cross_val_score(model, X, y, cv=10, n_jobs=-1)

In [None]:
#print(scores)

In [None]:
#print("Mean accuracy:", scores.mean())

In [None]:
# Load the prepared test set (default NA handling, unlike the training read).
df_test = pd.read_csv(
    filepath_or_buffer='../../../datasets/parte2/test_prepared.csv',
)

# Predict the encoded class for every test row.
# NOTE(review): assumes df_test holds exactly the feature columns of X, in the same order - confirm.
predictions_test = model.predict(df_test)

In [None]:
# Inverse of the target encoding: numeric class code -> original label.
replace_map = {0: 'None', 1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}

# Submission frame: a 1-based RowId followed by the predicted class.
df_predictions = pd.DataFrame({
    'RowId': range(1, len(predictions_test) + 1),
    'Result': predictions_test,
})

# Turn the numeric codes back into label strings.
df_predictions['Result'] = df_predictions['Result'].replace(replace_map)

# Write the submission file without the pandas index column.
df_predictions.to_csv(
    '../../../datasets/parte2/kaggle-xgboost-sem4-semmonth-semwind_speed-newcolumns-ohe.csv',
    index=False,
)