# XGBoost

#### Import libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, accuracy_score, make_scorer, f1_score

#### Read data

In [None]:
# Load the prepared training set. NA detection is switched off, so every cell
# is read literally; presumably this keeps the 'None' class label from being
# parsed as a missing value -- TODO confirm against the CSV.
train_csv = '../../../datasets/parte2/train_prepared.csv'
df = pd.read_csv(filepath_or_buffer=train_csv, na_filter=False)

#### Turn the categorical target into numeric

In [None]:
# Integer code assigned to each label of the target column.
replace_map = {'None': 0, 'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}

# Series.map is used instead of Series.replace: swapping strings for ints via
# replace() relies on implicit downcasting, which pandas has deprecated
# (FutureWarning since pandas 2.2).
encoded_target = df['Injeção na rede (kWh)'].map(replace_map)

# map() yields NaN for any value that is not a key of replace_map. Fail with a
# message naming the offending labels rather than with an opaque cast error.
# NOTE: this cell expects the raw string labels; reload the CSV before
# re-running it on an already-encoded column.
if encoded_target.isna().any():
    unknown_labels = (
        df.loc[encoded_target.isna(), 'Injeção na rede (kWh)'].unique().tolist()
    )
    raise ValueError(f"Unexpected target labels: {unknown_labels}")

df['Injeção na rede (kWh)'] = encoded_target.astype(int)

#### X and y arrays

In [None]:
# Split the frame into the target (y) and everything else (X).
target_column = 'Injeção na rede (kWh)'
y = df[target_column]
X = df.drop(columns=[target_column])

#### Training 

##### Grid Search

Use GridSearchCV (10-fold cross-validation) to find the best hyperparameters.

In [None]:
# Candidate values per hyperparameter. Tree depth and the number of boosting
# rounds are held at a single value; the other four settings are varied.
param_grid = dict(
    max_depth=[4],
    min_child_weight=[1, 2, 3],
    subsample=[0.5, 0.6, 0.7],
    colsample_bytree=[0.6, 0.8, 1],
    learning_rate=[0.01, 0.1],
    n_estimators=[100],
)

# Base classifier with a fixed seed so repeated runs are comparable.
model = XGBClassifier(random_state=2023)

# No `scoring` is passed, so candidates are ranked by the estimator's own
# score method. A weighted-F1 alternative would be
# make_scorer(f1_score, average='weighted') supplied as `scoring`.
search_options = {
    'refit': True,           # retrain the best candidate on all of X, y
    'cv': 10,                # 10-fold cross-validation
    'verbose': 0,
    'n_jobs': -1,            # use every available core
    'error_score': "raise",  # surface fit failures instead of scoring them NaN
}
grid_search = GridSearchCV(model, param_grid, **search_options)
grid_search.fit(X, y)

Inspect the best parameters

In [None]:
# Display the winning hyperparameter combination.
# Best recorded in an earlier run: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
# NOTE(review): n_estimators=200 is not among the current param_grid candidates,
# so that recorded result predates the present grid.
grid_search.best_params_

In [None]:
# Display the mean cross-validated score of the best parameter combination.
grid_search.best_score_ 

Get the best estimator

In [None]:
# Rebind `model` to the winning estimator; because the search was created with
# refit=True, it has already been retrained on the full X, y.
model = grid_search.best_estimator_

Get the predictions using the trained model

In [None]:
#predictions = model.predict(X)

##### Model

In [None]:
#model = XGBClassifier(random_state=2023, learning_rate=0.01, colsample_bytree=0.9, max_depth=4, n_estimators=300, subsample=0.7)

In [None]:
#model.fit(X, y)

#### Model Evaluation

In [None]:
#scores = cross_val_score(model, X, y, cv=10, n_jobs=-1)

In [None]:
#print(scores)

In [None]:
#print("Mean accuracy:", scores.mean())

In [None]:
# Load the prepared test set and predict its (integer-encoded) classes.
# NOTE(review): unlike the training read, NA detection is left enabled here --
# confirm that this difference is intended.
test_csv = '../../../datasets/parte2/test_prepared.csv'
df_test = pd.read_csv(filepath_or_buffer=test_csv)
predictions_test = model.predict(df_test)

In [None]:
# Inverse of the target encoding: integer class code -> original label.
# (This rebinds `replace_map`, which earlier held the label -> code mapping.)
replace_map = {0: 'None', 1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}

# Build the submission table directly in its final column order, numbering
# the rows from 1.
row_ids = range(1, len(predictions_test) + 1)
df_predictions = pd.DataFrame({'RowId': row_ids, 'Result': predictions_test})

# Turn the predicted codes back into their text labels.
df_predictions['Result'] = df_predictions['Result'].replace(replace_map)

# Write the submission file without the pandas index column.
submission_csv = '../../../datasets/parte2/kaggle-senra-sem-mes-cv10.csv'
df_predictions.to_csv(submission_csv, index=False)