# XGBoost

#### Import libraries

In [65]:
import matplotlib.pyplot as plt
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, accuracy_score, make_scorer, f1_score

#### Read data

In [66]:
# Load the prepared training data.
# na_filter=False switches off pandas' missing-value detection for this read.
# NOTE(review): presumably needed so the label string 'None' in the target column
# stays text instead of being parsed as NaN — confirm against the pandas version in use.
df = pd.read_csv(
    '../../../datasets/parte2/train_prepared.csv',
    na_filter=False,
)

#### Encode the categorical target as ordinal integers

In [67]:
# Ordinal encoding of the target labels: the position in this list is the integer code
# ('None' -> 0, 'Low' -> 1, 'Medium' -> 2, 'High' -> 3, 'Very High' -> 4).
replace_map = {
    label: code
    for code, label in enumerate(['None', 'Low', 'Medium', 'High', 'Very High'])
}

# Swap each label for its code, then cast the column to int.
df['Injeção na rede (kWh)'] = (
    df['Injeção na rede (kWh)']
    .replace(replace_map)
    .astype(int)
)

#### X and y arrays

In [68]:
# Target vector: the 'Injeção na rede (kWh)' column.
y = df['Injeção na rede (kWh)']
# Feature matrix: every other column of df.
X = df.drop(columns=['Injeção na rede (kWh)'])

#### Training 

##### Grid Search

Using GridSearchCV to find the best hyperparameters

In [69]:
# Hyperparameter search space: 1 * 3 * 3 * 3 * 2 * 1 = 54 combinations.
param_grid = dict(
    max_depth=[4],
    min_child_weight=[1, 2, 3],
    subsample=[0.5, 0.6, 0.7],
    colsample_bytree=[0.6, 0.8, 1],
    learning_rate=[0.01, 0.1],
    n_estimators=[100],
)

# Base classifier with a fixed random_state.
model = XGBClassifier(random_state=2023)

# Alternative weighted-F1 scorer, left disabled:
# f1_scorer = make_scorer(f1_score, average='weighted')

# Exhaustive search with 5-fold cross-validation on all available cores.
# No `scoring` argument is passed (a `scoring='f1'` option was left commented out),
# so candidates are ranked by the estimator's default score.
# refit=True retrains the best combination on the full X, y once the search ends;
# error_score="raise" makes any failing fit raise instead of being scored.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
    error_score="raise",
)
grid_search.fit(X, y)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Inspect the best parameters

In [70]:
# Show the winning hyperparameter combination.
# NOTE(review): the value noted here earlier ("Melhor" = best) was
# {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200};
# n_estimators=200 is not in the current param_grid, so that note presumably
# refers to an older grid — confirm or remove.
grid_search.best_params_

{'colsample_bytree': 0.8,
 'learning_rate': 0.1,
 'max_depth': 4,
 'min_child_weight': 2,
 'n_estimators': 100,
 'subsample': 0.5}

In [71]:
# Show the mean cross-validated score of the best combination.
# NOTE(review): values noted here earlier were 0.8936697222898247 // 0.8936689393880897
# (the first was written as "0.0.8936697222898247", presumably a typo); they look
# like results of previous runs — confirm or remove.
grid_search.best_score_

0.8616576307015759

Get the best estimator

In [72]:
# Keep the winning estimator. The search was created with refit=True, so
# best_estimator_ has been refit on the full X, y using the best parameters.
model = grid_search.best_estimator_

Get the predictions using the trained model

In [73]:
#predictions = model.predict(X)

##### Model

In [74]:
#model = XGBClassifier(random_state=2023, learning_rate=0.01, colsample_bytree=0.9, max_depth=4, n_estimators=300, subsample=0.7)

In [75]:
#model.fit(X, y)

#### Model Evaluation

In [76]:
#scores = cross_val_score(model, X, y, cv=10, n_jobs=-1)

In [77]:
#print(scores)

In [78]:
#print("Mean accuracy:", scores.mean())

In [79]:
# Load the prepared test file and predict its classes with the fitted `model`.
# NOTE(review): this read keeps pandas' default NA filtering, whereas the training
# read passes na_filter=False — confirm both files are parsed consistently.
df_test = pd.read_csv(
    '../../../datasets/parte2/test_prepared.csv'
)
predictions_test = model.predict(df_test)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [80]:
# Build the output table: a 1-based RowId next to the predicted class code.
df_predictions = pd.DataFrame({
    'RowId': range(1, len(predictions_test) + 1),
    'Result': predictions_test,
})

# Integer code -> label; the reverse of the encoding applied to the training target.
replace_map = dict(enumerate(['None', 'Low', 'Medium', 'High', 'Very High']))

df_predictions['Result'] = df_predictions['Result'].replace(replace_map)

# Write the table to CSV without the DataFrame index.
df_predictions.to_csv(
    '../../../datasets/parte2/kaggle-xgboost-holidays.csv',
    index=False,
)