In [1]:
import pandas as pd

import xgboost as xgb

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Train

Neste notebook iremos realizar o treinamento de um modelo de machine learning para que a partir dos dados de uma carta de God Unchained seja possível classificar sua estratégia.

## Tabela de Conteúdo

1. [Feature Engineering](#feature_engineering)
2. [Treinamento](#train)
    * [Random Forest](#random_forest)
    * [XGBoost](#xgboost)
3. [Avaliação dos Modelos](#model_evaluation)

In [2]:
df = pd.read_csv("../data/train.csv")

In [3]:
X = df.drop(["id", "name", "strategy"], axis = 1)
y = df["strategy"].map({"early": 0, "late": 1})

### Funções Auxiliares

In [4]:
def get_selected_features_names(grid_search):
    feature_names = grid_search.best_estimator_[0].get_feature_names()
    selected_features_bool_index = grid_search.best_estimator_[1].get_support()
    
    features_dict = dict(zip(feature_names, selected_features_bool_index))
    selected_features_names = [key for key, value in features_dict.items() if value]
    
    return selected_features_names

def create_grid_search_report(grid_search_dict):
    rows = []
    for model_name, grid_search in grid_search_dict.items():
        values = {
            "model_name": model_name,
            "selected_features": get_selected_features_names(grid_search),
            "best_score": grid_search.best_score_,
            "best_params": grid_search.best_params_
        }
        rows.append(values)
        
    return pd.DataFrame(rows)

## Feature Engineering <a name="feature_engineering"></a>

O primeiro passo para realizarmos o treinamento do modelo é fazer o tratamento dos dados. Na etapa abaixo iremos transformar as váraveis categoricas **type** e **god** em colunas binárias utizando uma técnica chamada de **One Hot Encoding**.

In [5]:
preprocess_column_transfomer = ColumnTransformer(
    transformers = [
        ("One Hot Encoder Type", OneHotEncoder(), [3]),
        ("One Hot Encoder God", OneHotEncoder(), [4]),
    ],
    remainder="passthrough"
)

## Treinamento <a name="train"></a>

O treinamento do nosso modelo será feito utilizando o Sklearn pipeline e irá possuir três etapas:
* Pré-processamento
* Feature Selection
* Modelos

E a partir do pipeline, rodaremos um **Grid Search** com a técnica de **Cross Validation**, pará escolher o modelo com melhores parâmetros.

## Random Forest <a name="random_forest"></a>

In [6]:
random_forest_model = RandomForestClassifier()
random_forest_pipeline = Pipeline(
    steps = [
        ("preprocessing", preprocess_column_transfomer),
        ("feature_selection", SelectFromModel(random_forest_model)),
        ("random_forest", random_forest_model),
    ]
)

random_forest_grid_search = GridSearchCV(
    random_forest_pipeline,
    param_grid={
        "random_forest__bootstrap": [True, False], 
        "random_forest__n_estimators": [10, 100],
        "random_forest__max_features": ['auto', 'sqrt'],
        "random_forest__max_depth": [10, 50],
        "random_forest__min_samples_split": [5, 10],
        "random_forest__min_samples_leaf": [1, 4]
    },
    cv=3,
    scoring='roc_auc',
    n_jobs=-1
)

random_forest_grid_search.fit(X, y)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('One '
                                                                         'Hot '
                                                                         'Encoder '
                                                                         'Type',
                                                                         OneHotEncoder(),
                                                                         [3]),
                                                                        ('One '
                                                                         'Hot '
                                                                         'Encoder '
                                                                         'God',
                       

## XGBoost <a name="xgboost"></a>

In [7]:
xgboost_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgboost_pipeline = Pipeline(
    steps = [
        ("preprocessing", preprocess_column_transfomer),
        ("feature_selection", SelectFromModel(xgboost_model)),
        ("xgboost", xgboost_model),
    ]
)

xgboost_grid_search = GridSearchCV(
    xgboost_pipeline,
    param_grid = {
        "xgboost__min_child_weight": [1, 5, 10], 
        "xgboost__gamma": [1, 2, 5],
        "xgboost__subsample": [0.6, 0.8, 1.0],
        "xgboost__max_depth": [3, 4, 5],
    },
    cv = 3,
    scoring='roc_auc',
    n_jobs=-1
)

xgboost_grid_search.fit(X, y)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('One '
                                                                         'Hot '
                                                                         'Encoder '
                                                                         'Type',
                                                                         OneHotEncoder(),
                                                                         [3]),
                                                                        ('One '
                                                                         'Hot '
                                                                         'Encoder '
                                                                         'God',
                       

## Avaliação dos Modelos <a name="model_evaluation"></a>

Para a avaliação dos modelos iremos utilizar a metrica chamada **Curva Roc**, que tem como principal objetivo medir a acertividade do modelo.

In [8]:
create_grid_search_report({
        "random_forest": random_forest_grid_search,
        "xgboost": xgboost_grid_search
})

Unnamed: 0,model_name,selected_features,best_score,best_params
0,random_forest,"[mana, attack, health]",0.999102,"{'random_forest__bootstrap': True, 'random_for..."
1,xgboost,"[mana, attack, health]",0.998525,"{'xgboost__gamma': 1, 'xgboost__max_depth': 4,..."


Como podemos ver o modelo que teve o melhor resultado foi a **Random Forest** com a **Curva Roc** de 0.99.