In [1]:
import sys
import numpy as np
# Raise the summarization threshold to the largest int so NumPy arrays are printed in full.
np.set_printoptions(threshold=sys.maxsize)
import pandas as pd
# Lift pandas' display limits: show every column and row, and up to 1000 characters per cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)
import seaborn as sns
sns.set_theme(style="darkgrid")
# NOTE(review): PolynomialFeatures, PCA, RandomizedSearchCV, DummyClassifier and
# XGBClassifier are not used in the cells visible here -- presumably they are used
# further down the notebook; confirm before pruning.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier

Fonte dos dados: competição Kaggle "ICR - Identifying Age-Related Conditions" — https://www.kaggle.com/competitions/icr-identify-age-related-conditions/data

In [2]:
# Load the training table; the first CSV column is used as the row index.
TRAIN_CSV = 'train.csv'
train = pd.read_csv(TRAIN_CSV, index_col=0)
# Cell output: (rows, columns) of the loaded table.
train.shape

(617, 57)

In [3]:
# Separate the target column from the remaining columns of the training table.
target_column = "Class"
y = train[target_column]
X = train.drop(columns=[target_column])

Vamos seguir o exemplo de código encontrado em https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html.

In [4]:
# Split the feature columns by dtype: float columns go through the numeric
# branch, object (string) columns through the categorical branch.
float_columns = X.select_dtypes(include=['float']).columns
object_columns = X.select_dtypes(include=['object']).columns
numeric_features = float_columns.to_list()
categorical_features = object_columns.to_list()

# Numeric branch: median imputation followed by standardization.
impute_step = ("imputer", SimpleImputer(strategy="median"))
scale_step = ("scaler", StandardScaler())
numeric_transformer = Pipeline([impute_step, scale_step])

# Categorical branch: one-hot encoding with the first level of each feature dropped.
encode_step = ("encoder", OneHotEncoder(drop='first'))
categorical_transformer = Pipeline([encode_step])

# Route each column group to its own branch.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [5]:
# Full model: preprocessing followed by a random forest with default settings.
# The step name "forest" is the prefix used by the "forest__*" keys of the
# search grid defined in a later cell.
model_steps = [
    ("preprocessor", preprocessor),
    ("forest", RandomForestClassifier()),
]
clf = Pipeline(model_steps)

In [6]:
# Show the assembled pipeline as this cell's output.
clf

In [7]:
# List the parameters exposed by the pipeline (top-level entries such as
# 'memory', 'steps' and 'verbose' appear in the output below).
clf.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('scaler', StandardScaler())]),
                                    ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY',
                                     'AZ', 'BC', 'BD ', 'BN', 'BP', 'BQ', 'BR',
                                     'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL',
                                     'CR', 'CS', 'CU', 'CW ', 'DA', 'DE', 'DF',
                                     'DH', 'DI', ...]),
                                   ('cat',
                                    Pipeline(steps=[('encoder',
                                                     OneHotEncoder(drop='first'))]),
                                    ['EJ'])])),
  ('forest', RandomForestClassifier())],
 'verbose': False,
 

In [8]:
# Random-forest settings explored by the grid search. Every setting has a
# single candidate value, so the search evaluates exactly one configuration.
# NOTE(review): the saved cv_results_ output further down reports
# n_estimators=100 and random_state=None, which does not match the values
# here (1000 and 42) -- that output appears to be stale; re-run to refresh it.
forest_settings = {
    'n_estimators': 1000,
    'criterion': 'gini',
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 'sqrt',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap': True,
    'oob_score': False,
    'n_jobs': None,
    'random_state': 42,
    'verbose': 0,
    'warm_start': False,
    'class_weight': None,
    'ccp_alpha': 0.0,
    'max_samples': None,
}

# Grid keys address the "forest" pipeline step via the "<step>__<param>"
# convention; each value is wrapped in a one-element candidate list.
param_grid = {
    f'forest__{name}': [value]
    for name, value in forest_settings.items()
}

In [9]:
# Scorer names evaluated for every candidate during the search.
metrics = [
    'accuracy',
    'f1',
    'neg_log_loss',
    'precision',
    'recall',
    'roc_auc',
]

In [10]:
# Multi-metric grid search over the pipeline. All scorers in `metrics` are
# recorded; the final refit is chosen by "neg_log_loss". Train-set scores are
# not collected.
search_options = {
    "param_grid": param_grid,
    "scoring": metrics,
    "refit": "neg_log_loss",
    "return_train_score": False,
    "verbose": 1,
}
grid = GridSearchCV(clf, **search_options)

In [11]:
# Run the search on the full feature table and target.
grid.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [12]:
# Tabulate the per-candidate CV results, ordered by their log-loss rank.
cv_results = pd.DataFrame(grid.cv_results_)
cv_results.sort_values(by='rank_test_neg_log_loss')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_forest__bootstrap,param_forest__ccp_alpha,param_forest__class_weight,param_forest__criterion,param_forest__max_depth,param_forest__max_features,param_forest__max_leaf_nodes,param_forest__max_samples,param_forest__min_impurity_decrease,param_forest__min_samples_leaf,param_forest__min_samples_split,param_forest__min_weight_fraction_leaf,param_forest__n_estimators,param_forest__n_jobs,param_forest__oob_score,param_forest__random_state,param_forest__verbose,param_forest__warm_start,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1,split0_test_neg_log_loss,split1_test_neg_log_loss,split2_test_neg_log_loss,split3_test_neg_log_loss,split4_test_neg_log_loss,mean_test_neg_log_loss,std_test_neg_log_loss,rank_test_neg_log_loss,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,split4_test_precision,mean_test_precision,std_test_precision,rank_test_precision,split0_test_recall,split1_test_recall,split2_test_recall,split3_test_recall,split4_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,split3_test_roc_auc,split4_test_roc_auc,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc
0,0.227573,0.008458,0.023509,0.001646,True,0.0,,gini,,sqrt,,,0.0,1,2,0.0,100,,False,,0,False,"{'forest__bootstrap': True, 'forest__ccp_alpha': 0.0, 'forest__class_weight': None, 'forest__criterion': 'gini', 'forest__max_depth': None, 'forest__max_features': 'sqrt', 'forest__max_leaf_nodes': None, 'forest__max_samples': None, 'forest__min_impurity_decrease': 0.0, 'forest__min_samples_leaf': 1, 'forest__min_samples_split': 2, 'forest__min_weight_fraction_leaf': 0.0, 'forest__n_estimators': 100, 'forest__n_jobs': None, 'forest__oob_score': False, 'forest__random_state': None, 'forest__verbose': 0, 'forest__warm_start': False}",0.91129,0.903226,0.910569,0.878049,0.934959,0.907619,0.018239,1,0.666667,0.714286,0.744186,0.516129,0.789474,0.686148,0.093939,1,-0.221253,-0.244804,-0.265383,-0.278951,-0.216295,-0.245337,0.024313,1,1.0,0.75,0.761905,0.8,0.882353,0.838852,0.092937,1,0.5,0.681818,0.727273,0.380952,0.714286,0.600866,0.137063,1,0.979501,0.945187,0.915842,0.909664,0.957516,0.941542,0.026023,1
