# Importar dependencias

In [1]:
import numpy as np

from sklearn.datasets import make_classification

from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.ensemble import RandomForestClassifier

# Preparar dados

In [2]:
SEED=314

In [3]:
X, y = make_classification(n_samples=150, n_classes=2,
                           n_features=5, n_informative=3, n_redundant=0,
                           random_state=SEED)

In [4]:
X, X_test, y, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED)

# Model

In [5]:
model = RandomForestClassifier(random_state=SEED)

In [6]:
K=5
kf = StratifiedKFold(n_splits=K, shuffle=True, random_state=SEED)

# Calcular score via `cross_val_score`
---

Nota: Apenas 1 métrica é permitida

In [7]:
cv_score = cross_val_score(estimator=model, 
                            X=X, y=y, cv=kf, scoring='roc_auc')

In [8]:
# output
cv_score

array([0.88461538, 0.78819444, 0.92361111, 0.87152778, 0.94444444])

In [9]:
print(f"AUC mean: {cv_score.mean()} \nAUC std: {cv_score.std()}")

AUC mean: 0.8824786324786326 
AUC std: 0.053936328560021486


In [10]:
model.fit(X, y);

y_pred = model.predict_proba(X_test)[:, 1]

roc_auc_score(y_test, y_pred)

0.9577777777777778

# Calcular scores via `cross_validate` 
---
Nota: Aceita mais de uma métrica

In [11]:
cv_scores = cross_validate(estimator=model, 
                            X=X, y=y, cv=kf, scoring=['roc_auc', 'neg_log_loss'])

In [12]:
# output
cv_scores

{'fit_time': array([0.1197176 , 0.1107018 , 0.11868072, 0.11473441, 0.11376762]),
 'score_time': array([0.00897169, 0.00897837, 0.00897789, 0.00794411, 0.00894237]),
 'test_roc_auc': array([0.88461538, 0.78819444, 0.92361111, 0.87152778, 0.94444444]),
 'test_neg_log_loss': array([-0.43658245, -0.5398032 , -0.42669698, -0.4932852 , -0.36195774])}

In [13]:
print(f"AUC mean: {cv_scores['test_roc_auc'].mean()} \nAUC std: {cv_scores['test_roc_auc'].std()}", 
      f"\nLogLoss mean: {-cv_scores['test_neg_log_loss'].mean()} \nLogLoss std: {cv_scores['test_neg_log_loss'].std()}")

AUC mean: 0.8824786324786326 
AUC std: 0.053936328560021486 
LogLoss mean: 0.4516651133509229 
LogLoss std: 0.06066112782136453


In [14]:
model.fit(X, y);

y_pred = model.predict_proba(X_test)[:, 1]

roc_auc_score(y_test, y_pred)

0.9577777777777778

# Calcular scores via loop for
---

Nota: Assim, torna-se possível fazer o kfold predict

In [15]:
oof = np.zeros(X.shape[0])
y_pred = np.zeros(X_test.shape[0])
train_scores = np.zeros(K)

for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(f"➜ FOLD :{fold}")
    
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_val = X[val_idx]
    y_val = y[val_idx]
    
    model = RandomForestClassifier()
    
    model.fit(X_train, y_train)
    
    oof[val_idx] = model.predict_proba(X_val)[:, 1]
    y_pred += model.predict_proba(X_test)[:, 1] / K
    
    train_scores[fold] = roc_auc_score(y_test, y_pred)
    
print(f"Final AUC mean:", train_scores.mean())
print(f"Final AUC oof:", roc_auc_score(y, oof))

➜ FOLD :0
➜ FOLD :1
➜ FOLD :2
➜ FOLD :3
➜ FOLD :4
Final AUC mean: 0.9591111111111111
Final AUC oof: 0.8738538482911921


In [16]:
roc_auc_score(y_test, y_pred)

0.9622222222222222