In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import Point
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, cross_val_predict, cross_validate
from sklearn.metrics import balanced_accuracy_score, accuracy_score, roc_auc_score, plot_confusion_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import scale
import seaborn as sns
import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

sns.set_theme()

In [2]:
X, y = make_classification(n_samples=500, n_features=20, n_classes=3, random_state=1500, n_informative=10, n_redundant=10)

In [7]:
estimators = [
    {
        "name": "SVC",
        "estimator": SVC(kernel="rbf", probability=True, random_state=1500),
        "params_grid": {"C": [1, 10, 100], "gamma": [.01, .1]}
    },
    {
        "name": "DecisionTree",
        "estimator": DecisionTreeClassifier(random_state=1500),
        "params_grid": {'max_depth': [8, 80, 200, 500, None], 'min_samples_leaf': [1], 'class_weight': ['balanced']}
    },
    {
        "name": "RandomForest",
        "estimator": RandomForestClassifier(n_jobs=-1, random_state=1500),
        "params_grid": {'max_depth': [8, 80, 200, 500, None], 'min_samples_leaf': [1], 'class_weight': ['balanced']}
    },
    {
        "name": "LightGBM",
        "estimator": lgb.LGBMClassifier(n_jobs=1, objective="multiclass", random_state=1500),
        "params_grid": {'max_depth': [8, 80, 200, -1], 'class_weight': ['balanced']}
    }
]

In [4]:
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=44540570)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=45405450)

Nested cross validation "cl√°sico"

In [8]:
scores = {}
predictions = {}

for estimator in estimators:
    clf = GridSearchCV(
        estimator=estimator["estimator"],
        param_grid=estimator["params_grid"],
        cv=inner_cv,
        n_jobs=-1,
        scoring=['accuracy', 'roc_auc_ovo'],
        refit='accuracy'
    )
    
    nested_score = cross_validate(
        clf,
        X=X,
        y=y,
        cv=outer_cv,
        n_jobs=-1,
        scoring={
            'accuracy': 'accuracy',
            'balanced_accuracy': 'balanced_accuracy'
        }
    )
    
    nested_predict = cross_val_predict(
        clf,
        X=X,
        y=y,
        cv=outer_cv,
        n_jobs=-1,
        method='predict'
    )
        
    scores[estimator["name"]] = nested_score
    predictions[estimator["name"]] = nested_predict

In [9]:
scores

{'SVC': {'fit_time': array([0.45896983, 0.57016921, 0.49191403, 0.61561823, 0.61794615]),
  'score_time': array([0.00272274, 0.00263548, 0.00269818, 0.00197721, 0.00201488]),
  'test_accuracy': array([0.89, 0.8 , 0.85, 0.87, 0.79]),
  'test_balanced_accuracy': array([0.88948307, 0.79946524, 0.84937611, 0.87017231, 0.78995841])},
 'DecisionTree': {'fit_time': array([0.10182309, 0.11965895, 0.11392498, 0.10895419, 0.13918161]),
  'score_time': array([0.00087643, 0.00063491, 0.00067782, 0.00103617, 0.00065851]),
  'test_accuracy': array([0.63, 0.67, 0.73, 0.76, 0.69]),
  'test_balanced_accuracy': array([0.62923351, 0.66993464, 0.72935235, 0.76024955, 0.69043375])},
 'RandomForest': {'fit_time': array([2.86810422, 2.82535076, 2.86330771, 2.98826051, 2.93360996]),
  'score_time': array([0.02125502, 0.02839255, 0.03673148, 0.01135707, 0.01746297]),
  'test_accuracy': array([0.9 , 0.74, 0.84, 0.8 , 0.78]),
  'test_balanced_accuracy': array([0.89928699, 0.73915627, 0.8392751 , 0.80065359, 0.77

In [11]:
outer_results = list()
positions = list()

for train_ix, test_ix in outer_cv.split(X, y):
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]
    
    clf_t = GridSearchCV(
        estimator=svm,
        param_grid={"C": [1, 10, 100], "gamma": [.01, .1]},
        cv=inner_cv,
        n_jobs=-1,
        scoring=['accuracy', 'roc_auc_ovo'],
        refit='accuracy'
    )    
    
    result = clf_t.fit(X_train, y_train)
    
    best_model = result.best_estimator_
    yhat = best_model.predict(X_test)
    #acc = accuracy_score(y_test, yhat)

    positions.append(test_ix)
    outer_results.append(yhat)

NameError: name 'svm' is not defined

In [7]:
b = np.array([])
for result in positions:
    b = np.concatenate((b, result), axis=0)
    
b = b.astype(int)

In [8]:
a = np.array([])
for result in outer_results:
    a = np.concatenate((a, result), axis=0)
    
a = a.astype(int)

In [9]:
accuracy_score(a[b.argsort()], y)

0.9691

In [10]:
accuracy_score(nested_predict, y)

0.9691

In [11]:
clf = GridSearchCV(
    estimator=svm,
    param_grid={"C": [1, 10, 100], "gamma": [.01, .1]},
    cv=inner_cv,
    n_jobs=-1,
    scoring=['accuracy', 'roc_auc_ovo'],
    refit='accuracy'
)

nested_score = cross_val_score(
    clf,
    X=X,
    y=y,
    cv=outer_cv,
    n_jobs=-1,
    scoring='accuracy'
)

In [12]:
nested_score

array([0.9755, 0.969 , 0.9665, 0.963 , 0.9715])

In [13]:
np.mean(nested_score)

0.9691000000000001

In [14]:
# Final model

In [15]:
clf.fit(X, y).best_score_

0.9672004093030778