In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

from pathlib import Path

In [21]:
from sklearn.preprocessing import LabelEncoder
def load_dataset(dataset_id):
    """Load a dataset from disk and split it into features and encoded labels.

    Reads ``data/uci/<dataset_id>/data.txt`` with ``pd.read_csv`` using its
    default settings. The first (N - 1) columns are treated as features and
    the last column as the target value.

    Parameters
    ----------
    dataset_id
        Identifier interpolated into the dataset directory name.

    Returns
    -------
    X : np.ndarray
        Feature matrix holding every column except the last.
    y : np.ndarray
        Target column encoded as integer class codes by ``LabelEncoder``.
    """
    path = Path(f'data/uci/{dataset_id}')
    df = pd.read_csv(path / 'data.txt')
    # Convert features and target separately: calling to_numpy() on the whole
    # frame coerces all columns to one common dtype, so a string-valued target
    # column would turn the numeric features into an object array.
    X = df.iloc[:, :-1].to_numpy()
    y = df.iloc[:, -1].to_numpy()
    return X, LabelEncoder().fit_transform(y)

In [22]:
from sklearn.model_selection import train_test_split

# Load dataset 43, then hold out 20% of its first ten rows as a test split.
# The fixed random_state keeps the split reproducible across runs.
X, y = load_dataset(43)
X_train, X_test, y_train, y_test = train_test_split(
    X[:10],
    y[:10],
    test_size=0.2,
    random_state=0,
)
X_train.shape

(8, 3)

In [23]:
# Shuffled, stratified 4-fold splitter; the fixed seed keeps the fold
# assignment reproducible between runs.
kfold = StratifiedKFold(
    n_splits=4,
    shuffle=True,
    random_state=1,
)

In [24]:
# Collect every index into X_train that is misclassified while it sits in the
# held-out ("calibration") fold of the stratified K-fold split.
all_incorrect = set()
for data_split_idx, (train_idx, calibration_idx) in enumerate(kfold.split(X_train, y_train)):
    # Fit a fresh shallow tree on the remaining folds only.
    dt = DecisionTreeClassifier(max_depth=2, random_state=0)
    dt.fit(X_train[train_idx], y_train[train_idx])

    # Predict the held-out fold once and reuse the result for both the
    # printout and the comparison below.
    calibration_predictions = dt.predict(X_train[calibration_idx])
    calibration_truth = y_train[calibration_idx]
    print('Calibration idx        ', calibration_idx)
    print('Calibration predictions', calibration_predictions)
    print('Ground truth           ', calibration_truth)

    # Despite the "idx" in its name this is a boolean mask over the fold;
    # applying it to calibration_idx maps the misses back to X_train positions.
    wrong_idx_within_calibration = calibration_predictions != calibration_truth
    wrong_idx_within_training = calibration_idx[wrong_idx_within_calibration]
    print('Incorrect indices this pass', wrong_idx_within_training)
    all_incorrect.update(wrong_idx_within_training)

print('All incorrect indices', all_incorrect)

Calibration idx         [2 3]
Calibration predictions [0 1]
Ground truth            [0 0]
Incorrect indices this pass [3]
Calibration idx         [1 6]
Calibration predictions [1 0]
Ground truth            [0 0]
Incorrect indices this pass [1]
Calibration idx         [5 7]
Calibration predictions [1 1]
Ground truth            [0 0]
Incorrect indices this pass [5 7]
Calibration idx         [0 4]
Calibration predictions [0 0]
Ground truth            [0 1]
Incorrect indices this pass [4]
All incorrect indices {np.int64(1), np.int64(3), np.int64(4), np.int64(5), np.int64(7)}




In [32]:
# Show the misclassified indices as a list (elements stay NumPy integers).
[*all_incorrect]

[np.int64(1), np.int64(3), np.int64(4), np.int64(5), np.int64(7)]

In [33]:
# Same indices as a NumPy array, which can be used for fancy indexing.
np.array([*all_incorrect])

array([1, 3, 4, 5, 7])

In [35]:
# Boolean mask over positions 0-9 marking the misclassified indices.
# Built from `all_incorrect` directly instead of the `_33` output-cache
# variable, which only exists if a cell received execution count 33 in the
# current kernel session and therefore breaks on Restart & Run All.
np.isin(np.arange(10), list(all_incorrect))

array([False,  True, False,  True,  True,  True, False,  True, False,
       False])

In [36]:
# Display the raw set of misclassified training indices gathered by the K-fold loop.
all_incorrect

{np.int64(1), np.int64(3), np.int64(4), np.int64(5), np.int64(7)}

In [37]:
# Length-10 float vector of zeros, used as a scratch indicator array.
test = np.zeros(10)
test

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [40]:
# Flag the misclassified positions with 1. Indexing with a plain list keeps
# this valid when `all_incorrect` is empty: np.array([]) would be a float64
# array, which NumPy rejects as an index.
test[list(all_incorrect)] = 1
test

array([0., 1., 0., 1., 1., 1., 0., 1., 0., 0.])