In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import linear_model

# Fetch the iris dataset and unpack its feature matrix and label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Cell output: the shapes of the feature matrix and the label vector.
(X.shape, y.shape)

((150, 4), (150,))

In [2]:
# Hold out 40% of the samples for testing; the fixed seed makes the
# partition reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.4, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Print the shapes of the training pair, then of the test pair.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(90, 4) (90,)
(60, 4) (60,)


In [3]:
# Build the classifier first, then fit it on the training partition.
clf = linear_model.LogisticRegression(C=100, max_iter=10000)
clf.fit(X_train, y_train)

# Cell output: mean accuracy on the held-out test partition.
clf.score(X_test, y_test)

0.95

In [4]:
# Accuracy on the training partition, for comparison with the test score.
clf.score(X=X_train, y=y_train)

0.9888888888888889

In [3]:
from sklearn.model_selection import cross_val_score
# Unfitted estimator; cross_val_score is given the estimator together with
# the full data and the number of folds.
clf = linear_model.LogisticRegression(C=10, max_iter=10000)

# 5-fold cross-validation over the whole dataset with the default scorer.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5)

# Cell output: one score per fold.
scores

array([0.96666667, 1.        , 0.96666667, 0.93333333, 1.        ])

In [8]:
# Summarise the per-fold scores as mean and standard deviation,
# both rounded to two decimals.
summary_template = "%0.2f accuracy with a standard deviation of %0.2f"
mean_score, score_spread = scores.mean(), scores.std()
print(summary_template % (mean_score, score_spread))

0.97 accuracy with a standard deviation of 0.02


In [9]:
# Same estimator settings as the accuracy run, but each fold is scored
# with the 'precision_macro' metric.
clf = linear_model.LogisticRegression(C=10, max_iter=10000)
scores = cross_val_score(
    estimator=clf,
    X=X,
    y=y,
    cv=5,
    scoring='precision_macro',
)

# Cell output: one precision_macro value per fold.
scores

array([0.96969697, 1.        , 0.96969697, 0.93333333, 1.        ])

In [None]:
from sklearn.model_selection import cross_validate
# Metrics to evaluate together in a single cross_validate call.
scoring = ['precision_macro', 'recall_macro']

clf = linear_model.LogisticRegression(max_iter=1000)

# cross_validate returns a dict; its keys are listed as the cell output.
scores = cross_validate(estimator=clf, X=X, y=y, scoring=scoring)

# Iterating a dict yields its keys, so this sorts the key names.
sorted(scores)

['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']

In [12]:
# Display the full `scores` dict: timing arrays plus one array per requested
# metric (see the recorded output for the exact keys).
scores

{'fit_time': array([0.04045582, 0.05144167, 0.0393579 , 0.0485363 , 0.05150151]),
 'score_time': array([0.00246596, 0.00984311, 0.00212932, 0.00099492, 0.0095396 ]),
 'test_precision_macro': array([0.96969697, 1.        , 0.94444444, 0.96969697, 1.        ]),
 'test_recall_macro': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])}

In [13]:
# Pick out only the per-fold macro-precision values from the `scores` dict.
scores['test_precision_macro']

array([0.96969697, 1.        , 0.94444444, 0.96969697, 1.        ])

In [11]:
from sklearn.model_selection import cross_val_predict
# Unfitted estimator handed to cross_val_predict.
clf = linear_model.LogisticRegression(max_iter=1000)

# Cross-validated predictions for the samples in X (default fold settings).
predictions = cross_val_predict(estimator=clf, X=X, y=y)

# Cell output: the predicted class label for every sample.
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [16]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
import numpy as np

# Manual 5-fold cross-validation: fit a fresh LogisticRegression on each
# training fold and record its accuracy on the matching test fold.
X, y = load_iris(return_X_y=True)

# Shuffle with a fixed seed so the fold assignment is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold test accuracies; the next cell averages this list.
accuracies = []

# Number the folds from 1 for human-readable output.
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # Select the rows belonging to this fold's train / test partitions.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # A new model per fold, so no fitted state carries over between folds.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)

    # Fixed: the label was previously misspelled as "Accurady".
    print(f"Fold {fold}: Accuracy = {acc:.4f}")
    accuracies.append(acc)

Fold 1: Accurady = 1.0000
Fold 2: Accurady = 1.0000
Fold 3: Accurady = 0.9333
Fold 4: Accurady = 0.9667
Fold 5: Accurady = 0.9667


In [18]:
# Average the per-fold accuracies collected by the manual KFold loop and
# print the result after a blank line, with four decimals.
mean_accuracy = np.mean(accuracies)
print(f"\nAverage Accuracy: {mean_accuracy:.4f}")


Average Accuracy: 0.9733


In [19]:
from sklearn.model_selection import LeaveOneOut

# Small toy sample: the splitter yields index arrays into `a`.
a = [45, 76, 34, 32, 21]
loo = LeaveOneOut()

# Print the train indices followed by the test indices of every split.
for split in loo.split(a):
    train, test = split
    print(*split)

[1 2 3 4] [0]
[0 2 3 4] [1]
[0 1 3 4] [2]
[0 1 2 4] [3]
[0 1 2 3] [4]


In [23]:
from sklearn.model_selection import RepeatedKFold

# Nine toy samples split into 5 folds, repeated once with a fixed seed.
a = [45, 76, 34, 32, 21, 65, 43, 32, 21]
rkf = RepeatedKFold(n_splits=5, n_repeats=1, random_state=42)

# Materialise the splits first, then print each train/test index pair.
index_pairs = list(rkf.split(a))
for train, test in index_pairs:
    print(train, test)

[0 2 3 4 5 6 8] [1 7]
[1 2 3 4 6 7 8] [0 5]
[0 1 3 4 5 6 7] [2 8]
[0 1 2 5 6 7 8] [3 4]
[0 1 2 3 4 5 7 8] [6]


In [25]:
from sklearn.model_selection import LeavePOut

# Nine toy samples; every split holds out p=4 of them as the test set.
a = [45, 76, 34, 32, 21, 65, 43, 32, 21]
lpo = LeavePOut(p=4)

# Print the train indices followed by the test indices of every split.
splits = lpo.split(a)
for train, test in splits:
    print(train, test)

[4 5 6 7 8] [0 1 2 3]
[3 5 6 7 8] [0 1 2 4]
[3 4 6 7 8] [0 1 2 5]
[3 4 5 7 8] [0 1 2 6]
[3 4 5 6 8] [0 1 2 7]
[3 4 5 6 7] [0 1 2 8]
[2 5 6 7 8] [0 1 3 4]
[2 4 6 7 8] [0 1 3 5]
[2 4 5 7 8] [0 1 3 6]
[2 4 5 6 8] [0 1 3 7]
[2 4 5 6 7] [0 1 3 8]
[2 3 6 7 8] [0 1 4 5]
[2 3 5 7 8] [0 1 4 6]
[2 3 5 6 8] [0 1 4 7]
[2 3 5 6 7] [0 1 4 8]
[2 3 4 7 8] [0 1 5 6]
[2 3 4 6 8] [0 1 5 7]
[2 3 4 6 7] [0 1 5 8]
[2 3 4 5 8] [0 1 6 7]
[2 3 4 5 7] [0 1 6 8]
[2 3 4 5 6] [0 1 7 8]
[1 5 6 7 8] [0 2 3 4]
[1 4 6 7 8] [0 2 3 5]
[1 4 5 7 8] [0 2 3 6]
[1 4 5 6 8] [0 2 3 7]
[1 4 5 6 7] [0 2 3 8]
[1 3 6 7 8] [0 2 4 5]
[1 3 5 7 8] [0 2 4 6]
[1 3 5 6 8] [0 2 4 7]
[1 3 5 6 7] [0 2 4 8]
[1 3 4 7 8] [0 2 5 6]
[1 3 4 6 8] [0 2 5 7]
[1 3 4 6 7] [0 2 5 8]
[1 3 4 5 8] [0 2 6 7]
[1 3 4 5 7] [0 2 6 8]
[1 3 4 5 6] [0 2 7 8]
[1 2 6 7 8] [0 3 4 5]
[1 2 5 7 8] [0 3 4 6]
[1 2 5 6 8] [0 3 4 7]
[1 2 5 6 7] [0 3 4 8]
[1 2 4 7 8] [0 3 5 6]
[1 2 4 6 8] [0 3 5 7]
[1 2 4 6 7] [0 3 5 8]
[1 2 4 5 8] [0 3 6 7]
[1 2 4 5 7] [0 3 6 8]
[1 2 4 5 6

In [26]:
from sklearn.model_selection import ShuffleSplit
# NOTE: this rebinds X to ten consecutive integers for the demo.
X = np.arange(10)

# Five independent random splits, 25% test size, fixed seed.
ss = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)

# One line per split: train indices, a space, then test indices.
line_format = "%s %s"
for train_index, test_index in ss.split(X):
    index_pair = (train_index, test_index)
    print(line_format % index_pair)

[9 1 6 7 3 0 5] [2 8 4]
[2 9 8 0 6 7 4] [3 5 1]
[4 5 1 0 6 9 7] [2 3 8]
[2 7 5 8 0 3 4] [6 1 9]
[4 1 0 6 8 9 3] [5 2 7]


In [27]:
# Display X, which the ShuffleSplit cell above rebound to np.arange(10).
X

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [32]:
from sklearn.model_selection import StratifiedKFold, KFold
import numpy as np
# Toy imbalanced problem: 50 samples with one constant feature,
# labelled with 45 zeros followed by 5 ones.
X = np.ones((50, 1))
y = np.hstack(([0] * 45, [1] * 5))

# Shared report line: class counts in the train fold and in the test fold.
report_template = 'train -  {}   |   test -  {}'

# Stratified 3-fold split: print per-class counts for every fold.
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X, y):
    train_counts = np.bincount(y[train])
    test_counts = np.bincount(y[test])
    print(report_template.format(train_counts, test_counts))

# Blank line between the two reports.
print()

# Plain 3-fold split on the same data, reported the same way for comparison.
kf = KFold(n_splits=3)
for train, test in kf.split(X, y):
    train_counts = np.bincount(y[train])
    test_counts = np.bincount(y[test])
    print(report_template.format(train_counts, test_counts))

train -  [30  3]   |   test -  [15  2]
train -  [30  3]   |   test -  [15  2]
train -  [30  4]   |   test -  [15  1]

train -  [28  5]   |   test -  [17]
train -  [28  5]   |   test -  [17]
train -  [34]   |   test -  [11  5]


In [30]:
# Display the label vector built in the StratifiedKFold cell:
# 45 zeros followed by 5 ones.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1])