In [75]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
import seaborn as sns
import pandas as pd

In [36]:
N = 30
k = 4
k_offset = 5
b = 0.4

K = k + k_offset
B = np.array([b] * k_offset + [0] * k)

In [47]:
X = np.random.multivariate_normal(np.zeros(K), np.eye(K), size=N)
p = 1 / (1 + np.exp(-np.dot(X, B)))
y = np.random.binomial(1, p=p)

In [48]:
y

array([0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 1])

### Whole dataset

In [60]:
lr = LogisticRegression()
lr.fit(X, y)
y_pred_lr = lr.predict(X)
print(confusion_matrix(y, y_pred_lr))

dt = DecisionTreeClassifier()
dt.fit(X, y)
y_pred_dt = dt.predict(X)
print(confusion_matrix(y, y_pred_dt))

[[19  1]
 [ 5  5]]


array([[20,  0],
       [ 0, 10]])

### 10-fold

In [76]:
kf = KFold(n_splits=10)
kfold_data = {
    'tp': {'lr': [], 'dt': []},
    'tn': {'lr': [], 'dt': []},
    'fp': {'lr': [], 'dt': []},
    'fn': {'lr': [], 'dt': []}
}
for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    for model_class, name in [(LogisticRegression, 'lr'), (DecisionTreeClassifier, 'dt')]:
        model = model_class()
        model.fit(X_train, y_train)
        y_pred = np.round(model.predict(X_test))
        tp = np.sum((y_test == 1) & (y_test == y_pred))
        tn = np.sum((y_test == 1) & (y_test == y_pred))
        fp = np.sum((y_pred == 1) & (y_test != y_pred))
        fn = np.sum((y_pred == 1) & (y_test != y_pred))
        kfold_data['tp'][name].append(tp)
        kfold_data['tn'][name].append(tn)
        kfold_data['fp'][name].append(fp)
        kfold_data['fn'][name].append(fn)
kfold_data = pd.DataFrame(kfold_data)

In [69]:
sns.boxplot(kfold_data, x='')

array([False, False, False])

Unnamed: 0,tp,tn,fp,fn
lr,"[0, 0, 0, 0, 0, 1, 0, 1, 1, 1]","[0, 0, 0, 0, 0, 1, 0, 1, 1, 1]","[1, 2, 1, 0, 2, 1, 0, 0, 1, 0]","[1, 2, 1, 0, 2, 1, 0, 0, 1, 0]"
dt,"[0, 0, 0, 1, 0, 0, 1, 1, 1, 1]","[0, 0, 0, 1, 0, 0, 1, 1, 1, 1]","[1, 2, 0, 0, 2, 2, 1, 0, 1, 0]","[1, 2, 0, 0, 2, 2, 1, 0, 1, 0]"


In [78]:
np.sum((y_test == 1) & (y_test == y_pred))

np.int64(1)

In [83]:
kfold_data.reset_index()

Unnamed: 0,index,tp,tn,fp,fn
0,lr,"[0, 0, 0, 0, 0, 1, 0, 1, 1, 1]","[0, 0, 0, 0, 0, 1, 0, 1, 1, 1]","[1, 2, 1, 0, 2, 1, 0, 0, 1, 0]","[1, 2, 1, 0, 2, 1, 0, 0, 1, 0]"
1,dt,"[0, 0, 0, 1, 0, 0, 1, 1, 1, 1]","[0, 0, 0, 1, 0, 0, 1, 1, 1, 1]","[1, 2, 0, 0, 2, 2, 1, 0, 1, 0]","[1, 2, 0, 0, 2, 2, 1, 0, 1, 0]"
