# Stratified K-Fold

## Importing required libraries

In [1]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate

## Load data 

In [2]:
df = pd.read_csv('../data/breast_cancer_cat.csv')

In [3]:
df.head(10)

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
0,2,2,2,0,1,3,1,2,0,1
1,3,0,2,0,0,1,1,0,0,0
2,3,0,6,0,0,2,0,1,0,1
3,2,2,6,0,1,3,1,1,1,0
4,2,2,5,4,1,2,0,4,0,1
5,3,2,4,4,0,2,1,2,1,0
6,3,0,7,0,0,3,0,2,0,0
7,2,2,1,0,0,2,0,2,0,0
8,2,2,0,0,0,2,1,3,0,0
9,2,0,7,2,1,2,1,2,1,0


## Machine Learning 

### Define Output and Inputs

In [4]:
y = df['Class']
X = df.drop('Class', axis = 1)

In [5]:
X.shape, y.shape

((286, 9), (286,))

### Split dataset for Stratified KFold

In [6]:
skf = StratifiedKFold(n_splits=2)

In [7]:
indices_list = list(skf.split(X,y))
indices_list

[(array([143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
         156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
         169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
         182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
         195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
         208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
         221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
         234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
         247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
         260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
         273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285]),
  array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
          13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
          26,  27,  28,  29,  30,  3

In [8]:
X_train1, y_train1 = X.loc[indices_list[0][0]], y[indices_list[0][0]]
X_test1, y_test1 = X.loc[indices_list[0][1]], y[indices_list[0][1]]
X_train2, y_train2 = X.loc[indices_list[1][0]], y[indices_list[1][0]]
X_test2, y_test2 = X.loc[indices_list[1][1]], y[indices_list[1][1]]

In [9]:
X_train1.shape, y_train1.shape

((143, 9), (143,))

In [10]:
X_test1.shape, y_test1.shape

((143, 9), (143,))

In [11]:
X_train2.shape, y_train2.shape

((143, 9), (143,))

In [12]:
X_test2.shape, y_test2.shape

((143, 9), (143,))

### Logisitic Regression Model

##### First model

In [13]:
reg = LogisticRegression(solver='liblinear', C=1).fit(X_train1, y_train1)
reg.score(X_test1, y_test1)

0.7062937062937062

##### Second model

In [14]:
reg = LogisticRegression(solver='liblinear', C=1).fit(X_train2, y_train2)
reg.score(X_test2, y_test2)

0.6923076923076923

#### Cross Validation Score

In [15]:
clf = LogisticRegression(solver='liblinear', C=1)
cross_val_score(clf, X,y, cv=2)

array([0.70629371, 0.69230769])

### Precision & Recall

In [16]:
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [17]:
scoring = ['precision', 'recall']
cross_validate(clf, X, y, scoring=scoring, cv=2, return_train_score=True)

{'fit_time': array([0.00955367, 0.00601792]),
 'score_time': array([0.01209593, 0.00854445]),
 'test_precision': array([0.52380952, 0.46666667]),
 'train_precision': array([0.65217391, 0.65217391]),
 'test_recall': array([0.25581395, 0.33333333]),
 'train_recall': array([0.35714286, 0.34883721])}