# Advanced Cross Validation 

Performing model evaluation with scikit-learn: a train/test split followed by k-fold cross-validation.

In [1]:
# Import pandas and NumPy; the scikit-learn pieces are imported in the cells that first use them
import pandas as pd
import numpy as np

## Generate Some Data

In [2]:
# Number of synthetic samples (rows) to generate
m = 10

In [3]:
# Draw an (m, 3) feature matrix of uniform random floats in [0, 1).
# No seed is set in the cells shown, so the values differ from run to run.
X_values = np.random.random(size=(m, 3))

In [4]:
# Display the generated feature matrix
X_values

array([[0.98749173, 0.67183366, 0.12754699],
       [0.80060893, 0.57611624, 0.83301417],
       [0.38739703, 0.64954404, 0.65436108],
       [0.52766839, 0.8338813 , 0.21859448],
       [0.36437211, 0.47358466, 0.48338047],
       [0.40121598, 0.83794171, 0.92113663],
       [0.98487475, 0.73635022, 0.08225336],
       [0.41306886, 0.21155562, 0.86512368],
       [0.72068926, 0.439894  , 0.83437099],
       [0.80696435, 0.90043223, 0.28814901]])

In [5]:
# Random target stored as an (m, 1) column, one value per row of X_values
y = np.random.random(size=(m, 1))

In [6]:
# Confirm that y is a plain NumPy array rather than a pandas object
type(y)

numpy.ndarray

In [7]:
# Wrap the raw feature array in a DataFrame (default integer row and column labels)
X = pd.DataFrame(data=X_values)

## Train Test Split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1278,
)

In [10]:
# Report the (rows, columns) shape of the features and targets in each partition
for label, features, target in (
    ("Training data:", X_train, y_train),
    ("Test data:", X_test, y_test),
):
    print(label, features.shape, target.shape)

Training data: (7, 3) (7, 1)
Test data: (3, 3) (3, 1)


In [11]:
# Training features; the row labels are the original row positions in X
X_train

Unnamed: 0,0,1,2
3,0.527668,0.833881,0.218594
7,0.413069,0.211556,0.865124
8,0.720689,0.439894,0.834371
1,0.800609,0.576116,0.833014
4,0.364372,0.473585,0.48338
0,0.987492,0.671834,0.127547
5,0.401216,0.837942,0.921137


In [12]:
# Held-out test features
X_test

Unnamed: 0,0,1,2
6,0.984875,0.73635,0.082253
9,0.806964,0.900432,0.288149
2,0.387397,0.649544,0.654361


In [13]:
# Training targets returned by the split for the rows of X_train
y_train

array([[0.09755054],
       [0.38553342],
       [0.51219606],
       [0.19306146],
       [0.17478774],
       [0.16579516],
       [0.72284833]])

In [14]:
# Held-out test targets
y_test

array([[0.75807163],
       [0.76864126],
       [0.85216641]])

## K-Fold Cross Validation

In [15]:
from sklearn.model_selection import KFold, cross_val_score

### Trivial Case to Show Split

In [16]:
# Toy sequence used only to show which positions KFold assigns to each fold.
# It has its own name so that the feature DataFrame `X` built earlier is not
# overwritten (re-running the train/test split cell would otherwise see a list).
toy_samples = ["a", "a", "a", "b", "b", "c", "c", "c", "c", "c"]
k_fold = KFold(n_splits=5)
# split() yields one (train indices, test indices) pair per fold
for train_indices, test_indices in k_fold.split(toy_samples):
    print('Train: %s | test: %s' % (train_indices, test_indices))

Train: [2 3 4 5 6 7 8 9] | test: [0 1]
Train: [0 1 4 5 6 7 8 9] | test: [2 3]
Train: [0 1 2 3 6 7 8 9] | test: [4 5]
Train: [0 1 2 3 4 5 8 9] | test: [6 7]
Train: [0 1 2 3 4 5 6 7] | test: [8 9]


### Training a Support Vector Classifier to classify the Digits Dataset

In [17]:
# Load the digits dataset that ships with scikit-learn
# (svm is imported here as well; it is used to build the classifier below)
from sklearn import datasets, svm
digits = datasets.load_digits()

In [18]:
# Pull the feature matrix and the target labels out of the loaded dataset
X_digits, y_digits = digits.data, digits.target

In [19]:
# (number of samples, number of features per sample)
X_digits.shape

(1797, 64)

In [20]:
# Support vector classifier with a linear kernel and regularization parameter C=1
svc = svm.SVC(kernel="linear", C=1)

In [21]:
# Fit on everything except the last 100 rows, which are held back for scoring.
# fit() returns the estimator itself, so `model` and `svc` refer to the same
# object: refitting `svc` in a later cell also changes `model`.
model = svc.fit(X_digits[:-100], y_digits[:-100])

In [22]:
# Score the fitted classifier on the 100 held-out rows
# (for a classifier, score() reports mean accuracy)
model.score(X_digits[-100:], y_digits[-100:])

0.98

In [23]:
# Five-fold splitter for the digits data (same configuration as the toy example above)
k_fold = KFold(n_splits=5)

In [24]:
# Manual k-fold loop: refit on each training fold, then score on the matching test fold.
# A bare expression inside a loop is not displayed by the notebook, so each
# score is stored explicitly and the collected scores are printed at the end.
fold_scores = []
for train, test in k_fold.split(X_digits):
    print(train.shape, test.shape)
    fold_score = svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test])
    fold_scores.append(fold_score)
# Printed as an array so it can be compared directly with the cross_val_score output
print(np.array(fold_scores))

(1437,) (360,)
(1437,) (360,)
(1438,) (359,)
(1438,) (359,)
(1438,) (359,)


In [25]:
# cross_val_score runs the same fit-and-score procedure for every fold of k_fold;
# n_jobs=-1 asks for the folds to be evaluated in parallel on all available CPU cores.
res = cross_val_score(svc, X_digits, y_digits, cv=k_fold, n_jobs=-1)

In [26]:
# One score per fold
print(res)

[0.96388889 0.92222222 0.9637883  0.9637883  0.93036212]


In [27]:
# Average score across the folds
res.mean()

0.9488099659548128