# Cross-Validation

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

def _load_boston_compat():
    """Return the Boston housing data as an object exposing `.data` and `.target`.

    Uses `datasets.load_boston()` when the installed scikit-learn still ships it.
    That loader was deprecated in scikit-learn 1.0 and removed in 1.2, so on
    newer installs the arrays are rebuilt from the original StatLib text file
    (network access required), following the recipe published in scikit-learn's
    own deprecation notice.

    Note: scikit-learn dropped this dataset because of an ethical problem with
    one of its features; prefer e.g. `fetch_california_housing` for new work.
    """
    try:
        return datasets.load_boston()
    except (ImportError, AttributeError):
        # Function-scope imports: only needed on the fallback path.
        import pandas as pd
        from sklearn.utils import Bunch

        raw = pd.read_csv(
            "http://lib.stat.cmu.edu/datasets/boston",
            sep=r"\s+",
            skiprows=22,
            header=None,
        ).values
        # Each record is wrapped over two physical lines: 11 values on the
        # first line, then the last 2 features plus the target on the second.
        data = np.hstack([raw[::2, :], raw[1::2, :2]])
        target = raw[1::2, 2]
        return Bunch(data=data, target=target)


boston = _load_boston_compat()
# Expect 506 samples with 13 features each, and one target value per sample.
boston.data.shape, boston.target.shape

((506, 13), (506,))

In [3]:
# Hold out 40% of the samples for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, test_size=0.4, random_state=0
)

# Sanity-check the split. A bare `X_train.shape, y_train.shape` expression in the
# middle of a cell is evaluated and silently discarded (only the last expression
# of a cell is displayed), so the checks are written as assertions instead.
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]
assert X_train.shape[0] + X_test.shape[0] == boston.data.shape[0]

# Fit a linear-kernel SVR on the training part and evaluate on the held-out part.
# For a regressor, `.score` is R^2 (coefficient of determination), not an accuracy.
regression = svm.SVR(kernel='linear', C=1).fit(X_train, y_train)
regression.score(X_test, y_test)

0.667431382173115

## Computing Cross-Validation Metrics

In [4]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of a fresh (unfitted) linear SVR on the full dataset.
# No `scoring` is given, so each fold is scored with the estimator's own
# `.score`, which for a regressor is R^2.
regression = svm.SVR(C=1, kernel='linear')
scores = cross_val_score(
    estimator=regression,
    X=boston.data,
    y=boston.target,
    cv=5,
)
scores

array([0.77285459, 0.72771739, 0.56131914, 0.15056451, 0.08212844])

In [6]:
# Mean fold score with a band of two standard deviations.
# NOTE: the label says "Accuracy", but `scores` holds per-fold R^2 values
# from a regressor, so read the number as mean R^2.
print('Accuracy: %0.2f (+/- %0.2f)' % (np.mean(scores), 2 * np.std(scores)))

Accuracy: 0.46 (+/- 0.58)


In [7]:
from sklearn import metrics
# Same 5-fold run, scored with mean squared error. scikit-learn scorers follow
# a "higher is better" convention, so the MSE is reported negated: values
# closer to zero are better.
scores = cross_val_score(
    estimator=regression,
    X=boston.data,
    y=boston.target,
    scoring='neg_mean_squared_error',
    cv=5,
)
scores

array([ -7.84451123, -24.78772444, -35.13272326, -74.50555945,
       -24.40465975])

## K-Fold

In [8]:
import numpy as np
from sklearn.model_selection import KFold

# Toy example: split four items into two folds and show the index arrays.
X = list('abcd')
kf = KFold(n_splits=2)
# `split` yields one (train indices, test indices) pair per fold.
for train, test in kf.split(X):
    fold_indices = (train, test)
    print('%s %s' % fold_indices)

[2 3] [0 1]
[0 1] [2 3]


## Stratified K-Fold

In [9]:
from sklearn.model_selection import StratifiedKFold

# Toy example with an imbalanced target: four samples of class 0, six of class 1.
# The feature values are irrelevant here; stratification is driven by `y`.
X = np.ones(10)
y = [0] * 4 + [1] * 6
skf = StratifiedKFold(n_splits=3)
# Print the (train indices, test indices) pair of every fold.
for train, test in skf.split(X, y):
    fold_indices = (train, test)
    print('%s %s' % fold_indices)

[2 3 6 7 8 9] [0 1 4 5]
[0 1 3 4 5 8 9] [2 6 7]
[0 1 2 4 5 6 7] [3 8 9]


In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.pipeline import make_pipeline

# Chain preprocessing and model into one estimator:
# standardise features -> keep 2 principal components -> linear SVR.
pipe_svm = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    svm.SVR(C=1, kernel='linear'),
)
pipe_svm.fit(X_train, y_train)

# Predictions on the held-out set (not used further in this cell).
y_pred = pipe_svm.predict(X_test)

# NOTE: labelled "Accuracy", but a regression pipeline's `.score` is R^2.
print('Test Accuracy: %.3f' % (pipe_svm.score(X_test, y_test),))

Test Accuracy: 0.391


In [13]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the whole pipeline on the training split only,
# run in a single process. Per-fold values are R^2 despite the "Accuracy" label.
scores = cross_val_score(pipe_svm, X_train, y_train, cv=10, n_jobs=1)
print('CV Accuracy Scores: %s' % (scores,))

CV Accuracy Scores: [0.63971176 0.43579197 0.46977821 0.25027246 0.5124364  0.26221374
 0.30877195 0.54528563 0.37810066 0.47313549]


In [14]:
# Mean and (one) standard deviation of the per-fold scores computed above.
print('CV Accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

CV Accuracy: 0.428 +/- 0.121


In [16]:
# Re-split with a smaller, 20% test set (same seed), rebinding the train/test names.
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, test_size=0.2, random_state=0
)

# Sanity-check the split. A bare `X_train.shape, y_train.shape` expression in the
# middle of a cell is evaluated and silently discarded (only the last expression
# of a cell is displayed), so the checks are written as assertions instead.
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]
assert X_train.shape[0] + X_test.shape[0] == boston.data.shape[0]

# Refit the linear-kernel SVR on the new training part and evaluate on the new
# held-out part. `.score` is R^2 for a regressor, not an accuracy.
regression = svm.SVR(kernel='linear', C=1).fit(X_train, y_train)
regression.score(X_test, y_test)

0.5159547464307443

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.pipeline import make_pipeline

# Rebuild the scaler -> 2-component PCA -> linear SVR pipeline and fit it on
# whatever X_train / y_train currently hold.
pipe_svm = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    svm.SVR(C=1, kernel='linear'),
)
pipe_svm.fit(X_train, y_train)

# Predictions on the held-out set (not used further in this cell).
y_pred = pipe_svm.predict(X_test)

# NOTE: labelled "Accuracy", but a regression pipeline's `.score` is R^2.
print('Test Accuracy: %.3f' % (pipe_svm.score(X_test, y_test),))

Test Accuracy: 0.146


In [18]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the pipeline on the current training split,
# followed by the mean and standard deviation across folds.
# Values are R^2 despite the "Accuracy" wording in the printed labels.
scores = cross_val_score(pipe_svm, X_train, y_train, cv=10, n_jobs=1)
print('CV Accuracy Scores: %s' % (scores,))
print('CV Accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

CV Accuracy Scores: [0.65981167 0.61709376 0.438185   0.46467199 0.3370698  0.36278883
 0.27798527 0.30885508 0.38347668 0.53835719]
CV Accuracy: 0.439 +/- 0.124
