# k-Fold Cross-Validation

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.svm import SVC
import matplotlib.pyplot as plt

In [2]:
iris = datasets.load_iris()
data = iris.data

- Apply k-Fold Cross Validation to identify the optimal parameters for C and gamma for a support vector machine classifier

In [3]:
def train_test_split(X, y, shuffle=True, test_size=0.3):
    """
    Split data into training and testing set.
    """
    n_train = int(X.shape[0]*(1-test_size))
    indices = np.arange(len(X))
    if shuffle: 
        np.random.shuffle(indices)
    train = indices[:n_train]
    test = indices[n_train:]
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    
    return X_train, X_test, y_train, y_test

In [4]:
# train test split
X, y = iris.data, iris.target
# use custom train test split function
X_train,X_test,y_train,y_test=train_test_split(X=X, y=y, shuffle=True, test_size=0.3)

In [5]:
# function to compute accuracy
def get_accuracy(y_pred, y_test):
    """
    Takes y_test and y_pred as input and
    returns accuracy.
    """
    n_accurate = (y_pred == y_test).sum()
    n_total = len(y_test)
    accuracy = n_accurate / n_total
    return accuracy

In [6]:
# function to prepare k-fold indices
def prepare_k_fold(X,n_splits=5, shuffle=True):
    """
    Takes dataset as input and returns k chunks
    of indices which can be used for k-fold
    cross validation.
    """
    # generate index with length of X_train
    indices = np.arange(len(X))
    if shuffle:
        np.random.shuffle(indices)
    # split index into k chunks
    k_chunks = np.array_split(indices, n_splits)
    
    return k_chunks

In [7]:
# lists of possible C and gamam values
C_list = [0.001,0.01,0.1,1,10]
gamma_list = [0.01, 0.1, 1, 10]
# define k
k_fold = 5
# initiate list to store average accuracy
accuracy_list = []
np.random.seed(42)

In [8]:
# iterate over C_list
for C in C_list:
    # iterate over gamma_list
    for gamma in gamma_list:

        # instantiate classifier with respective C and gamma
        classifier = SVC(kernel='rbf', C=C, gamma=gamma, random_state=42)
        # initiate 5 fold cross validation; splits data into sub train test
        k_chunks = prepare_k_fold(X=X_train, n_splits=k_fold, shuffle=True)
        
        # run k-fold cross validation
        scores = [] # empty list to store the k accuracies in order to compute the mean
        # iterate over chunks
        for chunk in k_chunks:
            # all train indices
            indices = np.arange(len(X_train))
            # take values from chunk as sub test index
            sub_test_index = chunk
            # take values which are not in chunk as sub train index
            sub_train_index = np.array([i for i in indices if i not in chunk])
            # fit model on train subset
            classifier.fit(X_train[sub_train_index], y_train[sub_train_index])
            # predict on test subset (chunk)
            sub_y_pred = classifier.predict(X_train[sub_test_index])
            # true values (chunk)
            sub_y_test = y_train[sub_test_index]
            # call get_accuracy function
            accuracy_score = get_accuracy(y_pred=sub_y_pred, y_test=sub_y_test)
            #append to list of accuracies of respective chunk
            scores.append(accuracy_score)
            
        # take mean of scores of respective chunk
        accuracy = np.mean(scores)
        #store result to accuracy_list
        result = np.array([C,gamma,accuracy])
        accuracy_list.append(result)

In [9]:
# create df with accuracy results
cross_val_results = pd.DataFrame(accuracy_list, columns=['C', 'gamma', 'accuracy'])
cross_val_results

Unnamed: 0,C,gamma,accuracy
0,0.001,0.01,0.295238
1,0.001,0.1,0.314286
2,0.001,1.0,0.238095
3,0.001,10.0,0.438095
4,0.01,0.01,0.228571
5,0.01,0.1,0.333333
6,0.01,1.0,0.27619
7,0.01,10.0,0.228571
8,0.1,0.01,0.590476
9,0.1,0.1,0.895238


In [10]:
# values which scored the highest accuracy
best_values = cross_val_results.loc[cross_val_results.accuracy.argmax()]
best_values

C           0.100000
gamma       1.000000
accuracy    0.971429
Name: 10, dtype: float64

In [11]:
# instantiate model using best values
classifier = SVC(kernel='rbf', 
                 C=best_values.C, # best C
                 gamma=best_values.gamma, # best gamma
                 random_state=42)

# fit on entire train data
classifier.fit(X_train, y_train)
# predict test data
y_pred = classifier.predict(X_test)

In [12]:
# apply get_accuracy function
acc = get_accuracy(y_pred, y_test)
print("Accuracy of optimized SVC model {:.2%}".format(acc))

Accuracy of optimized SVC model 93.33%
