# K-Fold Cross Validation

In [31]:
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import datasets
from sklearn import svm

# Load scikit-learn's iris dataset; the cells below read its features from
# iris.data and its class labels from iris.target.
iris = datasets.load_iris()


A single train/test split is made easy with the train_test_split function in scikit-learn's model_selection module:

In [32]:
# Hold out 40% of the iris samples for testing; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.4,
    random_state=0,
)

# Create a linear-kernel SVC and train it on the training portion only.
clf = svm.SVC(C=1, kernel='linear')
clf.fit(X_train, y_train)

# Mean accuracy on the held-out test portion (displayed as the cell's output).
clf.score(X_test, y_test)

0.9666666666666667

K-Fold cross validation is just as easy; let's use a K of 5:

In [33]:
# cross_val_score takes the model, the full feature matrix, the true labels
# and the number of folds, and returns one score per fold.
scores = cross_val_score(clf, iris.data, iris.target, cv=5)

# Per-fold accuracy:
print(scores)

# Accuracy averaged over the 5 folds:
print(np.mean(scores))

[0.96666667 1.         0.96666667 0.96666667 1.        ]
0.9800000000000001


Our model is even better than we thought! Can we do better? Let's try a different kernel (poly):

In [34]:
# Swap in a degree-2 polynomial kernel and repeat the 5-fold evaluation.
clf = svm.SVC(C=1, degree=2, kernel='poly')
scores = cross_val_score(clf, iris.data, iris.target, cv=5)

# First line: accuracy of each fold; second line: their mean.
print(scores, np.mean(scores), sep="\n")

[0.96666667 1.         1.         0.96666667 1.        ]
0.9866666666666667


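Slightly, in this run: the polynomial kernel's mean accuracy (about 0.987) is a little higher than the linear kernel's (0.980). The gap amounts to a single sample across all five folds, so it is well within fold-to-fold variation, and a more complex kernel is always at risk of overfitting. Let's see what a single train/test split says about the same model: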

In [35]:
# Train a degree-2 polynomial SVC on the training portion of the earlier split.
clf = svm.SVC(C=1, degree=2, kernel='poly')
clf.fit(X_train, y_train)

# Mean accuracy on the held-out test portion (displayed as the cell's output).
clf.score(X_test, y_test)

0.95

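That's 0.95, slightly lower than the 0.9667 we got with a single train/test split on the linear kernel. With only 60 test samples the difference is a single misclassified flower, which is why one split is a much noisier estimate than the average over several folds.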

# Activity

In [133]:
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier

# Load the passenger data and keep only the label plus the features used below.
# .copy() makes `df` an independent frame, so the column assignments that follow
# modify it directly instead of a slice of the frame returned by read_csv.
df = pd.read_csv("titanic.csv", header=0)
df = df[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]].copy()

# Integer codes for the embarkation values; anything not listed (including
# missing values) is encoded as 0 by the map below.
emb = {"S": 1, "C": 2, "Q": 3}

# Fill missing ages with the mean age. The result is assigned back rather than
# calling fillna(..., inplace=True) on df["Age"]: that form is chained
# assignment, which emits a warning on pandas 2.x and leaves `df` unchanged
# under copy-on-write.
# NOTE(review): the mean is computed over all rows before cross-validation, so
# held-out folds influence the imputed value — consider imputing per fold.
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Embarked"] = df["Embarked"].map(lambda x: emb[x] if x in emb else 0)
df["Sex"] = df["Sex"].map(lambda x: 1 if "female" in x else 0)

# Features are every column after the first; the first column ("Survived") is the label.
data = df[df.columns[1:]]
target = df["Survived"]

# 10 shuffled folds with a fixed seed, shared by the model evaluations below
# so that each model is scored on identical splits.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)

### KFoldCrossValidation for Random Forest

In [135]:
# Random forest with 10 trees. random_state is fixed so that, together with the
# seeded `kfold` splitter, the reported accuracy is reproducible on re-run;
# without it the forest is built with fresh randomness each time and the
# printed numbers drift between runs.
clf = RandomForestClassifier(n_estimators=10, random_state=7)

# One accuracy score per fold defined by `kfold`.
scores = cross_val_score(clf, data, target, cv=kfold)

# Report the mean accuracy and its standard deviation across folds, as percentages.
print("Accuracy: %.2f%% (%.2f%%)" % (scores.mean()*100, scores.std()*100))

Accuracy: 80.81% (5.58%)


### KFoldCrossValidation for XGBoost

In [136]:
import xgboost

# XGBoost classifier constructed with its default arguments.
model = xgboost.XGBClassifier()

# Evaluate it with the seeded `kfold` splitter; one accuracy score per fold.
results = cross_val_score(estimator=model, X=data, y=target, cv=kfold)

# Report the mean accuracy and its standard deviation across folds, as percentages.
print("Accuracy: %.2f%% (%.2f%%)" % (100 * results.mean(), 100 * results.std()))

Accuracy: 81.03% (4.60%)
