# Cross validation methods
#### Importing the required libraries:

In [1]:
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split, RepeatedStratifiedKFold
import numpy as np
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Single seed reused by every splitter and by the random forest so results are reproducible.
seed = 123


#### Loading the breast cancer dataset, storing the features in X and the target labels in y.
+ rf = Random forest classifier
+ nb = Naive Bayes classifier

In [2]:
# return_X_y=True yields the (features, target) pair directly instead of a Bunch object.
X, y = load_breast_cancer(return_X_y=True)

# Classifiers used in this notebook; the forest is seeded so its results are repeatable.
nb = GaussianNB()
rf = RandomForestClassifier(random_state=seed)

## Holdout cross validation
#### The dataset is split with 80% as training data and the other 20% as testing data.

In [3]:
# Hold out 20% of the samples for testing; the seed fixes which rows land in each part.
holdout_parts = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_test, y_train, y_test = holdout_parts

#### The random forest classifier is trained on X_train and y_train. It is then scored on the training data (how well it fits what it has seen) and on the testing data, where the score is the fraction of y_test labels it predicts correctly from X_test.

In [4]:
# Train on the 80% split, then measure accuracy on both the seen and the held-out data.
rf.fit(X_train, y_train)
holdout_train_acc = rf.score(X_train, y_train)
holdout_test_acc = rf.score(X_test, y_test)

print("The fitting of the model to the data: %.3f " % holdout_train_acc)
print("\nThe accuracy of the model: %.3f " % holdout_test_acc)

The fitting of the model to the data: 1.000 

The accuracy of the model: 0.991 


## K-fold Cross validation
+ kf = the k fold algorithm
+ n_splits = the number of splits, in this case there are 10 splits
+ shuffle = data is shuffled when set to True
+ kfScoring = the list of accuracies in each fold  

In [5]:
# Shuffled 10-fold splitter; the seed keeps the fold assignment the same on every run.
kf = KFold(n_splits=10, shuffle=True, random_state=seed)

# Collects one test accuracy per fold.
kfScoring = list()

#### Training the random forest with K-fold cross validation and printing the accuracy of each fold.

In [7]:
# Start from empty score lists inside this cell so that re-running it does not
# append a second set of results to a list created in an earlier cell.
kfScoring = []
kfTrainScoring = []

for k, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Refit the forest on this fold's training rows, then score it on the
    # rows it was trained on and on the held-out rows of the fold.
    rf.fit(X_train, y_train)
    kfTrainScoring.append(rf.score(X_train, y_train))
    res = rf.score(X_test, y_test)
    kfScoring.append(res)

    print("Fold %d: Accuracy: %.3f" % (k, res))

# Training accuracy is averaged over all folds rather than taken from the
# last fold only, so it is comparable with the averaged test accuracy below.
print("\nThe fitting of the model to the data: %.3f " % np.mean(kfTrainScoring))
print("\nAveraged Accuracy: %.3f \nStandard Deviation: %.3f" % (np.mean(kfScoring), np.std(kfScoring)))

Fold 1: Accuracy: 0.982
Fold 2: Accuracy: 1.000
Fold 3: Accuracy: 0.965
Fold 4: Accuracy: 0.982
Fold 5: Accuracy: 0.930
Fold 6: Accuracy: 0.930
Fold 7: Accuracy: 0.982
Fold 8: Accuracy: 0.947
Fold 9: Accuracy: 0.982
Fold 10: Accuracy: 0.893

The fitting of the model to the data: 1.000 

Averaged Accuracy: 0.959 
Standard Deviation: 0.032


## Stratified K-fold cross validation
+ skf = the stratified k fold algorithm
+ n_splits = the number of splits, in this case there are 10 splits
+ shuffle = data is shuffled when set to True
+ skfScoring = the list of accuracies in each fold

In [8]:
# Shuffled 10-fold splitter that is given y when splitting; seeded for repeatable folds.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# Collects one test accuracy per fold.
skfScoring = list()

#### Training the random forest with stratified K-fold cross validation and printing the accuracy of each fold.

In [9]:
# Start from empty score lists inside this cell so that re-running it does not
# append a second set of results to a list created in an earlier cell.
skfScoring = []
skfTrainScoring = []

# The stratified splitter needs y as well as X to build its folds.
for k, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Refit the forest on this fold's training rows, then score it on the
    # rows it was trained on and on the held-out rows of the fold.
    rf.fit(X_train, y_train)
    skfTrainScoring.append(rf.score(X_train, y_train))
    res = rf.score(X_test, y_test)
    skfScoring.append(res)

    print("Fold %d: Accuracy: %.3f" % (k, res))

# Training accuracy is averaged over all folds rather than taken from the
# last fold only, so it is comparable with the averaged test accuracy below.
print("\nThe fitting of the model to the data: %.3f " % np.mean(skfTrainScoring))
print("\nAveraged Accuracy: %.3f \nStandard Deviation: %.3f" % (np.mean(skfScoring), np.std(skfScoring)))

Fold 1: Accuracy: 0.930
Fold 2: Accuracy: 0.947
Fold 3: Accuracy: 0.947
Fold 4: Accuracy: 0.965
Fold 5: Accuracy: 1.000
Fold 6: Accuracy: 0.982
Fold 7: Accuracy: 0.947
Fold 8: Accuracy: 0.982
Fold 9: Accuracy: 0.965
Fold 10: Accuracy: 1.000

The fitting of the model to the data: 1.000 

Averaged Accuracy: 0.967 
Standard Deviation: 0.023


## Repeated Stratified K-Fold Cross Validation
+ rskf = the repeated stratified k fold algorithm
+ n_splits = the number of splits, in this case there are 10 splits
+ n_repeats = the number of times the approach is repeated, in this case 3
+ rskfScoring = the list of accuracies for every fold of every repeat (10 x 3 = 30 values)

In [10]:
# 10 stratified folds repeated 3 times (30 train/test splits in total); seeded for repeatability.
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed)

# Collects one test accuracy per split across all repeats.
rskfScoring = list()

In [11]:
# Start from empty score lists inside this cell so that re-running it does not
# append a second set of results to a list created in an earlier cell.
rskfScoring = []
rskfTrainScoring = []

# The counter runs over every split of every repeat, so it goes past n_splits.
for k, (train_index, test_index) in enumerate(rskf.split(X, y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Refit the forest on this split's training rows, then score it on the
    # rows it was trained on and on the held-out rows of the split.
    rf.fit(X_train, y_train)
    rskfTrainScoring.append(rf.score(X_train, y_train))
    res = rf.score(X_test, y_test)
    rskfScoring.append(res)

    print("Fold %d: Accuracy: %.3f" % (k, res))

# Training accuracy is averaged over all splits rather than taken from the
# last split only, so it is comparable with the averaged test accuracy below.
print("\nThe fitting of the model to the data: %.3f " % np.mean(rskfTrainScoring))
print("\nAveraged Accuracy: %.3f \nStandard Deviation: %.3f" % (np.mean(rskfScoring), np.std(rskfScoring)))

Fold 1: Accuracy: 0.930
Fold 2: Accuracy: 0.947
Fold 3: Accuracy: 0.947
Fold 4: Accuracy: 0.965
Fold 5: Accuracy: 1.000
Fold 6: Accuracy: 0.982
Fold 7: Accuracy: 0.947
Fold 8: Accuracy: 0.982
Fold 9: Accuracy: 0.965
Fold 10: Accuracy: 1.000
Fold 11: Accuracy: 1.000
Fold 12: Accuracy: 0.982
Fold 13: Accuracy: 0.912
Fold 14: Accuracy: 0.912
Fold 15: Accuracy: 0.965
Fold 16: Accuracy: 0.982
Fold 17: Accuracy: 0.982
Fold 18: Accuracy: 0.965
Fold 19: Accuracy: 0.982
Fold 20: Accuracy: 0.964
Fold 21: Accuracy: 0.965
Fold 22: Accuracy: 0.947
Fold 23: Accuracy: 0.982
Fold 24: Accuracy: 0.947
Fold 25: Accuracy: 0.965
Fold 26: Accuracy: 0.947
Fold 27: Accuracy: 0.930
Fold 28: Accuracy: 0.965
Fold 29: Accuracy: 0.982
Fold 30: Accuracy: 0.982

The fitting of the model to the data: 1.000 

Averaged Accuracy: 0.964 
Standard Deviation: 0.023
