# Cross-validation: evaluating estimator performance 
* Learning the parameters of a prediction function and testing it on the same data is a methodological mistake
* when performing a (supervised) machine learning experiment to hold out part of the available data as a test set X_test, y_test. Note that the word “experiment” is not intended to denote academic use only, because even in commercial settings machine learning usually starts out experimentally.
* The best parameters can be determined by [grid search](https://scikit-learn.org/stable/modules/grid_search.html#grid-search) techniques.
* In scikit-learn a random split into training and test sets can be quickly computed with the [train_test_split helper](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html#sklearn.model_selection.train_test_split) function


[source](https://scikit-learn.org/stable/modules/cross_validation.html)

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets

X, y = datasets.load_iris(return_X_y=True)
X.shape, y.shape

((150, 4), (150,))

In [2]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [47]:
# ****** HERE
import pandas
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
import numpy as np

X, y = datasets.load_iris(return_X_y=True)

scores = []
# clf = SVR(kernel='rbf')
# clf = DecisionTreeClassifier(random_state=0)
clf = svm.SVC(kernel='linear', C=1, random_state=42)
cv = KFold(n_splits=5, random_state=42, shuffle=False)
for train_index, test_index in cv.split(X):
    # print("Train Index: ", train_index, "\n")
    # print("Test Index: ", test_index)
    # print("-----")

    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
    model = clf.fit(X_train, y_train)
    y_pred = model.predict(X_test) 

    accuracy = accuracy_score(y_test, y_pred)
    f1_micro = f1_score(y_test, y_pred,average='micro')
    f1_macro = f1_score(y_test, y_pred,average='macro')
    scores.append([f1_micro,f1_macro,accuracy])

results = np.array(scores)
mean = np.mean(results, axis=0)
std = np.std(results, axis=0)

print("f1_micro =                 ", round(mean[0]*100,2)," +- ", round(std[0]*100,2))
print("f1_macro =                 ", round(mean[1]*100,2)," +- ", round(std[1]*100,2))
print("acc =                      ", round(mean[2]*100,2)," +- ", round(std[2]*100,2))


f1_micro =                  94.67  +-  6.53
f1_macro =                  78.57  +-  26.24
acc =                       94.67  +-  6.53
