# Testing scikit-learn functions on the Iris data set

In [19]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
import pandas as pd
import numpy as np 


In [16]:
# Load the Iris data set; the returned object exposes .data, .feature_names and .target.
iris = datasets.load_iris()

datas = iris.data
feature_names = iris.feature_names
# Wrap the feature matrix in a DataFrame with one column per feature name.
iris_df = pd.DataFrame(datas, columns=feature_names)

# Append the class label column; encoding is 0: setosa, 1: versicolor, 2: virginica.
iris_df['target'] = iris.target
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [33]:
# Fetch the features and labels directly as an (X, y) pair.
X, y = datasets.load_iris(return_X_y=True)
# Show both shapes as the cell output.
X.shape, y.shape

((150, 4), (150,))

In [47]:
# Hold out 20% of the samples for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape

((120, 4), (30, 4))

In [48]:
# Fit a linear-kernel SVM (C=1) on the training split and score it on the held-out split.
classifier = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
classifier.score(X_test, y_test)

1.0

In [239]:
from sklearn.model_selection import cross_val_score

# Evaluate an unfitted linear SVM with 3-fold cross-validation over the full data set.
clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, X, y, cv=3)
# One score per fold.
scores

array([1.  , 1.  , 0.98])

In [240]:
# Summarise the per-fold scores: mean and standard deviation.
scores.mean(), scores.std()

(0.9933333333333333, 0.009428090415820642)

The linear SVM reaches a mean accuracy of $99.3\%$ over the 3 cross-validation folds, with a standard deviation below $1\%$.

In [74]:
from sklearn import metrics

# Same 3-fold evaluation of clf, but scored with macro-averaged F1 via scoring='f1_macro'.
# This rebinds `scores`, replacing the per-fold values computed earlier.
scores = cross_val_score(clf, X, y, cv=3, scoring='f1_macro')
scores

array([1.        , 1.        , 0.98037518])

In [75]:
# Mean and standard deviation of the per-fold scores currently held in `scores`.
scores.mean(), scores.std()

(0.9934583934583935, 0.009251228690848491)

In [83]:
def custom_cv_2folds(X):
    """Yield two ``(train, test)`` index pairs covering the two halves of ``X``.

    The rows are cut at ``n // 2``: the first pair indexes rows ``[0, n // 2)``
    and the second rows ``[n // 2, n)``.  The bounds are computed with integer
    arithmetic so that, for an odd number of rows, the two halves never share
    a row and do not depend on how float bounds are cast to integers.

    NOTE(review): each pair uses the *same* index array for training and for
    testing, so a score computed from these splits is measured on the rows the
    model was fitted on -- confirm this is intended.

    Parameters
    ----------
    X : object with a ``shape`` attribute
        Only ``X.shape[0]`` (the number of rows) is read.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        Integer index arrays ``(train_idx, test_idx)``; both are the same array.
    """
    n = X.shape[0]
    for i in (1, 2):
        # Floor division keeps both bounds integral: [0, n // 2) then [n // 2, n).
        idx = np.arange(n * (i - 1) // 2, n * i // 2, dtype=int)
        yield idx, idx
    
# custom_cv_2folds is a generator function, so custom_cv can only be iterated once;
# build a new one before re-running the evaluation.
custom_cv = custom_cv_2folds(X)
# Pass the iterable of (train, test) index pairs as the cv argument.
cross_val_score(clf, X, y, cv=custom_cv)


array([1.        , 0.97333333])

In [92]:
from sklearn import preprocessing

# New 70/30 split; this rebinds X_train / X_test / y_train / y_test from the earlier 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only, then apply that same transform to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
# No kernel argument is passed here, unlike the kernel='linear' SVMs used elsewhere.
clf = svm.SVC(C=1).fit(X_train_transformed, y_train)
# NOTE(review): "trasnformed" is a typo for "transformed"; the name is left unchanged
# in case other cells refer to it.
X_test_trasnformed = scaler.transform(X_test)
clf.score(X_test_trasnformed, y_test)

1.0

In [121]:
from sklearn.pipeline import make_pipeline 
# Chain scaling and the SVM into a single estimator so that cross-validation fits
# the scaler together with the SVM on each training fold.
clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))
# Mean score over the 3 folds.
cross_val_score(clf, X, y, cv=3).mean()


0.9666666666666667

In [139]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score, precision_score
# Metrics evaluated together in one cross-validation run.
scoring = ['precision_macro', 'recall_macro', 'accuracy']
clf = svm.SVC(kernel='linear', C=1, random_state=1)
scores = cross_validate(
    clf,
    X,
    y,
    scoring=scoring,
    cv=3,
    return_estimator=True,
    return_train_score=True,
)
# Mean over the folds, in order: test accuracy, train accuracy,
# test macro-precision, test macro-recall.
tuple(
    scores[name].mean()
    for name in (
        'test_accuracy',
        'train_accuracy',
        'test_precision_macro',
        'test_recall_macro',
    )
)

(0.9933333333333333,
 0.9899999999999999,
 0.9938271604938271,
 0.9934640522875817)

In [143]:
from sklearn.model_selection import cross_val_predict

# Get one cross-validated prediction per sample (3 folds) rather than a score per fold.
clf = svm.SVC(kernel='linear', C=1, random_state=42)
cross_val_predict(clf, X, y, cv=3)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [230]:
from sklearn.model_selection import LeaveOneOut

def leave_one_out(X, y):
    """Print the mean leave-one-out score of a linear SVM on ``(X, y)``.

    For every split produced by ``LeaveOneOut`` a new
    ``SVC(kernel='linear', C=1)`` is fitted on the training rows and scored on
    the held-out rows; the mean of those scores is printed.  Nothing is
    returned.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; must support indexing with an integer index array.
    y : numpy.ndarray
        Labels aligned with the rows of ``X``; indexed the same way.
    """
    loo = LeaveOneOut()
    scores = []
    for train_idxs, test_idxs in loo.split(X, y):
        # Fancy-index the arrays directly for both the training and the
        # held-out rows instead of collecting rows one by one in Python lists.
        clf = svm.SVC(kernel='linear', C=1).fit(X[train_idxs], y[train_idxs])
        scores.append(clf.score(X[test_idxs], y[test_idxs]))

    print(np.array(scores).mean())

leave_one_out(X, y)

0.98


In [241]:
from sklearn.model_selection import GridSearchCV, KFold

# Search for the best C / gamma of an RBF-kernel SVM and estimate its
# performance with nested cross-validation.
# The gamma grid is built with a float step, hence values such as
# 0.020999999999999998 in the reported best parameters.
param_grid = {'C': np.arange(1, 101, 10), 'gamma': np.arange(0.001, 0.1, 0.010)}
model = svm.SVC()

# outer_cv scores the tuned model; inner_cv is used by the grid search to pick parameters.
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv)
# Fitting on the full data set is what makes grid_search.best_params_ available below.
grid_search.fit(X, y)
# Nested estimate: the grid search itself is the estimator evaluated on the outer folds.
nested_score = cross_val_score(grid_search, X=X, y=y, cv=outer_cv)

nested_score.mean(), grid_search.best_params_

(0.9800000000000001, {'C': 51, 'gamma': 0.020999999999999998})

In [255]:
# Same nested cross-validation as for the RBF search, but for a linear-kernel SVM:
# only C is tuned, over the values 1, 11, ..., 91.
param_grid = {'C': np.arange(1, 101, 10)}
model = svm.SVC(kernel='linear')

# 30 outer folds here (versus 3 inner folds); both are shuffled with a fixed seed.
outer_cv = KFold(n_splits=30, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv)
# Fitting on the full data set is what makes grid_search.best_params_ available below.
grid_search.fit(X, y)
# Nested estimate: the grid search itself is the estimator evaluated on the outer folds.
nested_score = cross_val_score(grid_search, X=X, y=y, cv=outer_cv)

nested_score.mean(), grid_search.best_params_

(0.98, {'C': 1})