In [2]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits

In [3]:
# Load the digits dataset bundled with scikit-learn.
digit = load_digits()
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing. A fixed random_state makes the
# split (and every score computed from it) reproducible on Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    digit.data, digit.target, test_size=0.2, random_state=42
)

Support Vector Machine

In [4]:
# Build and train the support vector classifier in one expression
# (fit() returns the fitted estimator), then score it on the test split.
# NOTE(review): the low accuracy shown below is probably caused by
# gamma='auto' on unscaled pixel features - confirm, and consider gamma='scale'.
svm_model = SVC(gamma='auto').fit(x_train, y_train)
svm_model.score(x_test, y_test)

0.5055555555555555

Logistic Regression

In [5]:
# One-vs-rest logistic regression: train on the training split
# (fit() returns the fitted estimator), then score it on the test split.
lr_model = LogisticRegression(solver='liblinear', multi_class='ovr').fit(x_train, y_train)
lr_model.score(x_test, y_test)

0.9527777777777777

Random Forest

In [6]:
# Random forests are stochastic: pinning random_state makes the reported
# test score reproducible across kernel restarts.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x_train, y_train)
rf_model.score(x_test, y_test)

0.975

# K-Fold Cross Validation

Basic Example

In [7]:
from sklearn.model_selection import KFold
# Splitter that partitions the samples into 3 folds.
kf = KFold(n_splits=3)
kf  # bare expression so the notebook displays the splitter's repr

KFold(n_splits=3, random_state=None, shuffle=False)

In [8]:
# Toy data: ten items, so the three folds come out with sizes 4, 3 and 3.
toy_samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
# Each iteration yields the index arrays of one train/test fold.
for train_index, test_index in kf.split(toy_samples):
    print(train_index, test_index)

[4 5 6 7 8 9] [0 1 2 3]
[0 1 2 3 7 8 9] [4 5 6]
[0 1 2 3 4 5 6] [7 8 9]


Stratified K-Fold for our digits dataset

In [9]:
def get_score(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training data and return its score on the test data.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``.
        It is fitted in place.
    x_train, y_train : features and labels passed to ``model.fit``.
    x_test, y_test : features and labels passed to ``model.score``.

    Returns
    -------
    Whatever ``model.score(x_test, y_test)`` returns.
    """
    model.fit(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return test_score

In [11]:
from sklearn.model_selection import StratifiedKFold
# Stratified splitter: 3 folds over the full digits dataset.
kfs = StratifiedKFold(n_splits=3)

# Per-fold scores, one list per model.
score_lr = []
score_svm = []
score_rf = []

for train_index, test_index in kfs.split(digit.data, digit.target):
    # Fold-local names: reusing x_train/x_test/y_train/y_test here would
    # silently overwrite the hold-out split made earlier with train_test_split.
    x_fold_train, x_fold_test = digit.data[train_index], digit.data[test_index]
    y_fold_train, y_fold_test = digit.target[train_index], digit.target[test_index]
    fold = (x_fold_train, x_fold_test, y_fold_train, y_fold_test)

    score_lr.append(get_score(LogisticRegression(solver='liblinear', multi_class='ovr'), *fold))
    score_svm.append(get_score(SVC(gamma='auto'), *fold))
    # random_state pins the forest's randomness so score_rf is reproducible.
    score_rf.append(get_score(RandomForestClassifier(n_estimators=40, random_state=42), *fold))

In [12]:
# Per-fold logistic regression scores collected by the fold loop above.
score_lr

[0.8948247078464107, 0.9532554257095158, 0.9098497495826378]

In [13]:
# Per-fold SVM scores collected by the fold loop above.
score_svm

[0.3806343906510851, 0.41068447412353926, 0.5125208681135225]

In [14]:
# Per-fold random forest scores collected by the fold loop above.
score_rf

[0.9232053422370617, 0.9482470784641068, 0.9348914858096828]

## Cross Validation score Function

In [15]:
from sklearn.model_selection import cross_val_score

In [16]:
# 3-fold cross-validation of logistic regression on the full digits data;
# the result holds one score per fold.
cross_val_score(
    LogisticRegression(solver='liblinear', multi_class='ovr'),
    digit.data,
    digit.target,
    cv=3,
)

array([0.89482471, 0.95325543, 0.90984975])

In [17]:
# 3-fold cross-validation of the SVM on the full digits data;
# the result holds one score per fold.
cross_val_score(
    SVC(gamma='auto'),
    digit.data,
    digit.target,
    cv=3,
)

array([0.38063439, 0.41068447, 0.51252087])

In [18]:
# 3-fold cross-validation of a 40-tree random forest on the full digits data.
# random_state pins the forest's randomness so the per-fold scores are
# reproducible from run to run.
cross_val_score(
    RandomForestClassifier(n_estimators=40, random_state=42),
    digit.data,
    digit.target,
    cv=3,
)

array([0.93489149, 0.94991653, 0.91986644])

### Exercise
Use the iris flower dataset from the sklearn library and apply cross_val_score to each of the following models to measure its performance. Finally, determine which model performs best.

1. Logistic Regression

2. SVM

3. Decision Tree

4. Random Forest

In [20]:
from sklearn.datasets import load_iris
# Iris dataset for the exercise; the cells below read flower.data and flower.target.
flower = load_iris()

In [28]:
# Mean score of logistic regression over 5 cross-validation folds on iris.
cross_val_score(
    LogisticRegression(solver='liblinear', multi_class='ovr'),
    flower.data,
    flower.target,
    cv=5,
).mean()

0.9600000000000002

In [29]:
# Mean score of the SVM over 5 cross-validation folds on iris.
cross_val_score(
    SVC(gamma='auto'),
    flower.data,
    flower.target,
    cv=5,
).mean()

0.9800000000000001

In [30]:
from sklearn import tree
# Mean score of a decision tree over 5 cross-validation folds on iris.
# random_state is pinned because tree construction is randomized, which
# would otherwise let the reported mean vary between runs.
np.average(
    cross_val_score(
        tree.DecisionTreeClassifier(random_state=42),
        flower.data,
        flower.target,
        cv=5,
    )
)

0.9600000000000002

In [31]:
# Mean score of a 30-tree random forest over 5 cross-validation folds on iris.
# random_state pins the forest's randomness so the reported mean is
# reproducible from run to run.
np.average(
    cross_val_score(
        RandomForestClassifier(n_estimators=30, random_state=42),
        flower.data,
        flower.target,
        cv=5,
    )
)

0.9466666666666667

Based on the mean 5-fold cross-validation scores above, SVM (0.98) is the best-performing model for this dataset.