In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [11]:
from sklearn.datasets import load_digits

digits = load_digits()

X = digits.data
y = digits.target

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [13]:
from sklearn.linear_model import LogisticRegression

modelLR = LogisticRegression()
modelLR.fit(X_train, y_train)
modelLR.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9583333333333334

In [14]:
from sklearn.tree import DecisionTreeClassifier

modelDT = DecisionTreeClassifier()
modelDT.fit(X_train, y_train)
modelDT.score(X_test, y_test)

0.8583333333333333

In [15]:
from sklearn.ensemble import RandomForestClassifier

modelRF = RandomForestClassifier()
modelRF.fit(X_train, y_train)
modelRF.score(X_test, y_test)

0.9916666666666667

## In the above examples we are trying different models to see which is best suited for our situation

## Also, if you keep running it repeatedly, the X_train, X_test, y_train, y_test values keep changing continuosly which causes variation in the score. Hence we can't determine accuracy ( score) just by running it once

# Demonstrating K Fold API:

In [3]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)                       # you can specify how many folds you want to create
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [5]:
# this takes the list to perform split on as the arguements

for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]): 
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [6]:
# now well write a function, generalizing the model that we use:

In [7]:
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [19]:
#example of using the above method:
from sklearn.svm import SVC

print(get_score(SVC(), X_train, X_test, y_train, y_test))

0.9916666666666667


In [20]:
# now lets use it on out dataset

### StratifiedKFold (is similiar to KFold), but its better because it divides each of the categories uniformly

In [30]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

In [33]:
scores_lr = []                                   # array to keep logistic regression scores
scores_svm = []
scores_rf = []

for train_index, test_index in folds.split(digits.data, digits.target):
    X_train, X_test, y_train, y_test = digits.data[train_index], digits.data[test_index], \
                                       digits.target[train_index], digits.target[test_index]
    
    scores_lr.append(get_score(LogisticRegression(), X_train, X_test, y_train, y_test))
    scores_svm.append(get_score(SVC(), X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(), X_train, X_test, y_train, y_test))

In [32]:
scores_lr

[0.9215358931552587, 0.9415692821368948, 0.9165275459098498]

In [34]:
scores_svm

[0.9649415692821369, 0.9799666110183639, 0.9649415692821369]

In [35]:
scores_rf

[0.9332220367278798, 0.9599332220367279, 0.9198664440734557]

In [36]:
# now we can take mean and decide which model is best suited for us

In [56]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=10)

scores_lr = []                                  
scores_svm = []
scores_rf = []

for train_index, test_index in folds.split(digits.data, digits.target):
    X_train, X_test, y_train, y_test = digits.data[train_index], digits.data[test_index], \
                                       digits.target[train_index], digits.target[test_index]
    
    scores_lr.append(get_score(LogisticRegression(), X_train, X_test, y_train, y_test))
    scores_svm.append(get_score(SVC(), X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(), X_train, X_test, y_train, y_test))

In [43]:
scores_lr

[0.9055555555555556,
 0.9611111111111111,
 0.8777777777777778,
 0.9277777777777778,
 0.9444444444444444,
 0.9666666666666667,
 0.95,
 0.9385474860335196,
 0.8715083798882681,
 0.9385474860335196]

In [46]:
scores_svm


[0.9444444444444444,
 0.9888888888888889,
 0.9277777777777778,
 0.9666666666666667,
 0.9833333333333333,
 0.9888888888888889,
 0.9888888888888889,
 0.994413407821229,
 0.9608938547486033,
 0.9553072625698324]

In [45]:
scores_rf

[0.9,
 0.9777777777777777,
 0.9222222222222223,
 0.9333333333333333,
 0.9666666666666667,
 0.9722222222222222,
 0.9722222222222222,
 0.9720670391061452,
 0.9273743016759777,
 0.9217877094972067]

In [50]:
def mean(arr):
    return sum(arr)/len(arr)

In [53]:
mean(scores_lr)

0.9281936685288642

In [54]:
mean(scores_svm)

0.9699503414028554

In [55]:
mean(scores_rf)

0.956030415890751

In [57]:
# hence svm wins for this case

### Above process can be easily performed by using cross_val_score which takes model, x, y as arguements

In [68]:
from sklearn.model_selection import cross_val_score

cross_val_score(SVC(), digits.data, digits.target)

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])

# Exercise

## To find the best model for iris flower set

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
iris = load_iris()

In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


In [75]:
from sklearn.model_selection import cross_val_score

In [None]:
def mean(l):
    return sum(l)/len(l)

In [79]:
score_lr = cross_val_score(LogisticRegression(), iris.data, iris.target)
print(mean(score_lr))

0.9733333333333334


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [80]:
score_svm = cross_val_score(SVC(), iris.data, iris.target)
print(mean(score_svm))

0.9666666666666666


In [81]:
score_dt = cross_val_score(DecisionTreeClassifier(), iris.data, iris.target)
print(mean(score_dt))

0.9533333333333334


In [83]:
score_rf = cross_val_score(RandomForestClassifier(), iris.data, iris.target)
print(mean(score_rf))

0.96
