In [134]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model  import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# This module avoid error warning while LogisticRegression running 
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [135]:
digits = load_digits()
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.3)

In [136]:
pipeline_lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
pipeline_lr.fit(x_train, y_train)
pipeline_lr.score(x_test, y_test)

0.9611111111111111

In [113]:
svm = SVC()
svm.fit(x_train, y_train)
svm.score(x_test, y_test)

0.9944444444444445

In [114]:
rf = RandomForestClassifier(n_estimators=40)
rf.fit(x_train, y_train)
rf.score(x_test, y_test)

0.9740740740740741

## <span style="color: red;">KFold Cross-Validation</span>
### K-Fold cross-validation is a technique that involves splitting the dataset into k smaller sets (folds). The model is trained on k-1 of these folds and tested on the remaining one fold. This process is repeated k times, with each fold being used exactly once as the test set. The results are then averaged to produce a single performance metric.

In [137]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

#### Example of KFold

In [138]:
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9,10]):
    print(train_index, test_index)

[4 5 6 7 8 9] [0 1 2 3]
[0 1 2 3 7 8 9] [4 5 6]
[0 1 2 3 4 5 6] [7 8 9]


In [139]:
def get_score(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    return model.score(x_test, y_test)

In [108]:
# get_score(pipeline_lr, x_train, x_test, y_train, y_test)

In [117]:
get_score(SVC(), x_train, x_test, y_train, y_test)

0.9944444444444445

In [118]:
get_score(RandomForestClassifier(), x_train, x_test, y_train, y_test)

0.9833333333333333

### <span style="color:red">StratifiedKFold</span>
#### Stratified k-fold cross-validation is an enhancement over regular k-fold cross-validation that ensures each fold is representative of the whole dataset. Specifically, it maintains the proportion of each class in every fold, making it particularly useful for imbalanced datasets.

In [119]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

In [140]:
score_lr = []
score_svm = []
score_rf = []
for train_index, test_index in kf.split(digits.data):
    x_train, x_test, y_train, y_test = digits.data[train_index], digits.data[test_index], \
                                        digits.target[train_index], digits.target[test_index]
    score_lr.append(get_score(pipeline_lr, x_train, x_test, y_train, y_test))
    score_svm.append(get_score(SVC(), x_train, x_test, y_train, y_test))
    score_rf.append(get_score(RandomForestClassifier(n_estimators=90), x_train, x_test, y_train, y_test))

In [141]:
score_lr

[0.9248747913188647, 0.9398998330550918, 0.9265442404006677]

### <span style="color:red;">cross_val_score</span>
####  It automates the process of splitting the dataset into multiple folds, training the model on each subset, and computing evaluation metrics. This provides a robust way to assess how well the model is likely to generalize to an independent dataset.

In [142]:
from sklearn.model_selection import cross_val_score

In [148]:
cross_val_score(pipeline_lr, digits.data, digits.target)

array([0.91388889, 0.88055556, 0.94428969, 0.9637883 , 0.89693593])

In [144]:
cross_val_score(SVC(), digits.data, digits.target)

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])

In [145]:
cross_val_score(RandomForestClassifier(), digits.data, digits.target)

array([0.93611111, 0.925     , 0.95543175, 0.9637883 , 0.92200557])