In [None]:
# Unbalanced Data set
# https://www.youtube.com/watch?v=iCxtl44NMek&ab_channel=AnalyticswithAdam

# Hold Out Cross Validation

In [1]:
# Cross Validation is a method to estimate the performance of a machine learning model.
# It is a technique for evaluating a machine learning model and testing its performance.
# We use training data set and split it into smaller ones.

# Cross-Validation also referred to as out of sampling technique is an essential element of a data science project. 
# It is a resampling procedure used to evaluate machine learning models and access how the model will perform for 
# an independent test dataset.

# While the primary significance of cross-validation is to validate a model and test its accuracy, there are more factors 
# that make this method important. 

# More than validating the model and testing its accuracy, cross-validation is used for measuring overfitting and any other 
# errors that might be noticed while testing the model. Overfitting refers to a concept wherein a statistical model works 
# against its training data set and does not show accuracy as expected. 

In [None]:
# Splitting the main dataset in train and test sets. We do training on the training set, and validation on the train set.
# Advantages: quick to execute
# Disadvantage: not suitable for unbalanced dataset, not suitable for small datasets

In [5]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

iris = load_iris()
X = iris.data
y = iris.target

log = LogisticRegression()  
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

log.fit(x_train,y_train)

y_pred = log.predict(x_test)

print(f"Accuracy on training dataset is : {accuracy_score(y_train,log.predict(x_train))}")
print(f"Accuracy on test dataset is : {accuracy_score(y_test,log.predict(x_test))}")

Accuracy on training dataset is : 0.9619047619047619
Accuracy on test dataset is : 1.0


# K-Fold Cross Validation Technique

In [3]:
# Partition the whole dataset into k parts. 
# Call each partition as a fold
# Use 1 fold for Validation
# Use k-1 fold for training purpose
# This technique is repeated k times, until each fold is used as a validation data, and remaining folds used as a training data.
# The final accuracy of the model is computed by taking the mean accuracy of k models validation data
# Pros: the entire dataset is is actually used as training set and validation set
# Cons: it shouldn't be used on unbalanced dataset (eg only class 0 will be presented in the training data set, no class 1)

In [4]:
from sklearn.model_selection import cross_val_score, KFold

In [8]:
kf = KFold(n_splits=5)

score = cross_val_score(log,X,y,cv=kf)

kf = KFold(n_splits=5)

score = cross_val_score(log,X,y,cv=kf)

print(f"Cross Validation scores are: {score}")    # for each split
print(f"Average Cross Validation scores is: {score.mean()}")   # mean of 5 splits

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Cross Validation scores are: [1.         1.         0.86666667 0.93333333 0.83333333]
Average Cross Validation scores is: 0.9266666666666665


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Stratified K-Fold Cross Validation

In [9]:
# Used for unbalanced dataset
# Not suitable for Time Series data

In [10]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
skf = StratifiedKFold(n_splits=2)

for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [1 3] TEST: [0 2]
TRAIN: [0 2] TEST: [1 3]


# Leave-p-out cross validation technique

In [None]:
# p samples are used for validation set and the remaining N minus P samples are used as training set
# If I have 100 samples in my dataset, I use 10 as p, then in each iteration 10 values will be used as validation dataset,
# and the remaining 90 samples will be used as my training dataset.
# This process is repeated till the whole data set gets divided on the validation set of P samples and 
# N minus P training samples
# Pros: all the data samples get used, both training and the validation samples
# Cons: hihg computation time and not suitable for imbalanced datasets

In [11]:
import numpy as np
from sklearn.model_selection import LeavePOut

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
lpo = LeavePOut(2)

print (lpo)

for train_index, test_index in lpo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

LeavePOut(p=2)
TRAIN: [2 3] TEST: [0 1]
TRAIN: [1 3] TEST: [0 2]
TRAIN: [1 2] TEST: [0 3]
TRAIN: [0 3] TEST: [1 2]
TRAIN: [0 2] TEST: [1 3]
TRAIN: [0 1] TEST: [2 3]


# Leave one out Cross Validation

In [12]:
import numpy as np
from sklearn.model_selection import LeaveOneOut

X = np.array([[1, 2], [3, 4]])
y = np.array([1, 2])
loo = LeaveOneOut()

print(loo)

for train_index, test_index in loo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]


LeaveOneOut()
TRAIN: [1] TEST: [0]
TRAIN: [0] TEST: [1]
