In [1]:
##############################
###   RESAMPLING METHODS   ###
##############################

### TRAIN/TEST SPLIT

from random import seed, randrange

# Split a dataset into train/test
def train_test_split(dataset, split=0.60):
    """Randomly partition ``dataset`` into a training list and a test list.

    Rows are drawn without replacement (via ``randrange``) until the training
    list holds at least ``split * len(dataset)`` rows; the rows that were not
    drawn form the test list. ``dataset`` itself is not modified because the
    rows are popped from a shallow copy.

    Args:
        dataset: Sequence of rows to divide.
        split: Fraction of the rows to place in the training list. Must
            satisfy ``0 <= split <= 1``.

    Returns:
        A ``(train, test)`` tuple of lists.

    Raises:
        ValueError: If ``split`` lies outside ``[0, 1]``.
    """
    # Fail early with a clear message: a fraction above 1 would otherwise drain
    # the pool and die inside randrange() with an "empty range" error, and a
    # negative fraction would silently yield an empty training list.
    if not 0 <= split <= 1:
        raise ValueError("split must be between 0 and 1, got %r" % (split,))
    train = list()
    # Kept as a float: the loop below stops at the first length >= this value.
    train_size = split * len(dataset)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy

# Seed the generator so the demo split is the same on every fresh run.
seed(1)
dataset = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13]]

# Default split of 0.60 applied to the 13 single-value rows above.
train, test = train_test_split(dataset)
print(train)
print(test)

In [2]:
### K-FOLD CROSS VALIDATION SPLIT

# The number of rows in the training dataset should be divisible by K
# to ensure that all K groups have the same number of rows. K should be chosen
# so that each group is still representative of the original dataset.
# 3 is a good default for small datasets and 10 is good for large datasets.

def cross_validation_split(dataset, folds=3):
    """Deal the rows of ``dataset`` into ``folds`` random groups of equal size.

    Every group receives ``int(len(dataset) / folds)`` rows, drawn without
    replacement via ``randrange``. Rows still undrawn once all groups are
    full are not part of the result. ``dataset`` itself is left untouched.

    Returns:
        A list containing ``folds`` lists of rows.
    """
    remaining = list(dataset)  # working copy that rows are popped from
    rows_per_fold = int(len(dataset) / folds)
    return [
        [remaining.pop(randrange(len(remaining))) for _ in range(rows_per_fold)]
        for _ in range(folds)
    ]

# Split the 13-row demo list with the default of 3 folds: each fold gets
# int(13 / 3) = 4 rows, so one row is left out of the printed result.
folds = cross_validation_split(dataset)
print(folds)

In [3]:
### Using Train/Test Split for Diabetes Dataset

import pandas as p
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
# Relative path: the CSV is looked up in the kernel's working directory.
filename = 'pima-indians-diabetes.csv'
# Column labels handed to read_csv via names=.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = p.read_csv(filename, names=names)
array = dataframe.values
# Columns 0-7 are the input features; column 8 ('class') is the target.
X = array[:, 0:8]
Y = array[:, 8]
# NOTE(review): this rebinds `seed`, shadowing the `seed` function imported
# from `random` in the first cell. Later cells read this integer as their
# random_state.
seed = 7
# Fraction of the rows held out for testing.
test_size = 0.33
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)
# Score the fitted model on the held-out rows only.
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result*100.0))


Accuracy: 78.740%


In [4]:
#### Using Cross Validation for Diabetes Dataset
## The first 10 lines of code in cell 3 are the same for this cell

# 10 shuffled folds; random_state reuses the integer `seed` from cell 3.
kfold = model_selection.KFold(n_splits=10, 
                              random_state=seed, 
                              shuffle=True
                             )
model = LogisticRegression(max_iter=1000)
# One score per fold, computed on the full X / Y arrays from cell 3.
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
# Prints the mean of the fold scores followed by their standard deviation.
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, 
                                     results.std()*100.0))



Accuracy: 77.216% (4.968%)


In [5]:
#### Leave-One-Out Cross Validation
## The first 9 lines of code in cell 3 are the same for this cell.

# NOTE(review): num_folds and num_instances are assigned but not read anywhere
# in this cell; LeaveOneOut() takes no arguments here.
num_folds = 10
num_instances = len(X)
loocv = model_selection.LeaveOneOut()
model = LogisticRegression(max_iter=1000)
# One score per split produced by LeaveOneOut over the X / Y arrays from cell 3.
results = model_selection.cross_val_score(model, X, Y, cv=loocv)
# Prints the mean of the per-split scores followed by their standard deviation.
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, 
                                     results.std()*100.0))



Accuracy: 77.604% (41.689%)


In [6]:
#### Repeated Random Test-Train Splits
## The first 11 lines of code in cell 3 are the same for this cell

# NOTE(review): num_instances is assigned but not read anywhere in this cell.
num_instances = len(X)
# Despite the name `kfold`, this is a ShuffleSplit: 10 independent random
# splits that reuse `test_size` and `seed` from cell 3.
kfold = model_selection.ShuffleSplit(n_splits=10, 
                                     test_size=test_size, 
                                     random_state=seed
                                    )
model = LogisticRegression(max_iter = 1000)
# One score per random split, computed on the X / Y arrays from cell 3.
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
# Prints the mean of the split scores followed by their standard deviation.
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, 
                                     results.std()*100.0))

Accuracy: 76.535% (2.235%)


In [7]:
#### What Techniques to Use And When

## Generally k-fold cross validation is the gold-standard for evaluating 
#  the performance of a machine learning algorithm on unseen data with 
#  k set to 3, 5, or 10.

## Using a train/test split is good for speed when using a slow algorithm 
#  and produces performance estimates with lower bias when using large 
#  datasets.

## Techniques like leave-one-out cross validation and repeated random 
#  splits can be useful intermediates when trying to balance variance 
#  in the estimated performance, model training speed and dataset size.


## The best advice is to experiment and find a technique for your problem 
#  that is fast and produces reasonable estimates of performance that you 
#  can use to make decisions. If in doubt, use 10-fold cross validation.
