Recommended reading on k-fold cross-validation: https://machinelearningmastery.com/k-fold-cross-validation/

In [1]:
import sys
sys.path.insert(0, '../') #to load KFold

In [2]:
import os

import numpy as np
from keras import Input, optimizers
from keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from keras.models import Model

from KFold import K_Fold

Using TensorFlow backend.


### First, it is important to understand differences between ShuffleSplit and KFold

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold

splits = 5

# Toy data set: ten sample indices with five samples in each of two classes.
tx = range(10)
ty = [0] * 5 + [1] * 5

# Both splitters use the same number of splits and the same seed so that their
# outputs can be compared side by side; test_size=2 asks ShuffleSplit for two
# test samples per split.
kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
shufflesplit = StratifiedShuffleSplit(n_splits=splits, random_state=42, test_size=2)

# Print the train/test indices produced by each splitter under its own heading.
for heading, splitter in (("KFold", kfold), ("Shuffle Split", shufflesplit)):
    print(heading)
    for train_index, test_index in splitter.split(tx, ty):
        print("TRAIN:", train_index, "TEST:", test_index)

KFold
TRAIN: [0 2 3 4 5 7 8 9] TEST: [1 6]
TRAIN: [0 1 2 3 5 6 7 8] TEST: [4 9]
TRAIN: [0 1 3 4 5 6 8 9] TEST: [2 7]
TRAIN: [1 2 3 4 6 7 8 9] TEST: [0 5]
TRAIN: [0 1 2 4 5 6 7 9] TEST: [3 8]
Shuffle Split
TRAIN: [8 4 1 0 6 5 7 2] TEST: [3 9]
TRAIN: [7 0 3 9 4 5 1 6] TEST: [8 2]
TRAIN: [1 2 5 6 4 8 9 0] TEST: [3 7]
TRAIN: [4 6 7 8 3 5 1 2] TEST: [9 0]
TRAIN: [7 2 6 5 4 3 0 9] TEST: [1 8]


In [3]:
# In KFold, the test sets of different splits never overlap, even with shuffle=True.
# With KFold and shuffle, the data is shuffled once at the start and then divided into the desired number of splits.
# The test data is always one of the splits; the train data is the rest.

# In ShuffleSplit, the data is reshuffled for every split and then divided.
# This means the test sets may overlap between splits. In the output above:
#     index 3 is in the test set of the first and third rows, and index 9 in the first and fourth rows.

In [4]:
# Thus, in ShuffleSplit the test_size can be chosen freely; for instance, .2 leaves 1 - .2 = .8 of the data for training.
# In KFold, the test size is determined by the size of the data and K (the test sets must not overlap): if the data
# has length 10 and K is 5, then each test set has 10/5 = 2 samples so that they never overlap.

### Second, let's create the K-Fold cross validation

In [4]:
# Root folder of the training images (the fold summary further down lists
# files under its 'cats' and 'dogs' sub-folders).
# Set the KFOLD_DB_PATH environment variable to point the notebook at another
# copy of the data set; when it is unset, the original author's local path is
# used, so existing behaviour is unchanged.
DB_Path = os.environ.get('KFOLD_DB_PATH', '/Users/dfreire/Dropbox/Datasets/small_dataset/train')

The general procedure is as follows:

Shuffle the dataset randomly. ok
Split the dataset into k groups ok
For each unique group:
Take the group as a hold out or test data set
Take the remaining groups as a training data set
Fit a model on the training set and evaluate it on the test set
Retain the evaluation score and discard the model
Summarize the skill of the model using the sample of model evaluation scores

In [5]:
#To save history dict
import pickle        
def save_obj(obj, name):
    """Pickle ``obj`` into the file at path ``name``.

    The file is opened in binary write mode (an existing file is overwritten)
    and the object is serialized with the highest pickle protocol available.

    Args:
        obj: Any picklable Python object.
        name: Path of the file to write.
    """
    with open(name, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Read the file at path ``name`` and return the object pickled in it.

    Unpickling can execute arbitrary code, so only load files that come from
    a trusted source.

    Args:
        name: Path of a file containing a pickled object.

    Returns:
        The unpickled object.
    """
    with open(name, mode='rb') as source:
        restored = pickle.load(source)
    return restored

In [6]:
def get_model():
    """Build and compile the small CNN used as a binary image classifier.

    Architecture: a 150x150x3 input, four stages of Conv2D (3x3 kernel, ReLU)
    followed by MaxPool2D (pool 2, stride 2) with 32, 64, 128 and 128 filters,
    then Flatten, Dense(512, ReLU) and a single sigmoid output unit.
    No dropout layer is applied between Flatten and the dense layer.

    Returns:
        A keras ``Model`` compiled with RMSprop (lr=0.001), the
        ``'binary_crossentropy'`` loss and the ``'acc'`` metric.
    """
    image_in = Input(shape=(150, 150, 3))

    # Layers are named conv_<i> / maxpool_<i>, numbered from 1.
    features = image_in
    for stage, n_filters in enumerate((32, 64, 128, 128), start=1):
        features = Conv2D(
            filters=n_filters,
            kernel_size=3,
            activation='relu',
            name='conv_{0}'.format(stage),
        )(features)
        features = MaxPool2D(
            pool_size=2,
            strides=2,
            name='maxpool_{0}'.format(stage),
        )(features)

    flattened = Flatten(name='flatten')(features)
    hidden = Dense(units=512, activation='relu', name='Dense1')(flattened)

    # One sigmoid unit goes together with the binary cross-entropy loss below
    # (the categorical alternative would be a 2-unit softmax output trained
    # with 'categorical_crossentropy').
    probability = Dense(units=1, activation='sigmoid', name='output')(hidden)

    classifier = Model(image_in, probability)
    classifier.compile(
        optimizer=optimizers.RMSprop(lr=0.001),
        loss='binary_crossentropy',
        metrics=['acc'],
    )
    return classifier

In [7]:
# Generator parameters: both generators multiply every pixel value by 1/255.
# NOTE(review): presumably forwarded by K_Fold to Keras image generators -
# confirm in KFold.py.
traingen_params = dict(rescale=1./255)
testgen_params = dict(rescale=1./255)

# Batch-reading parameters for the training and test images. The target size
# matches the 150x150 input of get_model(), and the 'binary' class mode matches
# its single sigmoid output (use 'categorical' for a softmax output instead).
train_params = dict(
    batch_size=20,
    target_size=(150, 150),
    class_mode='binary',
)
test_params = dict(
    batch_size=20,
    target_size=(150, 150),
    class_mode='binary',
)

# Training-loop parameters.
fit_params = dict(epochs=10, shuffle=True, verbose=1)

In [8]:
# Create the project's K_Fold helper for the images under DB_Path.
# The second argument appears to be the number of folds (the Check_Folds()
# output reports "There are 4 Folds") - confirm against KFold.py.
KF = K_Fold(DB_Path, 4)

In [9]:
# Build and compile a fresh CNN; this instance is what gets passed to
# KF.Apply_KFold for training.
model = get_model()

In [10]:
# Sanity check before training: prints, for every fold, the number of
# training/testing samples per class and the first five image paths of each set.
KF.Check_Folds()

There are 4 Folds

Fold  0
For training, 1500 samples: {'cats': 750, 'dogs': 750}
For testing, 500 samples: {'cats': 250, 'dogs': 250}
First five X_train images: 
['/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.0.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.1.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.10.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.100.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.101.jpg']
First five X_val images: 
['/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.104.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.106.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.108.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.117.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.12.jpg']

Fold  1
For training, 1500 samples: {'cats': 750, 'dogs': 750}
For testing, 500 samples: {'cat

In [6]:
# Run the cross-validation and keep what Apply_KFold returns.
# The result is bound to `hist_dict` because the cells that print and save the
# history read it under that name; with only `hist` assigned they fail with a
# NameError on a fresh kernel. `hist` is kept as an alias for the old name.
# NOTE(review): the later cells treat the result as a dict mapping each metric
# name to a sequence of values - confirm against KFold.py.
# NOTE(review): one model instance is passed in for all folds; confirm that
# K_Fold re-initialises the weights per fold, otherwise later folds are
# evaluated on images the model has already been trained on.
hist_dict = KF.Apply_KFold(
    model,
    traingen_params,
    testgen_params,
    train_params,
    test_params,
    fit_params,
)
hist = hist_dict


Fold  0
Found 1500 images belonging to 2 classes.
Found 500 images belonging to 2 classes.
Training
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [None]:
# Print one line per history entry: the entry's key and the mean of its values.
for metric_name, metric_values in hist_dict.items():
    metric_mean = np.mean(np.array(metric_values))
    print('{0}: {1}'.format(metric_name, metric_mean))

In [None]:
# Persist the history to disk so it can be inspected without retraining.
save_obj(obj=hist_dict, name='hist_config1.pkl')

In [None]:
# Read the pickled history back from disk.
hist_retr = load_obj(name='hist_config1.pkl')

In [None]:
# Print the mean of every entry of the reloaded history, one line per key.
for entry_name, entry_values in hist_retr.items():
    entry_mean = np.mean(np.array(entry_values))
    print('{0}: {1}'.format(entry_name, entry_mean))