Recommended reading on k-fold cross-validation: https://machinelearningmastery.com/k-fold-cross-validation/

In [1]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
import os
import math
from sklearn import preprocessing
import warnings
from DataGenerator import ImgListDataGen
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

Using TensorFlow backend.


### First, it is important to understand the differences between ShuffleSplit and KFold

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import datasets

# Toy problem: ten samples, the first five in class 0 and the last five in class 1.
splits = 5
tx = range(10)
ty = [0] * 5 + [1] * 5

kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
shufflesplit = StratifiedShuffleSplit(n_splits=splits, random_state=42, test_size=2)

# Print the train/test indices produced by each splitter, one split per line.
for title, splitter in (("KFold", kfold), ("Shuffle Split", shufflesplit)):
    print(title)
    for train_index, test_index in splitter.split(tx, ty):
        print("TRAIN:", train_index, "TEST:", test_index)

KFold
TRAIN: [0 2 3 4 5 7 8 9] TEST: [1 6]
TRAIN: [0 1 2 3 5 6 7 8] TEST: [4 9]
TRAIN: [0 1 3 4 5 6 8 9] TEST: [2 7]
TRAIN: [1 2 3 4 6 7 8 9] TEST: [0 5]
TRAIN: [0 1 2 4 5 6 7 9] TEST: [3 8]
Shuffle Split
TRAIN: [8 4 1 0 6 5 7 2] TEST: [3 9]
TRAIN: [7 0 3 9 4 5 1 6] TEST: [8 2]
TRAIN: [1 2 5 6 4 8 9 0] TEST: [3 7]
TRAIN: [4 6 7 8 3 5 1 2] TEST: [9 0]
TRAIN: [7 2 6 5 4 3 0 9] TEST: [1 8]


In [3]:
# In KFold, the test sets of different folds never overlap, even with shuffle=True.
# With shuffle=True, the data is shuffled once at the start and then divided into the desired number of folds.
# In each iteration the test data is one of the folds and the training data is all the remaining folds.

# In ShuffleSplit, the data is reshuffled on every iteration and then split.
# This means the test sets of different iterations may overlap. In the output above,
# sample 3 is in the test sets of splits 1 and 3, and sample 9 is in the test sets of splits 1 and 4.

In [4]:
# Thus, in ShuffleSplit the test size can be chosen freely: for instance, test_size=.2 leaves 1 - .2 = .8 of the data for training.
# In KFold, the test size is determined by the size of the data and K (because the test sets must never overlap):
# if the data has 10 samples and K is 5, each test set has 10/5 = 2 samples.

### Second, let's create the K-Fold cross validation

In [5]:
# Root folder of the training images (one sub-folder per class).
# Set the KFOLD_DB_PATH environment variable to run the notebook on another
# machine; the original local path is kept as the fallback.
DB_Path = os.environ.get('KFOLD_DB_PATH', '/Users/dfreire/Dropbox/Datasets/small_dataset/train')

In [6]:
def Load_KFold(DB_path, k=2):
    """Index an image-folder dataset and build stratified K-fold splits over it.

    ``DB_path`` must contain one sub-directory per class; every regular,
    non-hidden file inside a class directory is treated as one sample.

    Args:
        DB_path: Root directory of the dataset.
        k: Number of folds (passed to ``StratifiedKFold`` as ``n_splits``).

    Returns:
        A tuple ``(folds, data, labels)`` where ``folds`` is a list of
        ``(train_indices, test_indices)`` pairs, ``data`` is a NumPy array of
        image file paths and ``labels`` is the matching integer class array.

    Raises:
        ValueError: If ``DB_path`` contains no class sub-directories.
    """
    # Only directories are classes: a stray file (e.g. ``.DS_Store``) would
    # otherwise be encoded as a label and break the per-class ``os.listdir``.
    # Sorting makes the sample order, and therefore the seeded folds,
    # independent of the arbitrary order in which ``os.listdir`` returns entries.
    class_names = sorted(
        name for name in os.listdir(DB_path)
        if os.path.isdir(os.path.join(DB_path, name))
    )
    if not class_names:
        raise ValueError('No class sub-directories found in {!r}'.format(DB_path))

    # Numeric labels
    encoder = preprocessing.LabelEncoder()
    encoder.fit(class_names)
    class_dict = dict(zip(class_names, encoder.transform(class_names)))
    print(class_dict)
    # inv_class_dict = {idx: name for name, idx in class_dict.items()} decodes labels if necessary

    # Read each class folder
    data = []
    labels = []
    for class_ in class_names:
        class_dir = os.path.join(DB_path, class_)
        files = [
            os.path.join(class_dir, img)
            for img in sorted(os.listdir(class_dir))
            if not img.startswith('.') and os.path.isfile(os.path.join(class_dir, img))
        ]
        data.extend(files)
        labels.extend([class_dict[class_]] * len(files))

    data = np.array(data)
    labels = np.array(labels, dtype=int)

    print(len(data))
    print(len(labels))
    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=1)
    folds = list(splitter.split(data, labels))

    return folds, data, labels

In [7]:
# Index the training images and split them into 4 stratified folds.
k = 4
folds, data, labels = Load_KFold(DB_Path, k)

{'cats': 0, 'dogs': 1}
2000
2000


In [8]:
# Sanity-check the split sizes using the first fold.
first_train_idx, first_test_idx = folds[0]

print('There are {} Folds'.format(len(folds)))
print('Train data contains {} samples'.format(len(first_train_idx)))
print('Test data contains {} samples'.format(len(first_test_idx)))
# Each test split holds roughly len(data) / k samples.
expected_test_size = math.ceil(len(data)/k)
print('Test data samples are aprox obtained from {}'.format(expected_test_size))

There are 4 Folds
Train data contains 1500 samples
Test data contains 500 samples
Test data samples are aprox obtained from 500


In [9]:
# Class balance of the test split of the first fold.
first_test_labels = labels[folds[0][1]]
unique, counts = np.unique(first_test_labels, return_counts=True)
dict(zip(unique, counts))
# -> 250 items of class 0 and 250 of class 1

{0: 250, 1: 250}

In [10]:
# Walk over the folds and show the first three training paths of each one.
for j, fold in enumerate(folds):
    train_idx, val_idx = fold

    print('\nFold ', j)

    X_train_cv, y_train_cv = data[train_idx], labels[train_idx]
    X_valid_cv, y_valid_cv = data[val_idx], labels[val_idx]

    print(X_train_cv[:3])


Fold  0
['/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.0.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.1.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.101.jpg']

Fold  1
['/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.1.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.10.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.100.jpg']

Fold  2
['/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.0.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.10.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.100.jpg']

Fold  3
['/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.0.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.1.jpg'
 '/Users/dfreire/Dropbox/Datasets/small_dataset/train/cats/cat.10.jpg']


The general procedure is as follows:

1. Shuffle the dataset randomly. (done above)
2. Split the dataset into k groups. (done above)
3. For each unique group:
   1. Take the group as a hold-out or test data set.
   2. Take the remaining groups as a training data set.
   3. Fit a model on the training set and evaluate it on the test set.
   4. Retain the evaluation score and discard the model.
4. Summarize the skill of the model using the sample of model evaluation scores.

In [11]:
from keras import Input, optimizers
from keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from keras.models import Model

In [24]:
def get_model():
    """Build and compile the small CNN used as a binary image classifier.

    The network takes 150x150x3 inputs and applies four
    Conv2D(3x3, relu) + MaxPool2D(2, stride 2) stages with 32, 64, 128 and
    128 filters, followed by flatten -> dropout(0.5) -> Dense(512, relu) ->
    Dense(1, sigmoid).

    Returns:
        A Keras ``Model`` compiled with RMSprop (lr=0.001), binary
        cross-entropy loss and the ``acc`` metric.
    """
    inputs = Input(shape=(150,150,3))

    # Convolutional feature extractor; layer names are numbered from 1.
    features = inputs
    for stage, n_filters in enumerate((32, 64, 128, 128), start=1):
        features = Conv2D(filters=n_filters, kernel_size=3, activation='relu',
                          name='conv_{}'.format(stage))(features)
        features = MaxPool2D(pool_size=2, strides=2,
                             name='maxpool_{}'.format(stage))(features)

    # Classifier head.
    features = Flatten(name='flatten')(features)
    features = Dropout(rate=.5, name='dropout')(features)
    features = Dense(units=512, activation='relu', name='Dense1')(features)
    # Single sigmoid unit for the two-class problem. A categorical variant
    # would use Dense(units=2, activation='softmax') together with
    # loss='categorical_crossentropy'.
    output = Dense(units=1, activation='sigmoid', name='output')(features)

    model = Model(inputs, output)
    model.compile(optimizer=optimizers.RMSprop(lr=0.001), loss='binary_crossentropy', metrics=['acc'])

    return model

In [21]:
# Generator settings shared by every fold.
# `batch_size` is the single source of truth: it is passed to both generators
# and it is the value the training loop divides by to get the steps per epoch.
batch_size = 100

# Parameters
train_params = {
    'rescale': 1./255,
    'batch_size': batch_size,
    'n_classes': 2,
    'target_size': (150,150,3),
    'aug_mode': 'ShiftScaleRotate',
    'class_mode': 'binary',  # or 'categorical'
    'shuffle': True}
val_params = {
    # Same rescaling as for training, so the model is validated on inputs in
    # the value range it was trained on.
    # NOTE(review): assumes ImgListDataGen applies `rescale` as a pixel multiplier - confirm.
    'rescale': 1./255,
    'batch_size': batch_size,
    'n_classes': 2,
    'target_size': (150,150,3),
    'aug_mode': None,  # no augmentation on validation data
    'class_mode': 'binary',  # or 'categorical'
    'shuffle': True}

In [None]:
# K-fold cross-validation: train a fresh model on each fold's training split,
# score it on the held-out split and retain only the score.
cv_scores = []
for j, (train_idx, val_idx) in enumerate(folds):

    print('\nFold ',j)
    X_train_cv = data[train_idx]
    y_train_cv = labels[train_idx]
    X_valid_cv = data[val_idx]
    y_valid_cv= labels[val_idx]

    #name_weights = "final_model_fold" + str(j) + "_weights.h5"
    #callbacks = get_callbacks(name_weights = name_weights, patience_lr=10)
    training_gen = ImgListDataGen(img_files = X_train_cv, labels=y_train_cv, **train_params)
    validation_gen = ImgListDataGen(img_files = X_valid_cv, labels=y_valid_cv, **val_params)

    # Step counts must be whole numbers and must be derived from the batch
    # size the generators were actually built with, so one epoch is one pass.
    train_steps = int(math.ceil(len(X_train_cv) / float(train_params['batch_size'])))
    val_steps = int(math.ceil(len(X_valid_cv) / float(val_params['batch_size'])))

    print('Training')
    model = get_model()
    model.fit_generator(
                training_gen,
                steps_per_epoch=train_steps,
                epochs=10,
                shuffle=True,
                verbose=1,
                validation_data = validation_gen,
                validation_steps=val_steps)#,
                #callbacks = callbacks)

    # X_valid_cv holds file paths, not pixel arrays, so it cannot be passed to
    # model.evaluate(); score the fold through the validation generator.
    fold_score = model.evaluate_generator(validation_gen, steps=val_steps)
    print(fold_score)
    cv_scores.append(fold_score)

# Summarize the skill of the model over all folds ([loss, acc] per fold).
if cv_scores:
    print('Mean [loss, acc] over {} folds: {}'.format(len(cv_scores), np.mean(cv_scores, axis=0)))

In [None]:
# Inverse of to_categorical: recover the integer class labels from one-hot vectors

In [24]:
from numpy.random import randint
from numpy import argmax
from keras.utils.np_utils import to_categorical

# Dedicated names instead of `k`/`n`: `k` is the number of folds used by the
# cross-validation cells, and reassigning it here would silently change what
# those cells compute when they are re-run.
n_classes = 8
n_samples = 10
x = randint(0, n_classes, (n_samples,))
print(x)
cat = to_categorical(x, n_classes)
print(cat)
# The position of the 1 in each one-hot row is the original class index.
inv = argmax(cat, axis=1)
print(inv)

[2 1 4 4 2 5 3 1 2 7]
[[0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]
[2 1 4 4 2 5 3 1 2 7]


In [None]:
#Small generator example
from skimage.io import imread
from skimage.transform import resize
import numpy as np
import keras


# Build a new compiled model for this stand-alone generator example.
model = get_model()

# Here, `x_set` is a list of paths to the images
# and `y_set` holds the associated class labels.

class CIFAR10Sequence(keras.utils.Sequence):
    """Keras ``Sequence`` that loads image batches from disk on demand.

    Args:
        x_set: Sliceable collection of image file paths.
        y_set: Class labels aligned with ``x_set``.
        batch_size: Number of samples per batch.
    """

    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches needed to cover ``x_set`` (rounded up)."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Load batch number ``idx``.

        Returns:
            A tuple ``(images, labels)``: the images of the batch, read from
            disk and resized to (150, 150, 3), stacked in one array, and the
            matching labels as an array.
        """
        start = idx * self.batch_size
        stop = start + self.batch_size
        batch_x = self.x[start:stop]
        batch_y = self.y[start:stop]

        # Read and resize every image exactly once per batch.
        images = np.array([resize(imread(file_name), (150, 150, 3)) for file_name in batch_x])

        return images, np.array(batch_y)
    
    
    
# Serve the whole dataset in mini-batches of 10 (any batch size can be chosen).
Xtrain_gen = CIFAR10Sequence(x_set=data, y_set=labels, batch_size=10)
#Xvalidation_gen = detracSequence(X_validation,y_validation,batch_size=512)

model.fit_generator(Xtrain_gen, epochs=10)