In [40]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
import os
from sklearn import preprocessing
import warnings
from DataGenerator import ImgListDataGen
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

### First, it is important to understand the differences between ShuffleSplit and KFold

In [49]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import datasets

# Toy problem: ten sample ids with two balanced classes (five 0s, five 1s).
splits = 5
tx = range(10)
ty = [0] * 5 + [1] * 5

# Same number of splits and the same seed for both strategies, so that the
# printed index sets differ only because of the splitting strategy itself.
kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
shufflesplit = StratifiedShuffleSplit(n_splits=splits, random_state=42, test_size=2)

# Print the train/test indices produced by each splitter under its title.
for title, splitter in (("KFold", kfold), ("Shuffle Split", shufflesplit)):
    print(title)
    for train_index, test_index in splitter.split(tx, ty):
        print("TRAIN:", train_index, "TEST:", test_index)

KFold
TRAIN: [0 2 3 4 5 7 8 9] TEST: [1 6]
TRAIN: [0 1 2 3 5 6 7 8] TEST: [4 9]
TRAIN: [0 1 3 4 5 6 8 9] TEST: [2 7]
TRAIN: [1 2 3 4 6 7 8 9] TEST: [0 5]
TRAIN: [0 1 2 4 5 6 7 9] TEST: [3 8]
Shuffle Split
TRAIN: [8 4 1 0 6 5 7 2] TEST: [3 9]
TRAIN: [7 0 3 9 4 5 1 6] TEST: [8 2]
TRAIN: [1 2 5 6 4 8 9 0] TEST: [3 7]
TRAIN: [4 6 7 8 3 5 1 2] TEST: [9 0]
TRAIN: [7 2 6 5 4 3 0 9] TEST: [1 8]


In [None]:
# In KFold, the test sets never overlap, even with shuffle=True.
# With KFold and shuffle=True, the data is shuffled once at the start and then divided into the desired number of splits.
# Each split serves as the test set exactly once; the remaining splits form the training set.

# In ShuffleSplit, the data is reshuffled for every split and then divided into train and test.
# This means the test sets may overlap between splits. In the output above:
#     3 is in the test set of rows 1 and 3, and 9 is in the test set of rows 1 and 4.

In [None]:
# Thus, in ShuffleSplit the test_size can be specified: for instance, .2 leaves 1 - .2 = .8 of the data for training.
# In KFold, by contrast, the test size is determined by the size of the data and K (the test sets must not overlap): if the data has length 10
# and K is 5, then the test size is 10/5 = 2, so that the test sets never overlap.

### Second, let's create the K-Fold cross validation

In [41]:
# Root folder of the training images.
# The location can be overridden with the SMALL_DATASET_TRAIN_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original hard-coded path is used.
DB_Path = os.environ.get(
    'SMALL_DATASET_TRAIN_DIR',
    '/Users/dfreire/Dropbox/Datasets/small_dataset/train/',
)

In [42]:
def Load_Data_Fold(DB_path, k=2):
    """Collect image paths and labels from a class-per-folder dataset and split them into stratified folds.

    Every sub-directory of ``DB_path`` is treated as one class, and every entry
    inside that sub-directory as one sample of the class.

    Parameters
    ----------
    DB_path : str
        Directory containing one sub-directory per class.
    k : int, default 2
        Number of folds passed to ``StratifiedKFold`` as ``n_splits``.

    Returns
    -------
    folds : list of (train_idx, test_idx) tuples
        Index arrays produced by ``StratifiedKFold(shuffle=True, random_state=1)``.
    data : numpy.ndarray of str
        Sample paths. Returned as an array (not a list) so that it can be
        indexed directly with the index arrays in ``folds``.
    labels : numpy.ndarray of float
        Numeric class id of each sample, aligned with ``data``.

    Raises
    ------
    ValueError
        If ``DB_path`` contains no class sub-directories.
    """
    # List the class folders once and sort them: os.listdir() returns entries
    # in arbitrary order, which would make the sample order (and therefore the
    # content of each fold) differ between machines. Non-directory entries
    # (e.g. stray metadata files) are skipped because they cannot hold samples.
    class_names = sorted(
        entry for entry in os.listdir(DB_path)
        if os.path.isdir(os.path.join(DB_path, entry))
    )
    if not class_names:
        raise ValueError("No class sub-directories found in %r" % (DB_path,))

    # Numeric labels
    encoder = preprocessing.LabelEncoder()
    class_dict = dict(zip(class_names, encoder.fit_transform(class_names)))
    #inv_class_dict = {k: v for v,k in class_dict.items()} to decode labels if necessary

    # Read each folder
    data = []
    label_chunks = []
    for class_ in class_names:
        class_dir = os.path.join(DB_path, class_)
        # Sorted for the same reproducibility reason as the class folders.
        samples = [os.path.join(class_dir, img) for img in sorted(os.listdir(class_dir))]
        data.extend(samples)
        # Float labels, matching the dtype this function has always returned.
        label_chunks.append(np.ones(len(samples)) * class_dict[class_])

    data = np.array(data)
    labels = np.concatenate(label_chunks)
    folds = list(StratifiedKFold(n_splits=k, shuffle=True, random_state=1).split(data, labels))

    return folds, data, labels

In [43]:
# Build 4 stratified folds over the dataset rooted at DB_Path.
folds, data, labels = Load_Data_Fold(DB_path=DB_Path, k=4)

In [44]:
# Number of folds returned (requested with k=4 above).
len(folds)

4

In [45]:
# Each fold is a pair: (train indices, validation indices).
len(folds[0])

2

In [46]:
# Number of training indices in the first fold.
len(folds[0][0])

1500

In [47]:
# Number of held-out (validation) indices in the first fold.
len(folds[0][1])

500

In [48]:
# Sanity check: 2000 samples (1500 train + 500 held out) over 4 folds gives the held-out size per fold.
2000/4

500.0

In [76]:
# Small demo of how per-class label vectors are joined with np.concatenate.
# `a` and `b` are defined here so the cell does not depend on leftover kernel
# state; the values reproduce the output recorded for this cell.
a = np.ones(5) * 2
b = np.ones(5) * 3
s = np.concatenate((a,b))

In [77]:
# Display the concatenated array.
s

array([2., 2., 2., 2., 2., 3., 3., 3., 3., 3.])

In [None]:
# Train and evaluate one model per fold.
# NOTE(review): `gen`, `batch_size`, `get_model` and `get_callbacks` are not
# defined in the visible part of this notebook; they must exist before this
# cell is run.

# The fold indices are NumPy arrays, which cannot subscript a plain Python
# list, so make sure both containers are arrays before indexing with them.
data_arr = np.asarray(data)
labels_arr = np.asarray(labels)

for j, (train_idx, val_idx) in enumerate(folds):
    
    print('\nFold ',j)
    X_train_cv = data_arr[train_idx]
    y_train_cv = labels_arr[train_idx]
    X_valid_cv = data_arr[val_idx]
    y_valid_cv= labels_arr[val_idx]
    
    # One weights file per fold so that folds do not overwrite each other.
    name_weights = "final_model_fold" + str(j) + "_weights.h5"
    callbacks = get_callbacks(name_weights = name_weights, patience_lr=10)
    generator = gen.flow(X_train_cv, y_train_cv, batch_size = batch_size)
    # A fresh model per fold, so no weights leak between folds.
    model = get_model()
    # NOTE(review): X_valid_cv holds whatever `data` holds. If those are file
    # paths (as Load_Data_Fold builds them), validation_data and evaluate()
    # below presumably need loaded image arrays or a generator -- confirm.
    model.fit_generator(
                generator,
                # Whole number of batches, rounded up so the last partial batch is included.
                steps_per_epoch=int(np.ceil(len(X_train_cv) / batch_size)),
                epochs=15,
                shuffle=True,
                verbose=1,
                validation_data = (X_valid_cv, y_valid_cv),
                callbacks = callbacks)
    
    print(model.evaluate(X_valid_cv, y_valid_cv))

In [None]:
# Inverse of to_categorical: recover the integer class labels from one-hot rows

In [24]:
from numpy.random import randint
from numpy import argmax
from keras.utils.np_utils import to_categorical
# Round trip: integer labels -> one-hot rows -> integer labels.
k = 8   # number of classes
n = 10  # number of samples
x = randint(0, k, (n,))
print(x)
cat = to_categorical(x, k)
print(cat)
# The position of the 1 in each one-hot row is the original label, so a single
# argmax along the class axis undoes the encoding for all rows at once.
inv = argmax(cat, axis=1)
print(inv)
# The decoded labels must match the labels we started from.
assert np.array_equal(inv, x)

[2 1 4 4 2 5 3 1 2 7]
[[0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]
[2 1 4 4 2 5 3 1 2 7]


In [30]:
# Map each class-folder name to the integer id assigned by LabelEncoder.
# The directory is listed once (and sorted) so that fitting and transforming
# see exactly the same names; entries that are not directories are skipped
# because only sub-folders represent classes.
class_names = sorted(
    entry for entry in os.listdir(DB_Path)
    if os.path.isdir(os.path.join(DB_Path, entry))
)
lab = preprocessing.LabelEncoder()
labels = lab.fit_transform(class_names)
inv = lab.inverse_transform(labels)
class_dict = dict(zip(inv, labels))

In [35]:
# Reverse lookup: integer class id -> class name.
inv_dict = {class_id: class_name for class_name, class_id in class_dict.items()}

In [37]:
# Decode class id 0 back to its class name.
inv_dict[0]

'cats'