In [1]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
import cv2
import os

In [2]:
# Walk the dataset tree bottom-up, collecting the path of every BMP file and
# the name of the folder that directly contains it (used as the category).
imagepaths = []
categories_set = set()
for root, dirs, files in os.walk("dataset UFMA", topdown=False):
    for name in files:
        path = os.path.join(root, name)
        if not path.upper().endswith("BMP"):
            continue  # not an image, ignore it
        imagepaths.append(path)
        categories_set.add(os.path.basename(root))

# Folder names are strings, so the ordering is lexicographic
# ('1', '10', '11', ..., '2', ...), not numeric.
categories_list = sorted(categories_set)

# Map each category name to its position in the sorted list.
categories = {category: index for index, category in enumerate(categories_list)}
print(categories)

print(len(imagepaths))

{'1': 0, '10': 1, '11': 2, '12': 3, '13': 4, '14': 5, '15': 6, '16': 7, '17': 8, '18': 9, '19': 10, '2': 11, '20': 12, '21': 13, '22': 14, '23': 15, '24': 16, '25': 17, '26': 18, '27': 19, '28': 20, '29': 21, '3': 22, '30': 23, '31': 24, '32': 25, '33': 26, '34': 27, '35': 28, '36': 29, '37': 30, '38': 31, '39': 32, '4': 33, '40': 34, '41': 35, '42': 36, '43': 37, '44': 38, '45': 39, '46': 40, '47': 41, '48': 42, '49': 43, '5': 44, '50': 45, '51': 46, '52': 47, '53': 48, '54': 49, '55': 50, '56': 51, '57': 52, '58': 53, '59': 54, '6': 55, '60': 56, '61': 57, '7': 58, '8': 59, '9': 60}
12200


In [3]:
# Load every image as a fixed-size grayscale array and derive, from its path,
# the class label and the name of the split folder it came from.
# The path handling below expects the layout <...>/<split>/<category>/<file>.

# cv2.resize takes (width, height), while the resulting array is indexed
# (rows, cols) == (height, width); naming both keeps the two calls in sync.
IMG_WIDTH, IMG_HEIGHT = 185, 180

X = []
y = []
train_or_test = []
for path in imagepaths:
    img = cv2.imread(path)  # BGR np.array, or None when the file cannot be read
    if img is None:
        # Fail with the offending path instead of an opaque cvtColor assertion.
        raise OSError(f"Could not read image (missing or corrupt file): {path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Convert to the correct colorspace (GRAY)
    img = cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))  # Reduce image size so training can be faster
    X.append(img)

    # Parent folder -> category name; grandparent folder -> split folder name
    path2, category = os.path.split(os.path.split(path)[0])
    train_or_test.append(os.path.split(path2)[1])
    label = categories[category]
    y.append(label)

# Stack into contiguous arrays; the trailing axis of size 1 is the single
# grayscale channel.
X = np.array(X, dtype="uint8")
X = X.reshape(len(imagepaths), IMG_HEIGHT, IMG_WIDTH, 1)
y = np.array(y)
X.shape, y.shape

((12200, 180, 185, 1), (12200,))

In [5]:
# Positions (indices into train_or_test) of the samples whose split folder
# is 'grpotrain' and 'grpotest' respectively.
tr_or_te = np.array(train_or_test)
train = np.where(tr_or_te == 'grpotrain')[0]
test = np.where(tr_or_te == 'grpotest')[0]
train, test

(array([ 6100,  6101,  6102, ..., 12197, 12198, 12199], dtype=int64),
 array([   0,    1,    2, ..., 6097, 6098, 6099], dtype=int64))

In [6]:
# Store the folder-based train/test index arrays as fold 00.
# np.save does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("UFMA_K_Fold", exist_ok=True)
i = 0
np.save(f"UFMA_K_Fold/train_UFMA_{i:02d}.npy", train)
np.save(f"UFMA_K_Fold/test_UFMA_{i:02d}.npy", test)

In [4]:
# Persist the image array and the label array so they can be reloaded
# without reading the image files again.
for filename, array in (("raw_X_UFMA3.npy", X), ("raw_y_UFMA3.npy", y)):
    np.save(filename, array)

In [8]:
# Byte-for-byte comparison of two saved versions of the X and y arrays.
# The second label file is "raw_y_UFMA2.npy": comparing "raw_y_UFMA.npy"
# against itself would always report True and verify nothing.
# NOTE(review): assumes raw_y_UFMA2.npy was saved alongside raw_X_UFMA2.npy - confirm.
# The handle names are kept so any later .close() calls on them stay valid
# (closing an already-closed file is a no-op).
with open("raw_X_UFMA.npy", 'rb') as fp_x, \
        open("raw_y_UFMA.npy", 'rb') as fp_y, \
        open("raw_X_UFMA2.npy", 'rb') as fp_x2, \
        open("raw_y_UFMA2.npy", 'rb') as fp_y2:
    same_X = fp_x.read() == fp_x2.read()
    same_y = fp_y.read() == fp_y2.read()
same_X, same_y

(False, True)

In [9]:
# Release the four file handles used for the comparison.
for handle in (fp_x, fp_x2, fp_y, fp_y2):
    handle.close()

In [8]:
# Fixed seed, passed as random_state to the shuffled StratifiedKFold so the
# generated folds are the same on every run.
seed = 1234

In [9]:
# 10-fold stratified splitter; samples are shuffled before splitting, with
# the shuffle made deterministic by random_state=seed.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [10]:
# Write the train/test index arrays of each stratified fold to disk.
# Numbering starts at 1, so the files are train/test_UFMA_01 ... _10.
# np.save does not create missing parent directories, so make sure the
# output folder exists before writing.
# Note: the loop rebinds `i`, `train` and `test`; after it finishes they
# hold the values of the last fold.
os.makedirs("UFMA_K_Fold", exist_ok=True)
for i, (train, test) in enumerate(kfold.split(X, y), 1):
    np.save(f"UFMA_K_Fold/train_UFMA_{i:02d}.npy", train)
    np.save(f"UFMA_K_Fold/test_UFMA_{i:02d}.npy", test)