### Import packages

In [1]:
import numpy as np
import cv2
import os
import random
import pickle
from sklearn.model_selection import KFold

# CSV read by get_driver_dict(): first line is a header; in each following row
# column 0 is the driver id and column 2 the image file name
# (column 1 is presumably the class label -- TODO confirm against the file).
PATH_DRIVERS = "driver_imgs_list.csv"
# Names of the per-class sub-directories; a category's position in this list
# is used as its integer label by load_images().
CATEGORIES = ["c0","c1","c2","c3","c4","c5","c6","c7","c8","c9"]
# Root directory that contains one sub-directory per category.
DATA_DIR = "imgs/train"
# When True, read_images() reuses an existing pickle cache instead of
# re-reading every image file from DATA_DIR.
LOAD_FROM_CACHE = True
# Seed NumPy's global random generator.
np.random.seed(2020)
# Seed handed to KFold in create_k_folds() so the driver split is repeatable.
random_state = 2020

### Definition of functions

In [2]:
def read_image(path, img_rows, img_cols, color_type=3):
    """Load an image file and resize it to ``img_rows`` x ``img_cols`` pixels.

    Args:
        path: Path of the image file to read.
        img_rows: Height of the returned image in pixels.
        img_cols: Width of the returned image in pixels.
        color_type: ``1`` reads the image as grayscale; any other value reads
            it as a colour image.

    Returns:
        The resized image as returned by ``cv2.resize``.

    Raises:
        IOError: If OpenCV cannot read or decode ``path``.
    """
    flag = cv2.IMREAD_GRAYSCALE if color_type == 1 else cv2.IMREAD_COLOR
    image = cv2.imread(path, flag)

    # cv2.imread reports failure by returning None instead of raising; without
    # this check the error would only surface as an opaque cv2.error in resize.
    if image is None:
        raise IOError("Could not read image: {}".format(path))

    # cv2.resize expects the target size as (width, height).
    return cv2.resize(image, (img_cols, img_rows))

def get_driver_dict(path=None):
    """Parse the driver listing CSV into two lookup tables.

    The first line of the file is treated as a header and skipped. Each
    remaining non-blank line is split on commas; column 0 is used as the
    driver id and column 2 as the image file name. Column 1 is stored
    alongside the image name.

    Args:
        path: CSV file to read. Defaults to the module-level ``PATH_DRIVERS``.

    Returns:
        A tuple ``(image_driver, driver_dict)`` where ``image_driver`` maps an
        image file name to its driver id and ``driver_dict`` maps a driver id
        to a list of ``(column 1, image file name)`` tuples in file order.

    Raises:
        IndexError: If a non-blank data line has fewer than three fields.
    """
    if path is None:
        path = PATH_DRIVERS

    image_driver = dict()
    driver_dict = dict()

    # The context manager closes the file even if a malformed row raises.
    with open(path, 'r') as file:
        file.readline()  # skip the header line
        for line in file:
            row = line.strip()
            # Blank lines (e.g. a trailing newline at EOF) carry no record.
            if not row:
                continue
            array = row.split(',')
            image_driver[array[2]] = array[0]
            driver_dict.setdefault(array[0], []).append((array[1], array[2]))

    return image_driver, driver_dict

def load_images(img_rows, img_cols, color_type=1):
    """Read every listed training image and collect it with its metadata.

    Walks ``DATA_DIR/<category>`` for each entry of ``CATEGORIES``. A file is
    kept only if it has a driver entry in the listing returned by
    ``get_driver_dict()`` and can be read by ``read_image``; other files are
    skipped with a printed warning.

    Note that ``color_type`` defaults to 1 (grayscale) here, unlike
    ``read_image`` and ``read_images``; the default is kept for backward
    compatibility.

    Args:
        img_rows: Target image height in pixels.
        img_cols: Target image width in pixels.
        color_type: Passed through to ``read_image``.

    Returns:
        A tuple ``(images, labels, images_id, driver_id, unique_drivers)``.
        The first four are parallel lists (image, integer label, file name,
        driver id); ``unique_drivers`` is the sorted list of distinct driver
        ids that were loaded.
    """
    images = []
    images_id = []
    labels = []
    driver_id = []
    skipped = 0

    image_driver, driver_dict = get_driver_dict()

    for label, category in enumerate(CATEGORIES):
        path = os.path.join(DATA_DIR, category)

        print("Load images from directory: " + str(category))
        # os.listdir returns entries in arbitrary order; sort so the result
        # (and therefore the cached data) is the same on every machine.
        for img in sorted(os.listdir(path)):
            img_path = os.path.join(path, img)

            # Do everything that can fail BEFORE touching the result lists, so
            # a failure can never leave the four parallel lists with different
            # lengths (which would silently misalign images and drivers).
            try:
                driver = image_driver[img]
                image = read_image(img_path, img_rows, img_cols, color_type)
            except Exception as e:
                skipped += 1
                print("Skipping {}: {!r}".format(img_path, e))
                continue

            images.append(image)
            labels.append(label)
            images_id.append(img)
            driver_id.append(driver)

    if skipped:
        print("Skipped files: {}".format(skipped))

    unique_drivers = sorted(list(set(driver_id)))
    print('Unique drivers: {}'.format(len(unique_drivers)))
    print(unique_drivers)

    return images, labels, images_id, driver_id, unique_drivers

def cache_data(data, path):
    """Pickle ``data`` to ``path``, creating parent directories as needed.

    Args:
        data: Any picklable object.
        path: Destination file. Missing parent directories (including nested
            ones) are created; a bare file name is written to the current
            working directory.

    Raises:
        OSError: If the parent directory cannot be created or the file cannot
            be opened (for example because the parent path exists but is not
            a directory). Such a case is reported instead of silently
            skipping the write.
    """
    directory = os.path.dirname(path)

    # dirname is '' for a bare file name; makedirs('') would raise.
    if directory:
        # makedirs handles nested paths such as 'cache/directory_128'.
        os.makedirs(directory, exist_ok=True)

    # The context manager closes the file even if pickling fails.
    with open(path, 'wb') as file:
        pickle.dump(data, file)


def restore_data(path):
    """Load a pickled object previously written to ``path``.

    Args:
        path: File to read.

    Returns:
        The unpickled object, or an empty ``dict`` if ``path`` is not an
        existing file.
    """
    if not os.path.isfile(path):
        return dict()

    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # restore cache files that this project wrote itself.
    # The context manager makes sure the handle is closed after loading.
    with open(path, 'rb') as file:
        return pickle.load(file)

def select_drivers(images, labels, driver_id, selected_driver):
    """Pick the samples that belong to the given drivers.

    ``images``, ``labels`` and ``driver_id`` are parallel sequences. A sample
    is kept when its entry in ``driver_id`` is contained in
    ``selected_driver``; the original order is preserved.

    Returns:
        Three NumPy arrays: the kept images, the kept labels, and the
        positions of the kept samples in the input sequences.
    """
    # Positions of all samples whose driver is among the requested ones.
    kept_positions = [
        pos for pos in range(len(driver_id))
        if driver_id[pos] in selected_driver
    ]

    kept_images = [images[pos] for pos in kept_positions]
    kept_labels = [labels[pos] for pos in kept_positions]

    return np.array(kept_images), np.array(kept_labels), np.array(kept_positions)

def read_images(img_rows, img_cols, color_type=3):
    """Return the image data set, using the on-disk cache when permitted.

    The cache file lives in the ``cache`` directory and is named after the
    requested size and colour type. It is used when it exists and
    ``LOAD_FROM_CACHE`` is not False; otherwise the images are loaded with
    ``load_images`` and the cache file is (re)written.

    Returns:
        The tuple ``(images, labels, images_id, driver_id, unique_drivers)``.
    """
    cache_name = 'images_{}x{}x{}.dat'.format(img_rows, img_cols, color_type)
    cache_path = os.path.join('cache', cache_name)

    cache_usable = os.path.isfile(cache_path) and not (LOAD_FROM_CACHE == False)

    if cache_usable:
        print("Restore images from cache")
        restored = restore_data(cache_path)
        images, labels, images_id, driver_id, unique_drivers = restored
        return images, labels, images_id, driver_id, unique_drivers

    # No usable cache: read the image files and store the result for next time.
    loaded = load_images(img_rows, img_cols, color_type)
    images, labels, images_id, driver_id, unique_drivers = loaded
    cache_data((images, labels, images_id, driver_id, unique_drivers), cache_path)
    return images, labels, images_id, driver_id, unique_drivers



In [3]:
def create_k_folds(nfolds=5, img_rows=128, img_cols=128, color_type=3):
    """Split the data set into driver-wise folds and cache each fold.

    The split is done over the unique driver ids, not over individual images,
    so no driver contributes images to both the training and the validation
    part of the same fold.

    Each fold is pickled as ``(X_train, Y_train, X_val, Y_val)`` to
    ``cache/directory_<img_rows>/k_<fold number>.dat``. With the default
    ``img_rows=128`` this is ``cache/directory_128``; other sizes get their
    own directory instead of overwriting the 128-pixel folds.

    Args:
        nfolds: Number of folds.
        img_rows: Image height in pixels, passed to ``read_images``.
        img_cols: Image width in pixels, passed to ``read_images``.
        color_type: Colour mode, passed to ``read_images``.
    """
    images, labels, images_id, driver_id, unique_drivers = read_images(img_rows, img_cols, color_type)
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=random_state)
    fold_dir = os.path.join('cache', 'directory_' + str(img_rows))

    # KFold yields index arrays into unique_drivers; fold numbers start at 1.
    for num_fold, (train_drivers, val_drivers) in enumerate(kf.split(unique_drivers), start=1):

        unique_train_drivers = [unique_drivers[i] for i in train_drivers]
        X_train, Y_train, train_index = select_drivers(images, labels, driver_id, unique_train_drivers)

        unique_val_drivers = [unique_drivers[i] for i in val_drivers]
        X_val, Y_val, val_index = select_drivers(images, labels, driver_id, unique_val_drivers)

        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train))
        print('Split valid: ', len(X_val))
        print('Train drivers: ', unique_train_drivers)
        print('Test drivers: ', unique_val_drivers)

        cache_path = os.path.join(fold_dir, 'k_' + str(num_fold) + '.dat')
        cache_data((X_train, Y_train, X_val, Y_val), cache_path)


### Creation of 5 folds (each driver appears once in the validation set and four times in the training set)

In [4]:
# Build and cache 5 driver-wise folds using the function defaults
# (128x128 images, color_type=3).
create_k_folds(5)

Load images from directory: c0
Load images from directory: c1
Load images from directory: c2
Load images from directory: c3
Load images from directory: c4
Load images from directory: c5
Load images from directory: c6
Load images from directory: c7
Load images from directory: c8
Load images from directory: c9
Unique drivers: 26
['p002', 'p012', 'p014', 'p015', 'p016', 'p021', 'p022', 'p024', 'p026', 'p035', 'p039', 'p041', 'p042', 'p045', 'p047', 'p049', 'p050', 'p051', 'p052', 'p056', 'p061', 'p064', 'p066', 'p072', 'p075', 'p081']
Start KFold number 1 from 5
Split train:  17721
Split valid:  4703
Train drivers:  ['p002', 'p014', 'p015', 'p016', 'p021', 'p022', 'p024', 'p026', 'p035', 'p039', 'p041', 'p047', 'p050', 'p051', 'p056', 'p061', 'p064', 'p066', 'p072', 'p081']
Test drivers:  ['p012', 'p042', 'p045', 'p049', 'p052', 'p075']
Start KFold number 2 from 5
Split train:  17121
Split valid:  5303
Train drivers:  ['p002', 'p012', 'p014', 'p015', 'p024', 'p026', 'p035', 'p039', 'p041'