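This notebook prepares a dataset of extracted face images for machine learning model training and evaluation. It first groups the images into per-label folders (negative, positive, neutral) using the label code at the end of each file name, then splits the images of every label into training, validation, and test sets (60% / 20% / 20%): it calculates the number of images for each set, creates the `train`, `val`, and `test` directories, and moves the images accordingly.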

In [45]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from tqdm import tqdm
import re

from skimage.feature import hog
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import pickle, shutil, random

# Show the kernel's current working directory as the cell output.
Path.cwd()

PosixPath('/home/jovyan/work/notebooks/internal')

In [55]:
# Root folder of the extracted face images; each immediate sub-directory is a label.
PATH = Path('/home/jovyan/work/output/extracted_faces')
LABELS = [f.name for f in PATH.iterdir() if f.is_dir()]

# Walk the tree once and derive the count from the same listing, so the two
# can never disagree (and the directory is not scanned twice).
IMAGES = list(PATH.rglob('*.jpg'))
NUM_IMAGES = len(IMAGES)

# Group images by the label code at the end of the file name.
# A literal suffix test on the file name replaces the previous unescaped regex,
# where "." matched any character and the match could occur anywhere in the path.
NEG = [str(file) for file in IMAGES if file.name.endswith('_1.1.jpg')]
POS = [str(file) for file in IMAGES if file.name.endswith('_1.2.jpg')]
NEU = [str(file) for file in IMAGES if file.name.endswith('_0.0.jpg')]

['/home/jovyan/work/output/extracted_faces/neutral/START_S001_T1_La1_frame_0_timestamp_27.92_KI_0.0.jpg',
 '/home/jovyan/work/output/extracted_faces/neutral/START_S001_T1_La1_frame_0_timestamp_27.92_MU_0.0.jpg']

In [56]:
# Destination folders (one per label) below the extracted-faces root.
neg_path, pos_path, neu_path = (
    f'/home/jovyan/work/output/extracted_faces/{label_dir}'
    for label_dir in ('negative', 'positive', 'neutral')
)

In [66]:
# Move every neutral image into the neutral label folder.
# NOTE(review): only NEU is moved here; NEG / POS were presumably handled by
# re-running this cell with the other lists — confirm.
neu_dir = Path(neu_path)

# The destination must exist as a directory: otherwise shutil.move() renames each
# file *to* `neu_path` itself and every image overwrites the previous one.
neu_dir.mkdir(parents=True, exist_ok=True)

failed_moves = []
for file in tqdm(NEU):
    src = Path(file)
    if src.parent == neu_dir:
        # Already in place; moving a file onto itself would raise shutil.Error.
        continue
    try:
        shutil.move(str(src), str(neu_dir))
    except OSError as err:
        # shutil.Error is a subclass of OSError. Keep going (best effort) but
        # remember the failure instead of swallowing it silently.
        failed_moves.append((file, err))

if failed_moves:
    print(f"[WARN] {len(failed_moves)} of {len(NEU)} files could not be moved; "
          f"first error: {failed_moves[0][1]}")

100%|██████████| 3043/3043 [00:15<00:00, 192.17it/s]


In [67]:
PATH = Path('/home/jovyan/work/output/extracted_faces')

# Every immediate sub-directory is a label, except the split folders themselves:
# without this filter, re-running the cell after the split would treat
# train / val / test as labels.
LABELS = [f.name for f in PATH.iterdir() if f.is_dir() and f.name not in ('train', 'val', 'test')]

NUM_IMAGES = len(list(PATH.rglob('*.jpg')))

# Share of the images assigned to each set.
TRAIN_PROP = 0.6
VALID_PROP = 0.2
TEST_PROP = 0.2
assert abs(TRAIN_PROP + VALID_PROP + TEST_PROP - 1.0) < 1e-9, "split proportions must sum to 1"

# Dataset-wide counts, rounded half-up, for reporting only: split_and_move()
# recomputes its own counts per label. Test takes the remainder so the three add up.
N_TRAIN = int((NUM_IMAGES * TRAIN_PROP) + 0.5)
N_VALID = int((NUM_IMAGES * VALID_PROP) + 0.5)
N_TEST = NUM_IMAGES - N_TRAIN - N_VALID

print(f"[INFO] Total number of images ... {NUM_IMAGES}")
print(f"[INFO] Number of images used in training ... {N_TRAIN} ({TRAIN_PROP * 100}%)")
print(f"[INFO] Number of images used in validation ... {N_VALID} ({VALID_PROP * 100}%)")
print(f"[INFO] Number of images used in testing ... {N_TEST} ({TEST_PROP * 100}%)")

[INFO] Total number of images ... 23914
[INFO] Number of images used in training ... 14348 (60.0%)
[INFO] Number of images used in validation ...4783 (20.0%)
[INFO] Number of images used in testing ... 4783 (20.0%)


In [68]:
def _split_counts(n_files, train_prop, valid_prop):
    """Return ``(train_n, val_n)`` for ``n_files`` images, rounding each share half-up."""
    train_n = min(n_files, int(n_files * train_prop + 0.5))
    # Clamp so that train + val can never exceed the number of available files.
    val_n = min(n_files - train_n, int(n_files * valid_prop + 0.5))
    return train_n, val_n


def split_and_move(seed=None):
    """Split the images of every label folder into train / val / test folders.

    For each label in ``LABELS`` the ``*.jpg`` files found below ``PATH / label``
    are shuffled and moved to ``PATH / "train" / label``, ``PATH / "val" / label``
    and ``PATH / "test" / label``. The train and val shares are taken from
    ``TRAIN_PROP`` and ``VALID_PROP`` (both rounded half-up); all remaining files
    go to test. The emptied label folder is removed afterwards.

    Args:
        seed: Optional seed for the shuffle. When given, files are shuffled with a
            dedicated ``random.Random(seed)`` so the split is reproducible; when
            ``None`` (default) the module-level ``random.shuffle`` is used.

    Raises:
        shutil.Error: propagated from ``shutil.move`` when a file of the same name
            already exists in the destination folder.
    """
    split_names = ("train", "val", "test")

    # Never treat the split folders as labels: their content would be moved into
    # e.g. "train/train" and then deleted together with the "label" folder.
    labels = [label for label in LABELS if label not in split_names]

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    print(f"[INFO] Splitting files in train - test - val sets for each of {len(labels)} labels ...")
    for label in tqdm(labels):

        # create paths for each label
        folder_path = PATH / label
        destinations = {name: PATH / name / label for name in split_names}

        # create the directories eg. "train / happy"
        for destination in destinations.values():
            destination.mkdir(parents=True, exist_ok=True)

        # get all the jpgs in the label folder; sort first so that a seeded
        # shuffle does not depend on the file-system listing order
        files = sorted(folder_path.rglob('*.jpg'))
        shuffle(files)

        train_n, val_n = _split_counts(len(files), TRAIN_PROP, VALID_PROP)

        for file_idx, file in enumerate(files):
            if file_idx < train_n:
                destination = destinations["train"]
            elif file_idx < train_n + val_n:
                destination = destinations["val"]
            else:
                destination = destinations["test"]
            shutil.move(str(file), str(destination))

        if folder_path.is_dir():
            # Only delete the label folder when nothing but (empty) directories
            # is left; files that did not match *.jpg must not be lost.
            leftovers = [p for p in folder_path.rglob('*') if p.is_file()]
            if leftovers:
                print(f"[WARN] Keeping {folder_path}: {len(leftovers)} file(s) were not moved")
            else:
                shutil.rmtree(folder_path)

    print("[INFO] DONE ...")


In [69]:
# Run the split. This moves files on disk, so it cannot simply be repeated
# without first restoring the original label folders.
split_and_move()

[INFO] Splitting files in train - test - val sets for each of 3 labels ...


100%|██████████| 3/3 [03:37<00:00, 72.54s/it] 

[INFO] DONE ...



