In [29]:
import os
import random
import shutil
from typing import Optional, Tuple

In [30]:
class TestTrainDataHandler:
    """Split a folder of raw ``.jpg`` files into random test and train sets.

    The handler draws two non-overlapping random samples from the ``.jpg``
    files found in ``raw_data_path`` and copies them into ``test_data_path``
    and ``train_data_path``. Each destination folder is emptied of its
    existing files before the new ones are copied in.
    """

    def __init__(self, raw_data_path: str, test_data_path: str, train_data_path: str):
        """Store the source folder and the two destination folders.

        Args:
            raw_data_path: Folder that is scanned for ``.jpg`` files.
            test_data_path: Folder that receives the test files.
            train_data_path: Folder that receives the train files.
        """
        self.raw_data_path = raw_data_path
        self.test_data_path = test_data_path
        self.train_data_path = train_data_path

    def generate_test_train_file_lists(
        self, num_test_files: int, num_train_files: int, seed: Optional[int] = None
    ) -> Tuple[list, list]:
        """Pick disjoint random test and train samples from the raw ``.jpg`` files.

        Args:
            num_test_files: Number of files to draw for the test set.
            num_train_files: Number of files to draw for the train set.
            seed: Optional seed for a private random generator. When ``None``
                the module-level ``random`` generator is used, so a prior
                ``random.seed(...)`` call still controls the result.

        Returns:
            ``(test_file_list, train_file_list)``; each entry is the file name
            joined onto ``raw_data_path``. The two lists never share a file.

        Raises:
            ValueError: If a count is negative or the two counts together
                exceed the number of available ``.jpg`` files.
        """
        # os.listdir returns entries in arbitrary order; sorting makes a seeded
        # split independent of the file system's listing order.
        file_list = sorted(
            os.path.join(self.raw_data_path, file)
            for file in os.listdir(self.raw_data_path)
            if file.endswith(".jpg")
        )
        print(f"Length of raw file list is {len(file_list)}")

        if num_test_files < 0 or num_train_files < 0:
            raise ValueError("num_test_files and num_train_files must be non-negative")
        if num_test_files + num_train_files > len(file_list):
            raise ValueError(
                f"Requested {num_test_files} test + {num_train_files} train files, "
                f"but only {len(file_list)} .jpg files are available in {self.raw_data_path}"
            )

        rng = random.Random(seed) if seed is not None else random
        test_file_list = rng.sample(file_list, k=num_test_files)
        # Set membership keeps the exclusion step linear instead of quadratic.
        test_file_set = set(test_file_list)
        non_test_file_list = [x for x in file_list if x not in test_file_set]
        train_file_list = rng.sample(non_test_file_list, k=num_train_files)
        return test_file_list, train_file_list

    @staticmethod
    def __copy_files_to_folder(folder_path: str, files_to_copy: list):
        """Empty ``folder_path`` of its files, then copy ``files_to_copy`` into it.

        Args:
            folder_path: Destination folder; created (with parents) if missing.
            files_to_copy: Paths of the files to copy; each keeps its base name.

        Raises:
            ValueError: If any source file already lives in ``folder_path``,
                because the clean-up step would delete it before the copy.
        """
        # makedirs also creates missing parent folders, which os.mkdir does not.
        os.makedirs(folder_path, exist_ok=True)

        # Refuse to clean a folder that holds the very files we are asked to copy.
        destination = os.path.abspath(folder_path)
        for file in files_to_copy:
            if os.path.abspath(os.path.dirname(file)) == destination:
                raise ValueError(
                    f"Refusing to clear {folder_path}: it contains source file {file}"
                )

        for entry in os.listdir(folder_path):
            entry_path = os.path.join(folder_path, entry)
            # os.remove cannot delete a directory (e.g. .ipynb_checkpoints); leave those alone.
            if os.path.isdir(entry_path) and not os.path.islink(entry_path):
                continue
            os.remove(entry_path)
            print(f"Removing existing file {entry}")

        for file in files_to_copy:
            shutil.copyfile(file, os.path.join(folder_path, os.path.basename(file)))
            print(f"{file} -> {folder_path}")

    def copy_test_train_data_files(self, test_file_list: list, train_file_list: list):
        """Copy the train files, then the test files, into their destination folders.

        Args:
            test_file_list: Paths of the files to place in ``test_data_path``.
            train_file_list: Paths of the files to place in ``train_data_path``.

        Raises:
            ValueError: If the test and train folders are the same folder,
                because filling the second would erase the first.
        """
        if os.path.abspath(self.test_data_path) == os.path.abspath(self.train_data_path):
            raise ValueError("test_data_path and train_data_path must be different folders")
        self.__copy_files_to_folder(self.train_data_path, train_file_list)
        self.__copy_files_to_folder(self.test_data_path, test_file_list)

In [31]:
# Wire the handler to the raw, test and train folders that sit under ./data.
# Each split name yields one "<split>_data_path" keyword argument.
handler = TestTrainDataHandler(
    **{
        f"{split}_data_path": os.path.join("data", split)
        for split in ("raw", "test", "train")
    }
)

In [36]:
# Seed the global RNG first: the split is drawn with random.sample, so without a
# fixed seed every Restart & Run All would select different test/train files.
random.seed(42)
test_files, train_files = handler.generate_test_train_file_lists(num_test_files=33, num_train_files=33)

Length of raw file list is 102


In [39]:
# Note: this empties the existing files from both destination folders before copying.
handler.copy_test_train_data_files(
    test_file_list=test_files,
    train_file_list=train_files,
)

data/raw/t6_well1_15.jpg -> data/train
data/raw/t11_well3_15.jpg -> data/train
data/raw/t8_well1_15.jpg -> data/train
data/raw/t6_well3_4.jpg -> data/train
data/raw/t4_well0_10.jpg -> data/train
data/raw/t13_well0_0.jpg -> data/train
data/raw/t3_well0_0.jpg -> data/train
data/raw/t12_well1_0.jpg -> data/train
data/raw/t15_well0_10.jpg -> data/train
data/raw/t11_well0_0.jpg -> data/train
data/raw/t2_well3_15.jpg -> data/train
data/raw/t11_well3_4.jpg -> data/train
data/raw/t1_well0_0.jpg -> data/train
data/raw/t9_well0_10.jpg -> data/train
data/raw/t12_well0_10.jpg -> data/train
data/raw/t16_well0_10.jpg -> data/train
data/raw/t3_well1_0.jpg -> data/train
data/raw/t3_well3_4.jpg -> data/train
data/raw/t12_well1_15.jpg -> data/train
data/raw/t2_well1_0.jpg -> data/train
data/raw/t1_well1_0.jpg -> data/train
data/raw/t16_well1_15.jpg -> data/train
data/raw/t11_well0_10.jpg -> data/train
data/raw/t11_well1_0.jpg -> data/train
data/raw/t8_well3_15.jpg -> data/train
data/raw/t10_well3_15.jpg