### This function splits the files in the s_dir into train, test, and validation files stored in the dest_dir.
#### s_dir is the full path to the directory containing the files to be split
#### dest_dir is the full path to the destination directory. If it does not exist it is created.
#### train_size is a float between 0.0 and 1.0 giving the fraction of the files to be allocated as training files
#### test_size is a float between 0.0 and 1.0 giving the fraction of the files to be allocated as test files
#### In the dest_dir three sub directories 'train', 'test' and 'valid' are created and used to store the training files,
#### test files and validation files.
#### If these sub directories already exist they are checked for existing content. If content is found a notice is printed
#### to that effect. The user is then prompted to enter 'D' to delete the content, 'Q' to terminate program execution
#### or 'C' to continue. If 'C' is selected the content is not removed; however, files may be overwritten if any existing
#### files have the same file name as the new files being added to the sub directory.
#### Note if the test, train and valid directories exist and have content, and the user elects 'c' to continue
#### sub directories and files from the s_dir are appended to the content of the test, train and valid subdirectories
#### in the dest_dir
#### This function utilizes tqdm and sklearn, which must be installed in your working environment


In [1]:
import os
import shutil
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
def _prepare_split_dir(d_path):
    """Make sure the split directory d_path exists and is ready to receive files.

    If the directory already holds content the user is asked whether to delete
    it ('D'), keep it ('C') or quit ('Q').

    Returns False when the user chose to quit, True otherwise.
    """
    if not os.path.isdir(d_path):
        os.makedirs(d_path)
        return True
    d_list = os.listdir(d_path)
    if not d_list:
        return True
    print('*** WARNING***  there is content in ', d_path)
    while True:
        ans = input(' enter D to delete content, C to continue and keep content or Q to Quit ')
        choice = ans.strip().lower()
        if choice not in ('d', 'c', 'q'):
            print('your response ', ans, ' was not a  D, C or Q, try again')
            continue
        if choice == 'q':
            print ('**** PROCESS TERMINATED BY USER ****')
            return False
        if choice == 'd':
            print(' Removing all files and sub directories in ', d_path)
            for name in d_list:
                path = os.path.join(d_path, name)
                # rmtree refuses symlinks, so links to directories are unlinked instead
                if os.path.isdir(path) and not os.path.islink(path):
                    shutil.rmtree(path)
                else:
                    os.remove(path)
        return True


def _split_names(names, first_fraction):
    """Split names into (first, rest), putting first_fraction of them in first.

    train_test_split raises ValueError for a fraction of 0 or 1 and whenever
    the first part would come out empty, so those cases are resolved here
    without calling it. Everything else goes through train_test_split with the
    fixed seed 123.
    """
    names = list(names)
    # >= rather than == because test_size / (1 - train_size) can land a hair
    # above 1.0 through float rounding (e.g. train .9, test .1).
    if first_fraction >= 1:
        return names, []
    # Same floor rule sklearn uses for the size of the first part; if it would
    # be empty (fraction of 0, or too few names) everything goes to the rest.
    if int(first_fraction * len(names)) == 0:
        return [], names
    first, rest = train_test_split(names, train_size=first_fraction, random_state=123)
    return first, rest


def tr_te_val_split(s_dir, dest_dir, train_size, test_size):
    """Copy the class folders in s_dir into train/test/valid splits under dest_dir.

    Each sub directory of s_dir is treated as one class. Its files are split
    with train_test_split (seed 123) and copied to
    dest_dir/train/<class>, dest_dir/test/<class> and dest_dir/valid/<class>.
    The source files are copied, never moved.

    Parameters:
        s_dir: directory whose sub directories are the classes to split.
        dest_dir: destination directory; created (with parents) if missing.
        train_size: fraction (0.0 - 1.0) of each class copied to 'train'.
        test_size: fraction (0.0 - 1.0) of each class copied to 'test'.
            Whatever is left after train and test goes to 'valid'.

    Returns:
        None. Problems with the arguments are reported with a printed message
        followed by an early return, and the user can abort at the prompt
        shown when a split directory already has content.

    Notes:
        A class with too few files to give a split at least one file gets no
        files in that split; the files fall through to the next split.
        The final counts are taken from what is on disk under dest_dir, so
        they include content kept from an earlier run.
    """
    if train_size <0 or train_size >1:
        print('*** Train size must be a float between 0.0 and 1.0, process terminated ***')
        return
    if test_size <0 or test_size >1:
        print('*** Test size must be a float between 0.0 and 1.0, process terminated ***')
        return
    if test_size + train_size >1:
        print ('*** The sum of the train size plus the test size must be <= 1, process terminating ***')
        return
    # Check the source before touching the destination so a bad path does not
    # leave a half-built directory tree behind.
    if not os.path.isdir(s_dir):
        print('*** The source directory ', s_dir, ' does not exist, process terminated ***')
        return

    split_names = ('train', 'test', 'valid')
    remainder = 1 - train_size  # fraction available for test and validation
    # Re-express test_size as a fraction of what is left after train is removed;
    # with nothing left there is nothing to give to test.
    test_fraction = test_size / remainder if remainder > 0 else 0.0

    if not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)
        print ('The dest_dir you specified ', dest_dir, ' does not exist, created it for you ')
    for d in split_names:
        if not _prepare_split_dir(os.path.join(dest_dir, d)):
            return

    # Sorted so that the seeded split does not depend on the order in which
    # the file system happens to list entries. Stray files next to the class
    # folders (Thumbs.db, .DS_Store, ...) are ignored.
    class_list = sorted(
        k for k in os.listdir(s_dir) if os.path.isdir(os.path.join(s_dir, k))
    )
    for klass in tqdm(class_list):
        klass_path = os.path.join(s_dir, klass)
        f_list = sorted(
            f for f in os.listdir(klass_path)
            if os.path.isfile(os.path.join(klass_path, f))
        )
        ftrain, ftv = _split_names(f_list, train_size)
        ftest, fvalid = _split_names(ftv, test_fraction)
        for d, fx in zip(split_names, (ftrain, ftest, fvalid)):
            d_class_path = os.path.join(dest_dir, d, klass)
            os.makedirs(d_class_path, exist_ok=True)
            for f in fx:
                shutil.copy(os.path.join(klass_path, f), os.path.join(d_class_path, f))

    # Every split gets a count, even when it holds no class folders at all.
    counts = {}
    for d in split_names:
        d_path = os.path.join(dest_dir, d)
        counts[d] = sum(
            len(os.listdir(os.path.join(d_path, klass)))
            for klass in os.listdir(d_path)
            if os.path.isdir(os.path.join(d_path, klass))
        )
    print ('Process Completed ', counts['train'], ' training files ', counts['test'], ' test files and ', counts['valid'], ' validation files were partitioned')
    
        

In [4]:
# Example run: split every class folder found in source_dir into the train,
# test and valid sub directories of destination_dir.
# NOTE(review): inside a raw string '\\' is two literal backslashes, so this
# path has a doubled separator after 'Temp' - probably a typo; confirm the
# intended path.
source_dir=r'C:\Temp\\people\test'
destination_dir=r'C:\Temp\natural'
train_percent=.8  # passed as train_size: a fraction between 0.0 and 1.0, not a percentage
test_percent=.10  # passed as test_size: a fraction between 0.0 and 1.0, not a percentage
tr_te_val_split(source_dir, destination_dir, train_percent, test_percent)

 enter D to delete content, C to continue and keep content or Q to Quit c
 enter D to delete content, C to continue and keep content or Q to Quit c
 enter D to delete content, C to continue and keep content or Q to Quit c


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.35s/it]

Process Completed  5996  training files  750  test files and  753  validation files were partitioned



