In [7]:
import os
import glob
import math
import shutil
import random

In [8]:
def check_makedir(thisDir):
    if not os.path.isdir(thisDir):
        os.mkdir(thisDir)

In [9]:
# define splits
splits = (0.7,0.15,0.15) # (0.7,0.15,0.15)

assert(sum(splits)==1.0)

print(f'Splitting data into train/val/test: {splits}')

# path to all available labels
BasePath = './ManuallyLabelled_v2/' #'./ManuallyLabelled_nocrop/'
TargetPath = './UNET_v6/' #'./UNET_v6/'
TrainDir = 'train1' #'train0'
ValDir = 'val1' #'val0'
TestDir = 'test1' #'test0'

Splitting data into train/val/test: (0.7, 0.15, 0.15)


In [10]:
check_makedir(TargetPath)
check_makedir(os.path.join(TargetPath,TrainDir))
check_makedir(os.path.join(TargetPath,TrainDir,'images'))
check_makedir(os.path.join(TargetPath,TrainDir,'masks'))

check_makedir(os.path.join(TargetPath,ValDir))
check_makedir(os.path.join(TargetPath,ValDir,'images'))
check_makedir(os.path.join(TargetPath,ValDir,'masks'))

check_makedir(os.path.join(TargetPath,TestDir))
check_makedir(os.path.join(TargetPath,TestDir,'images'))
check_makedir(os.path.join(TargetPath,TestDir,'masks'))


print(f'Loading data from: {BasePath}')
print(f'Writing data to: {TargetPath}')
print(f'subdirectories {TrainDir}, {ValDir}, and {TestDir}')

Loading data from: ./ManuallyLabelled_v2/
Writing data to: ./UNET_v6/
subdirectories train1, val1, and test1


In [12]:
ImList = os.listdir(os.path.join(BasePath,'images'))

SampleList = [x[:-8] for x in ImList]
SampleList = set(SampleList) # removes duplicates
SampleList = list(SampleList) # list of samples

print(f'There are {len(ImList)} manually labelled images from {len(SampleList)} samples')

print('Samples:')
print(SampleList)


There are 444 manually labelled images from 14 samples
Samples:
['PCB17KA7drh', 'PCB16KA1dlh', 'PCB5KA14drh', 'PCB17KA7dlh', 'PCB14KA14drh', 'PCB4KA14drh', 'PCB2KA21drh_try3', 'PCB11salrh', 'PCB1KA21drh', 'PCB3KA1dlh', 'PCB3KA21drh', 'PCB4KA14dlh', 'PCB16KA1drh', 'PCB1KA21dlh_offaxis_try3']


In [13]:
# mini example of how this will work
random.seed(5)

a = ['1','2','3','4','5','6','7','8','9','10']
numIms = len(a)
numVal, numTest = math.floor(numIms*splits[1]), math.ceil(numIms*splits[2])
numTrain = numIms - numVal - numTest # ensure that all images used
print(f'Splitting into Train: {numTrain}, Val: {numVal}, and Test: {numTest}')

splitIms = a.copy()
random.shuffle(splitIms)
print(f'a: {a}')
print(f'b: {splitIms}')

trainIms = splitIms[0:numTrain]
print(f'Train: {trainIms}')

valIms = splitIms[numTrain:numTrain+numVal]
print(f'Val: {valIms}')

testIms = splitIms[-numTest:]
print(f'Test: {testIms}')

print(f'Total: {len(trainIms)+len(valIms)+len(testIms)}')

Splitting into Train: 7, Val: 1, and Test: 2
a: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
b: ['3', '4', '2', '1', '9', '8', '7', '6', '5', '10']
Train: ['3', '4', '2', '1', '9', '8', '7']
Val: ['6']
Test: ['5', '10']
Total: 10


In [14]:
random.seed(5)
dry_run = False

totTrain = []
totTest = []
totVal = []
for thisSample in SampleList:
    theseIms = glob.glob(os.path.join(BasePath,'images',thisSample + '*.png'))
    
    numIms = len(theseIms)
    print(f'Sample {thisSample} has {numIms} images')
    
    
    numVal, numTest = math.floor(numIms*splits[1]), math.ceil(numIms*splits[2])
    numTrain = numIms - numVal - numTest # ensure that all images used
    
    print(f'Splitting into Train: {numTrain}, Val: {numVal}, and Test: {numTest}')
    
    splitIms = theseIms.copy()
    random.shuffle(splitIms)
    
    trainIms = splitIms[0:numTrain]
    valIms = splitIms[numTrain:numTrain+numVal]
    testIms = splitIms[-numTest:]
    #print(f'total ims: {len(trainIms)+len(valIms)+len(testIms)}')
    totTrain.append(len(trainIms))
    totTest.append(len(testIms))
    totVal.append(len(valIms))
    
    for thisImage in trainIms:
        imName = os.path.split(thisImage)[1]
        src2 = os.path.join(BasePath,'masks',imName)
        dst1 = os.path.join(TargetPath,TrainDir,'images',imName)
        dst2 = os.path.join(TargetPath,TrainDir,'masks',imName)
        
        if not dry_run:
            shutil.copyfile(thisImage,dst1)
            shutil.copyfile(src2,dst2)
    
    for thisImage in valIms:
        imName = os.path.split(thisImage)[1]
        src2 = os.path.join(BasePath,'masks',imName)
        dst1 = os.path.join(TargetPath,ValDir,'images',imName)
        dst2 = os.path.join(TargetPath,ValDir,'masks',imName)
        
        if not dry_run:
            shutil.copyfile(thisImage,dst1)
            shutil.copyfile(src2,dst2)
    
    for thisImage in testIms:
        imName = os.path.split(thisImage)[1]
        src2 = os.path.join(BasePath,'masks',imName)
        dst1 = os.path.join(TargetPath,TestDir,'images',imName)
        dst2 = os.path.join(TargetPath,TestDir,'masks',imName)
        
        if not dry_run:
            shutil.copyfile(thisImage,dst1)
            shutil.copyfile(src2,dst2)

Sample PCB17KA7drh has 36 images
Splitting into Train: 25, Val: 5, and Test: 6
Sample PCB16KA1dlh has 27 images
Splitting into Train: 18, Val: 4, and Test: 5
Sample PCB5KA14drh has 18 images
Splitting into Train: 13, Val: 2, and Test: 3
Sample PCB17KA7dlh has 30 images
Splitting into Train: 21, Val: 4, and Test: 5
Sample PCB14KA14drh has 28 images
Splitting into Train: 19, Val: 4, and Test: 5
Sample PCB4KA14drh has 35 images
Splitting into Train: 24, Val: 5, and Test: 6
Sample PCB2KA21drh_try3 has 6 images
Splitting into Train: 5, Val: 0, and Test: 1
Sample PCB11salrh has 70 images
Splitting into Train: 49, Val: 10, and Test: 11
Sample PCB1KA21drh has 34 images
Splitting into Train: 23, Val: 5, and Test: 6
Sample PCB3KA1dlh has 30 images
Splitting into Train: 21, Val: 4, and Test: 5
Sample PCB3KA21drh has 36 images
Splitting into Train: 25, Val: 5, and Test: 6
Sample PCB4KA14dlh has 34 images
Splitting into Train: 23, Val: 5, and Test: 6
Sample PCB16KA1drh has 33 images
Splitting into 

In [15]:
NTrain = sum(totTrain)
NVal = sum(totVal)
NTest = sum(totTest)

print(f'Total training images: {NTrain}')
print(f'Total validation images: {NVal}')
print(f'Total testing images: {NTest}')

Total training images: 308
Total validation images: 61
Total testing images: 75
