# Building the Training Dataset for fqhd

## Setup

In [1]:
import os
import random
import shutil
from tqdm import tqdm, trange, tqdm_notebook
from time import sleep

## Count the number of Image files

In [2]:
# Directory that holds the original 1024x1024 images.
dirname = "/home/edm/work/mldata/images1024x1024/"

# Build the list of .png files explicitly instead of relying on a loop variable
# left over from os.walk(): that variable only holds the files of the *last*
# directory visited, and the walk also overwrote `dirname`.
# Only files directly inside `dirname` are listed, because the copy cells
# further down build each path as `source + file`.
with os.scandir(dirname) as entries:
    filenames = sorted(
        entry.name
        for entry in entries
        if entry.is_file() and entry.name.endswith('.png')
    )

count = len(filenames)
print ('There are', count, '.png files')


There are 69989 .png files


## Split the file listing into separate sets

This shuffles the list of file names with a fixed seed and splits it into training (80%), validation (10%), and test (10%) lists.

In [3]:
# Start from a sorted list so the shuffle below does not depend on the order
# in which the file system happened to return the names.
filenames.sort()

# Fixed seed -> the same shuffle, and therefore the same split, on every run.
random.seed(230)
random.shuffle(filenames)

# Cut points for an 80% / 10% / 10% split.
split_1, split_2 = (int(fraction * len(filenames)) for fraction in (0.8, 0.9))

train_filenames, val_filenames, test_filenames = (
    filenames[:split_1],
    filenames[split_1:split_2],
    filenames[split_2:],
)

In [4]:
# Report how many file names ended up in each split.
n_train, n_val, n_test = map(len, (train_filenames, val_filenames, test_filenames))
print("There are", n_train, "files for training")
print("There are", n_val, "files for validation")
print("There are", n_test, "files for testing")

There are 55991 files for training
There are 6999 files for validation
There are 6999 files for testing


## Set up a Progress bar

We should set up a progress bar to get an idea of how long things are taking. Moving this large number of files is very time-consuming, and while it is running the cell will appear to have hung if we don't provide any feedback.

In [5]:
# Dry run of the progress bar: step through the training file names without
# doing any work, advancing the bar by one for every name.
with tqdm(total=len(train_filenames)) as pbar:
    for x in train_filenames:
        pbar.update(n=1)

100%|██████████| 55991/55991 [00:00<00:00, 2193321.09it/s]


## Save the files to separate directories

**TODO**

* The destination directories have to already exist. Maybe at some later time I can add code to check whether they exist and create them dynamically.

* The copy code below should be refactored into a general purpose function. Right now I am repeating it every time for each directory I need.

* The code below is pretty slow. Since I don't expect to do this very often, it may be OK, but it could benefit from some parallelism. Maybe numba? https://numba.pydata.org/


In [6]:
# Copy the train split
print ("[INFO]: Creating the training dataset")
files = train_filenames
source  = "/home/edm/work/mldata/images1024x1024/"
destination = "/home/edm/work/mldata/fqhd/train/"

# Create the destination up front so the copy does not fail when the split
# directory has not been made by hand yet.
os.makedirs(destination, exist_ok=True)

with tqdm(total=len(files)) as pbar:
    for file in files:
        # Copy only the .png files. The dataset also includes a LICENSE.txt
        # file that should not end up in the split.
        if file.endswith(".png"):
            src = os.path.join(source, file)
            dest = os.path.join(destination, file)
            # copy2 copies the file contents and also tries to preserve
            # file metadata such as timestamps.
            shutil.copy2(src, dest)
        # Advance the bar for every entry (copied or skipped) so it always
        # reaches the total.
        pbar.update(1)
        pbar.set_description("Processing %s" % file, refresh = True)

Processing 33243.png:   0%|          | 39/55991 [00:00<04:52, 191.25it/s]

[INFO]: Creating the training dataset


Processing 18902.png: 100%|██████████| 55991/55991 [07:01<00:00, 132.88it/s]


Now the test files

In [7]:
# Copy the test split
files = test_filenames
source  = "/home/edm/work/mldata/images1024x1024/"
destination = "/home/edm/work/mldata/fqhd/test/"

# Create the destination up front so the copy does not fail when the split
# directory has not been made by hand yet.
os.makedirs(destination, exist_ok=True)

print ("[INFO]: Creating the test dataset")
# The `with` statement creates a fresh progress bar, so there is no need to
# reset a bar left over from an earlier cell (which would also make this cell
# fail with a NameError when run on its own).
with tqdm(total=len(files)) as pbar:
    for file in files:
        # Copy only the .png files. The dataset also includes a LICENSE.txt
        # file that should not end up in the split.
        if file.endswith(".png"):
            src = os.path.join(source, file)
            dest = os.path.join(destination, file)
            # copy2 copies the file contents and also tries to preserve
            # file metadata such as timestamps.
            shutil.copy2(src, dest)
        # Advance the bar for every entry (copied or skipped) so it always
        # reaches the total.
        pbar.update(1)
        pbar.set_description("Processing %s" % file, refresh = True)

Processing 63377.png:   1%|          | 39/6999 [00:00<00:34, 203.19it/s]

[INFO]: Creating the test dataset


Processing 21215.png: 100%|██████████| 6999/6999 [00:47<00:00, 146.37it/s]


Now the val files

In [8]:
# Copy the val split
files = val_filenames
source  = "/home/edm/work/mldata/images1024x1024/"
destination = "/home/edm/work/mldata/fqhd/val/"

# Create the destination up front so the copy does not fail when the split
# directory has not been made by hand yet.
os.makedirs(destination, exist_ok=True)

print ("[INFO]: Creating the validation dataset")
# The `with` statement creates a fresh progress bar, so there is no need to
# reset a bar left over from an earlier cell (which would also make this cell
# fail with a NameError when run on its own).
with tqdm(total=len(files)) as pbar:
    for file in files:
        # Copy only the .png files. The dataset also includes a LICENSE.txt
        # file that should not end up in the split.
        if file.endswith(".png"):
            src = os.path.join(source, file)
            dest = os.path.join(destination, file)
            # copy2 copies the file contents and also tries to preserve
            # file metadata such as timestamps.
            shutil.copy2(src, dest)
        # Advance the bar for every entry (copied or skipped) so it always
        # reaches the total.
        pbar.update(1)
        pbar.set_description("Processing %s" % file, refresh = True)

Processing 13441.png:   1%|          | 42/6999 [00:00<00:34, 202.02it/s]

[INFO]: Creating the validation dataset


Processing 65834.png: 100%|██████████| 6999/6999 [00:47<00:00, 146.13it/s]
