In [1]:
import glob
import math
import os
import random
import shutil

import cv2
import matplotlib.pyplot as plt
import numpy as np
from keras.preprocessing.image import ImageDataGenerator

Using TensorFlow backend.


In [6]:
# parameter settings
# Edit `path` and `groups` for your data set. Every other path is derived from
# `path` by plain string concatenation:
#   <path>/image/<group>/        original images, one folder per class
#   <path>/image/train/<group>/  training split
#   <path>/image/test/<group>/   test split
#   <path>/data/                 exported .npy arrays
path = "X:/XXXX/XXXX" # root directory
groups = ["G2", "notG2"] # class name; the list index is used as the numeric label

if not path:
    # An empty root would silently resolve to "/" below; fail early instead.
    raise ValueError("`path` must point to the root directory of the data set")
if not path.endswith("/"):
    path = path + "/"
root_dir = path
train_dir = path + "image/train/"
test_dir = path + "image/test/"
# `path` already ends with "/", so no extra separator is added here.
# No trailing slash: file names are appended later with a leading "/".
export_dir = path + "data" # directory for data export

In [7]:
# split files into train and test
# For every class folder under <root>/image/, a random 20% of the .bmp files
# is moved to the test folder and the remaining files to the train folder.
# The files are moved, not copied, so the original class folder ends up empty.

TEST_FRACTION = 0.2 # share of each class that is moved to "test"

for group in groups:
    print("----Processing {}----".format(group))
    image_dir = root_dir + "image/" + group # original images
    move_train_dir = train_dir + group # destination; training data
    move_test_dir = test_dir + group # destination; test data

    # shutil.move() only moves a file *into* the destination when that
    # destination is an existing directory; otherwise the file is renamed to
    # the destination path itself. Create both folders up front.
    os.makedirs(move_train_dir, exist_ok=True)
    os.makedirs(move_test_dir, exist_ok=True)

    files = list(glob.glob(image_dir+"/*.bmp"))
    print("Files detected:")
    print(len(files))

    # shuffle once, then cut the list: the first `th` files go to "test",
    # everything after that goes to "train"
    th = math.floor(len(files)*TEST_FRACTION)
    random.shuffle(files)
    for file in files[:th]:
        shutil.move(file, move_test_dir)
    for file in files[th:]:
        shutil.move(file, move_train_dir)

print("----All done----")

----Processing G2----
Files detected:
300
----Processing notG2----
Files detected:
300
----All done----


In [8]:
# augmentation and image-to-array conversion (numpy) for CNN

IMAGE_SIZE = (135, 135) # optional: unify image size (dsize given to cv2.resize)
N_AUGMENTED = 3 # augmented copies per training image, i.e. 4x the data in total

# augmentation
datagen = ImageDataGenerator(rotation_range=20, # range for random rotation
                             width_shift_range=0.1, # range for horizontal shift
                             height_shift_range=0.1,# range for vertical shift
                             )


def _load_image(file):
    """Read one image file with OpenCV and resize it to IMAGE_SIZE.

    Raises:
        ValueError: if OpenCV cannot decode the file. cv2.imread reports that
            by returning None, which would otherwise only surface as an
            opaque error inside cv2.resize.
    """
    img = cv2.imread(file)
    if img is None:
        raise ValueError("Could not read image file: {}".format(file))
    return cv2.resize(img, dsize=IMAGE_SIZE)


def _augment(img):
    """Return one augmented copy of `img`.

    The image is rotated by 90, 180, or 270 degrees (random) and then passed
    through the module-level `datagen` (ImageDataGenerator processing).
    """
    k = random.randint(1, 3)
    rotated = np.rot90(img, k) # rotate image on axes=(0,1) for k times
    batch = rotated[None, ...] # convert data from 3D to 4D
    augmented = next(datagen.flow(batch, batch_size=1))
    return augmented[0, ...] # restore 3D data from 4D


def _images_to_arrays(base_dir, split, augment=False):
    """Collect all images below `base_dir` into an image array and a label array.

    Args:
        base_dir: directory holding one sub-folder per entry of `groups`
            (expected to end with "/").
        split: name used in the progress message ("train" or "test").
        augment: when True, N_AUGMENTED augmented copies follow each image.

    Returns:
        (images, labels) as numpy arrays; a label is the index of the image's
        class in `groups`.
    """
    images = []
    labels = []
    for label, group in enumerate(groups):
        files = glob.glob(base_dir + group + "/*")
        print("----Processing {}: {}----".format(split, group))
        for file in files:
            img = _load_image(file)
            images.append(img)
            labels.append(label)
            if augment:
                for _ in range(N_AUGMENTED):
                    images.append(_augment(img))
                    labels.append(label)
    return np.array(images), np.array(labels)


# np.save does not create missing directories
os.makedirs(export_dir, exist_ok=True)

# run augmentation and image-to-array conversion for training data
# NOTE(review): the augmented copies come back from Keras as float arrays
# while the originals are presumably uint8, so the training array is likely
# upcast to float -- confirm the downstream model expects that.
X, Y = _images_to_arrays(train_dir, "train", augment=True)
# File names (including their upper/lower case) are kept unchanged because
# other notebooks may load them by these exact names.
np.save(export_dir+"/x_train.npy",X)
np.save(export_dir+"/y_train.npy",Y)
print("Number of files generated:")
print(len(Y))

# run image-to-array conversion for test data (no augmentation)
X, Y = _images_to_arrays(test_dir, "test")
np.save(export_dir+"/X_test.npy",X)
np.save(export_dir+"/Y_test.npy",Y)
print("Number of files generated:")
print(len(Y))
print("----All done----")

----Processing train: G2----
----Processing train: notG2----
Number of files generated:
1920
----Processing test: G2----
----Processing test: notG2----
Number of files generated:
120
----All done----


In [14]:
# sanity check: dimensions of X as last assigned (the test-set array built by
# the conversion cell in this notebook)
print(np.shape(X))

(113, 135, 135, 3)
