# Set the environment

In [1]:
!pip install opencv-python

[33mDEPRECATION: mecab-python 0.996-ko-0.9.2 has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of mecab-python or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [2]:
import cv2
import os
import glob
import shutil
import random
import string
import numpy as np

In [3]:
# Location of the source images. Each category lives in its own sub-folder of
# the dataset root; every path keeps its trailing slash because later cells
# append glob patterns to it by plain string concatenation.
_DATASET_ROOT = '/Users/shim/dl-python-ImageDetection/dataset/'
PATH_DEFECT, PATH_MASK, PATH_NODEFECT = (
    _DATASET_ROOT + folder + '/'
    for folder in ('Defect_images', 'Mask_images', 'NODefect_images')
)

## Import the Dataset 

In [4]:
# Seed Python's RNG so the random sampling in the following cells is repeatable.
random.seed(0)

# glob returns paths in a file-system dependent order. Sorting makes the input
# order (and therefore the seeded sampling that consumes it) identical on every
# machine and every run.
defect_list = sorted(glob.glob(PATH_DEFECT + '*.png'))
mask_list = sorted(glob.glob(PATH_MASK + '*.png'))
# NOTE: without recursive=True, '**' behaves like '*', so this matches PNG files
# located exactly one directory below PATH_NODEFECT.
pass_list = sorted(glob.glob(PATH_NODEFECT + '**/*.png'))

# Match defect-mask pairs: two files belong together when the part of the file
# name before the first underscore is identical.
mask_by_num = dict()
for mask in mask_list:
    # setdefault keeps the first mask seen for a number, i.e. the same mask a
    # first-match linear search through mask_list would return.
    mask_by_num.setdefault(os.path.basename(mask).split('_')[0], mask)

new_defect_list = list()
new_mask_list = list()
for defect in defect_list:
    num = os.path.basename(defect).split('_')[0]
    if num in mask_by_num:
        new_defect_list.append(defect)
        new_mask_list.append(mask_by_num[num])

# Defect images without a mask are dropped; the two lists are index-aligned.
defect_list = new_defect_list
mask_list = new_mask_list

## Generate the first dataset

In [5]:
# The first dataset given: a random sample of whole images (defect-free and
# defective alike) copied flat into dataset/1 under random names.
# makedirs(exist_ok=True) also creates the parent 'dataset' folder when it is
# missing and is safe to re-run, unlike the exists()/mkdir() pair.
os.makedirs('dataset/1', exist_ok=True)
for file_name in pass_list + defect_list:
    # randint(0, 9) < 2 keeps a file with probability 2/10.
    if random.randint(0, 9) < 2:
        # A 16-character random alphanumeric name replaces the original file name.
        barcode = ''.join(random.choices(string.ascii_letters + string.digits, k=16))
        shutil.copy2(file_name, 'dataset/1/' + barcode + '.png')

## Generate the second dataset

In [6]:
# The second dataset: square patches sorted into OK / FAIL folders.
# makedirs(exist_ok=True) creates any missing parent ('dataset', 'dataset/2')
# and does not fail when the cell is re-run.
os.makedirs('dataset/2/OK', exist_ok=True)
os.makedirs('dataset/2/FAIL', exist_ok=True)

# OK patches: slide a height x height window over every defect-free image with
# a stride of half the image height and keep a window with probability 2/10.
idx = 0
for file_name in pass_list:
    img = cv2.imread(file_name)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError('Could not read image: ' + file_name)
    height, width, _ = img.shape
    step = height // 2

    for i in range(width // step):
        w = i * step
        # The RNG is consulted only for windows that pass the bounds test, so
        # the order of the two conditions must stay as it is.
        # NOTE(review): a window starting at w == width - height would still
        # fit but is skipped by the strict '<'; kept to leave the output unchanged.
        if w < width - height and random.randint(0, 9) < 2:
            patch = img[:, w:w+height, :]
            cv2.imwrite('dataset/2/OK/%04d.png' % idx, patch)
            idx += 1

# FAIL patches: same windows over the defect images, kept only when the
# corresponding mask window contains at least one non-zero pixel.
patch_list = list()
for defect, mask in zip(defect_list, mask_list):
    img_d = cv2.imread(defect)
    img_m = cv2.imread(mask)
    if img_d is None:
        raise IOError('Could not read image: ' + defect)
    if img_m is None:
        raise IOError('Could not read image: ' + mask)

    height, width, _ = img_d.shape
    step = height // 2
    for i in range(width // step):
        w = i * step
        if w < width - height:
            patch = img_d[:, w:w+height, :]
            patch_d = img_m[:, w:w+height, :]
            if patch_d.sum() > 0:
                patch_list.append(patch)

# Shuffle, then write only the first third of the defective patches.
random.shuffle(patch_list)
patch_list_fraction = patch_list[:len(patch_list)//3]
for idx, patch in enumerate(patch_list_fraction):
    cv2.imwrite('dataset/2/FAIL/%04d.png' % idx, patch)

## Generate the third dataset

In [7]:
# The third dataset: square patches in OK / FAIL folders plus, for every FAIL
# patch, the matching mask patch in MASK.
# makedirs(exist_ok=True) creates any missing parent ('dataset', 'dataset/3')
# and does not fail when the cell is re-run.
os.makedirs('dataset/3/OK', exist_ok=True)
os.makedirs('dataset/3/FAIL', exist_ok=True)
os.makedirs('dataset/3/MASK', exist_ok=True)

# OK patches: slide a height x height window over every defect-free image with
# a stride of half the image height and keep a window with probability 3/10.
idx = 0
for file_name in pass_list:
    img = cv2.imread(file_name)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError('Could not read image: ' + file_name)
    height, width, _ = img.shape
    step = height // 2

    for i in range(width // step):
        w = i * step
        # The RNG is consulted only for windows that pass the bounds test, so
        # the order of the two conditions must stay as it is.
        # NOTE(review): a window starting at w == width - height would still
        # fit but is skipped by the strict '<'; kept to leave the output unchanged.
        if w < width - height and random.randint(0, 9) < 3:
            patch = img[:, w:w+height, :]
            cv2.imwrite('dataset/3/OK/%04d.png' % idx, patch)
            idx += 1

# FAIL/MASK pairs: same windows over the defect images, kept only when the
# corresponding mask window contains at least one non-zero pixel.
patch_pair_list = list()
for defect, mask in zip(defect_list, mask_list):
    img_d = cv2.imread(defect)
    img_m = cv2.imread(mask)
    if img_d is None:
        raise IOError('Could not read image: ' + defect)
    if img_m is None:
        raise IOError('Could not read image: ' + mask)

    height, width, _ = img_d.shape
    step = height // 2
    for i in range(width // step):
        w = i * step
        if w < width - height:
            patch = img_d[:, w:w+height, :]
            patch_d = img_m[:, w:w+height, :]

            if patch_d.sum() > 0:
                patch_pair_list.append((patch, patch_d))

# Shuffle the pairs together so image and mask keep the same index.
random.shuffle(patch_pair_list)
for idx, (patch, patch_d) in enumerate(patch_pair_list):
    cv2.imwrite('dataset/3/FAIL/%04d.png' % idx, patch)
    cv2.imwrite('dataset/3/MASK/%04d.png' % idx, patch_d)

## Generate the test dataset

In [8]:
# The test dataset: OK and FAIL patches written side by side into
# data/input_data, told apart only by their 'ok_' / 'fail_' file-name prefix.
# makedirs(exist_ok=True) creates the missing parent 'data' folder and does not
# fail when the cell is re-run. data/output_csv is only created here, not filled.
os.makedirs('data/input_data', exist_ok=True)
os.makedirs('data/output_csv', exist_ok=True)

# OK patches: slide a height x height window over every defect-free image with
# a stride of half the image height and keep a window with probability 5/10.
idx = 0
for file_name in pass_list:
    img = cv2.imread(file_name)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError('Could not read image: ' + file_name)
    height, width, _ = img.shape
    step = height // 2

    for i in range(width // step):
        w = i * step
        # The RNG is consulted only for windows that pass the bounds test, so
        # the order of the two conditions must stay as it is.
        # NOTE(review): a window starting at w == width - height would still
        # fit but is skipped by the strict '<'; kept to leave the output unchanged.
        if w < width - height and random.randint(0, 9) < 5:
            patch = img[:, w:w+height, :]
            cv2.imwrite('data/input_data/ok_%04d.png' % idx, patch)
            idx += 1

# FAIL patches: same windows over the defect images, kept only when the
# corresponding mask window contains at least one non-zero pixel.
patch_pair_list = list()
for defect, mask in zip(defect_list, mask_list):
    img_d = cv2.imread(defect)
    img_m = cv2.imread(mask)
    if img_d is None:
        raise IOError('Could not read image: ' + defect)
    if img_m is None:
        raise IOError('Could not read image: ' + mask)

    height, width, _ = img_d.shape
    step = height // 2
    for i in range(width // step):
        w = i * step
        if w < width - height:
            patch = img_d[:, w:w+height, :]
            patch_d = img_m[:, w:w+height, :]

            if patch_d.sum() > 0:
                patch_pair_list.append((patch, patch_d))

# Only the image patch is written; the mask patch in each pair is not saved.
random.shuffle(patch_pair_list)
for idx, (patch, patch_d) in enumerate(patch_pair_list):
    cv2.imwrite('data/input_data/fail_%04d.png' % idx, patch)