### skim: count raw images per class

In [2]:
import os

In [3]:
def count_files(folder_path):
    """Return how many regular files sit directly inside ``folder_path``.

    Sub-directories are neither counted nor descended into.
    """
    total = 0
    for entry_name in os.listdir(folder_path):
        if os.path.isfile(os.path.join(folder_path, entry_name)):
            total += 1
    return total

In [5]:
# Report how many raw images each digit class (0-9) currently holds.
folder_paths = [f'./7segdataset_raw/{digit}' for digit in range(10)]

for folder_path in folder_paths:
    print(f"{folder_path} : {count_files(folder_path)}")

./7segdataset_raw/0 : 262
./7segdataset_raw/1 : 222
./7segdataset_raw/2 : 236
./7segdataset_raw/3 : 204
./7segdataset_raw/4 : 184
./7segdataset_raw/5 : 198
./7segdataset_raw/6 : 200
./7segdataset_raw/7 : 158
./7segdataset_raw/8 : 190
./7segdataset_raw/9 : 177


### augment & preprocess

In [9]:
%%capture
%pip install pillow numpy 

In [15]:
import os 
import numpy as np
from PIL import Image
import uuid
import random

In [6]:
# Target size in pixels; process_image() resizes every image to (IMG_WIDTH, IMG_HEIGHT).
IMG_WIDTH = 224
IMG_HEIGHT = 224

In [13]:
def process_image(image_path, convert=False):
    """Load an image, resize it to the notebook's target size and grayscale it.

    Parameters
    ----------
    image_path : str
        Path of the image file to open with PIL.
    convert : bool, default False
        When True, also apply the augmentation: per-pixel additive noise drawn
        uniformly from [0, 50), followed by a rotation by a random integer
        angle drawn from [0, 15) degrees.

    Returns
    -------
    PIL.Image.Image
        Grayscale ("L") image of size (IMG_WIDTH, IMG_HEIGHT).
    """
    # The context manager releases the file handle; resize() returns a new,
    # fully loaded image, so nothing below depends on the file staying open.
    with Image.open(image_path) as img:
        gray_img = img.resize((IMG_WIDTH, IMG_HEIGHT)).convert("L")

    if not convert:
        return gray_img

    img_array = np.array(gray_img)
    noise = np.random.randint(0, 50, img_array.shape, dtype='uint8')
    # Add in a wider dtype: uint8 + uint8 wraps modulo 256 *before* the clip,
    # which would turn near-white pixels dark instead of saturating them at 255.
    noisy_img_array = np.clip(img_array.astype(np.int16) + noise, 0, 255).astype(np.uint8)
    noisy_img = Image.fromarray(noisy_img_array)

    return noisy_img.rotate(np.random.randint(0, 15))


In [23]:
def augment(folder_path, dest_count, dest_folder_path):
    """Preprocess one class folder and pad it with augmented samples.

    Every image in ``folder_path`` is run through ``process_image`` without
    augmentation and saved as a JPEG with a fresh UUID name. If the folder
    holds fewer than ``dest_count`` images, randomly chosen source images are
    run through ``process_image(..., convert=True)`` until the shortfall is
    covered. Folders that already hold ``dest_count`` or more images are only
    preprocessed.

    Parameters
    ----------
    folder_path : str
        Source class folder; its last path component names the output folder.
    dest_count : int
        Minimum number of images wanted in the output folder.
    dest_folder_path : str
        Root under which ``<dest_folder_path>/<class name>`` is created.

    Notes
    -----
    Output names are random UUIDs, so calling this again on the same
    destination adds files instead of replacing the earlier ones.
    """
    # normpath drops a trailing separator; basename('a/b/') would otherwise be ''
    # and every class would be written straight into dest_folder_path.
    class_name = os.path.basename(os.path.normpath(folder_path))
    dest_folder_path = os.path.join(dest_folder_path, class_name)
    os.makedirs(dest_folder_path, exist_ok=True)

    image_extensions = (".png", ".jpg", ".jpeg", ".bmp", ".gif")
    image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(image_extensions)]

    if not image_files:
        # Nothing to sample from: random.choice() below would raise IndexError.
        print(f"from {folder_path} to {dest_folder_path}, no source images found; nothing written")
        return

    def _process_and_save(image_file, convert):
        # Save under a fresh UUID so augmented copies never overwrite each other.
        processed_img = process_image(os.path.join(folder_path, image_file), convert=convert)
        output_name = str(uuid.uuid4()) + '.jpg'
        processed_img.save(os.path.join(dest_folder_path, output_name))

    for image_file in image_files:
        _process_and_save(image_file, convert=False)

    # Number of augmented copies needed to reach dest_count (never negative).
    aug_count = max(dest_count - len(image_files), 0)
    for _ in range(aug_count):
        _process_and_save(random.choice(image_files), convert=True)

    print(f"from {folder_path} to {dest_folder_path}, augmented {dest_count} images")

In [24]:
# Preprocess every raw digit class (0-9) and pad each one to at least 300 images.
output_folder = './7segdataset_aug'
folder_paths = [f'./7segdataset_raw/{digit}' for digit in range(10)]

for folder_path in folder_paths:
    augment(folder_path, dest_count=300, dest_folder_path=output_folder)

from ./7segdataset_raw/0 to ./7segdataset_aug/0, augmented 300 images
from ./7segdataset_raw/1 to ./7segdataset_aug/1, augmented 300 images
from ./7segdataset_raw/2 to ./7segdataset_aug/2, augmented 300 images
from ./7segdataset_raw/3 to ./7segdataset_aug/3, augmented 300 images
from ./7segdataset_raw/4 to ./7segdataset_aug/4, augmented 300 images
from ./7segdataset_raw/5 to ./7segdataset_aug/5, augmented 300 images
from ./7segdataset_raw/6 to ./7segdataset_aug/6, augmented 300 images
from ./7segdataset_raw/7 to ./7segdataset_aug/7, augmented 300 images
from ./7segdataset_raw/8 to ./7segdataset_aug/8, augmented 300 images
from ./7segdataset_raw/9 to ./7segdataset_aug/9, augmented 300 images


In [26]:
# Check the per-class image counts after augmentation.
folder_paths = [f'./7segdataset_aug/{digit}' for digit in range(10)]

for folder_path in folder_paths:
    print(f"{folder_path} : {count_files(folder_path)}")

./7segdataset_aug/0 : 300
./7segdataset_aug/1 : 300
./7segdataset_aug/2 : 300
./7segdataset_aug/3 : 300
./7segdataset_aug/4 : 300
./7segdataset_aug/5 : 300
./7segdataset_aug/6 : 300
./7segdataset_aug/7 : 300
./7segdataset_aug/8 : 300
./7segdataset_aug/9 : 300


### split

In [36]:
import os
import shutil

In [47]:
def split(root_folder_path, dest_folder_path, split_rate=(0.7, 0.2, 0.1), seed=None):
    """Copy a class-per-folder image dataset into train/val/test sub-trees.

    For every class directory directly under ``root_folder_path`` the image
    files are shuffled and copied to
    ``<dest_folder_path>/{train,val,test}/<class>``.

    Parameters
    ----------
    root_folder_path : str
        Directory that contains one sub-directory per class.
    dest_folder_path : str
        Output root; created if missing.
    split_rate : sequence of 3 floats, default (0.7, 0.2, 0.1)
        Fractions for (train, val, test). Must be non-negative and sum to 1.
        Train and val sizes are floored; test receives the remainder.
    seed : int or None, default None
        When given, shuffling uses a private ``random.Random(seed)`` so the
        same inputs always yield the same split. When None, the global
        ``random`` state is used, as before.

    Raises
    ------
    AssertionError
        If ``split_rate`` is not three non-negative fractions summing to 1.

    Notes
    -----
    Files are copied, never removed. Re-running without a seed into the same
    destination can leave one image in several subsets; pass ``seed`` or use a
    fresh destination to avoid that.
    """
    assert len(split_rate) == 3 and all(r >= 0 for r in split_rate), \
        "split_rate should hold three non-negative fractions (train, val, test)."
    # round(sum) == 1 would accept anything in (0.5, 1.5); compare with a tolerance instead.
    assert abs(sum(split_rate) - 1.0) < 1e-6, "total sum of split rates should be 1(100%)."

    os.makedirs(dest_folder_path, exist_ok=True)

    # Private generator when seeded, module-level functions otherwise.
    rng = random.Random(seed) if seed is not None else random
    image_extensions = (".png", ".jpg", ".jpeg", ".bmp", ".gif")

    # os.listdir() order is arbitrary; sorting makes a seeded run reproducible.
    class_folders = sorted(
        f for f in os.listdir(root_folder_path)
        if os.path.isdir(os.path.join(root_folder_path, f))
    )

    for class_folder in class_folders:
        folder_path = os.path.join(root_folder_path, class_folder)
        image_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith(image_extensions))
        rng.shuffle(image_files)

        total = len(image_files)
        # The small epsilon guards against float truncation, e.g. 100 * 0.29 == 28.999999999999996.
        train_size = int(total * split_rate[0] + 1e-9)
        val_size = int(total * split_rate[1] + 1e-9)

        subsets = {
            'train': image_files[:train_size],
            'val': image_files[train_size:train_size + val_size],
            'test': image_files[train_size + val_size:],  # remainder
        }

        for subset_name, subset_files in subsets.items():
            subset_class_folder = os.path.join(dest_folder_path, subset_name, class_folder)
            os.makedirs(subset_class_folder, exist_ok=True)
            for image_file in subset_files:
                shutil.copy(os.path.join(folder_path, image_file), os.path.join(subset_class_folder, image_file))

        print(f"from {folder_path} to {dest_folder_path}, {total} copied with split (train,val,test)={split_rate}")


In [None]:
# Split the augmented dataset into train/val/test using the default rates.
root_folder_path, dest_folder_path = './7segdataset_aug', './7segdataset_aug_split'

split(root_folder_path=root_folder_path, dest_folder_path=dest_folder_path)

from ./7segdataset_aug/9 to ./7segdataset_aug_split, 300 copied with split (train,val,test)=(0.7, 0.2, 0.1)
from ./7segdataset_aug/0 to ./7segdataset_aug_split, 300 copied with split (train,val,test)=(0.7, 0.2, 0.1)
from ./7segdataset_aug/7 to ./7segdataset_aug_split, 300 copied with split (train,val,test)=(0.7, 0.2, 0.1)
from ./7segdataset_aug/6 to ./7segdataset_aug_split, 300 copied with split (train,val,test)=(0.7, 0.2, 0.1)
from ./7segdataset_aug/1 to ./7segdataset_aug_split, 300 copied with split (train,val,test)=(0.7, 0.2, 0.1)
from ./7segdataset_aug/8 to ./7segdataset_aug_split, 300 copied with split (train,val,test)=(0.7, 0.2, 0.1)
from ./7segdataset_aug/4 to ./7segdataset_aug_split, 300 copied with split (train,val,test)=(0.7, 0.2, 0.1)
from ./7segdataset_aug/3 to ./7segdataset_aug_split, 300 copied with split (train,val,test)=(0.7, 0.2, 0.1)
from ./7segdataset_aug/2 to ./7segdataset_aug_split, 300 copied with split (train,val,test)=(0.7, 0.2, 0.1)
from ./7segdataset_aug/5 to 

In [51]:
# Spot-check the split sizes for class 0.
folder_paths = [f'./7segdataset_aug_split/{subset}/0' for subset in ('train', 'val', 'test')]

for folder_path in folder_paths:
    print(f"{folder_path} : {count_files(folder_path)}")

./7segdataset_aug_split/train/0 : 210
./7segdataset_aug_split/val/0 : 60
./7segdataset_aug_split/test/0 : 30
