In [1]:
from shutil import copyfile

from ck_image_helpers import *
from hanziconv import HanziConv

import pandas as pd
import numpy as np
import skimage.io as io
import itertools
from perimetric_complexity import perimetricComplexity

import os
import shutil

# preprocessing parameters
# STRETCH picks the "stretched"/"padded" suffix of the processed folders, is the
# default `stretch` argument of process_image, and is recorded in the CSV as
# "stretch"/"pad".
STRETCH = False
# Forwarded to rescale_and_skeletonise_image_variant; also part of the processed
# folder name and recorded in the CSV.
SKELETONISE_METHOD = "lee"
# Default `size` argument of process_image (forwarded to the rescale helper).
IMG_SIZE = 300
# Only hanziyuan images whose second "_"-separated file-name field is one of
# these periods are written to the complexity CSVs.
PERIODS = ["Oracle", "Bronze", "Seal", "Simplified", "Traditional"]

# folder locations
# NOTE(review): ROOT is an absolute path on the author's machine; adjust before running elsewhere.
ROOT = "/Users/ckemp/bigdatanonarchival/hanzi/smallprocessingtest_preprint"
#ROOT = "/Users/ckemp/bigdatanonarchival/hanzi/processingtest"
IMAGES = f"{ROOT}/Images"
DATA = f"{ROOT}/Data"
# Each dataset name is expected to have a "{name}_raw" folder under IMAGES.
IMAGE_DATASETS = ["hanziyuan", "simplified_handwritten", "traditional_handwritten"] # all three datasets
#IMAGE_DATASETS = ["hanziyuan"]
# CSV whose "Word" column lists the characters to keep when building the complexity tables.
CLD = f"{DATA}/cld.csv"
# Number of trailing path components (folders + file stem) that copy_image joins
# with "_" to build the name of the copied image.
PATHOFF=3

# Alternative configurations (kept commented out). Uncomment one group to
# override the settings above.

# # to generate experimental stimuli
# SKELETONISE_METHOD = "none"
# IMAGES = "/Users/ckemp/u/mygithub/hanzi/experiments/exp_pictographic_ratings/stimuli/stimuliprocessing"
# IMAGE_DATASETS = ["oracle", "traditional", "hwtrad", "hwtrad_control", "traditional_control"]
# PATHOFF=1

# # to generate images for demo figure
# SKELETONISE_METHOD = "none"
# IMAGES = "/Users/ckemp/u/mygithub/hanzi/projectdocs/figures/demofig/Images"
# IMAGE_DATASETS = ["hanziyuan", "traditional_handwritten", "simplified_handwritten"]
# PATHOFF=1

# # to generate images for nn figure
# SKELETONISE_METHOD = "none"
# IMAGES = "/Users/ckemp/u/mygithub/hanzi/projectdocs/figures/nnfig/Images"
# IMAGE_DATASETS = ["hanziyuan", "traditional_handwritten", "simplified_handwritten"]
# PATHOFF=1

# Record the scikit-image version used for this run (shown in the cell output).
import skimage
print(skimage.__version__)


0.18.3


In [2]:
# minor stuff we need
def check_create_folder(folder_path):
    """Make sure ``folder_path`` exists as a directory.

    Missing parent directories are created as well, and calling the function
    on a directory that already exists is a no-op, so the cell can be re-run
    safely.

    Parameters
    ----------
    folder_path : str or path-like
        Directory to create if it is not there yet.

    Raises
    ------
    FileExistsError
        If ``folder_path`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of exists() + mkdir().
    os.makedirs(folder_path, exist_ok=True)
        
def listdir(x):
    """List the entries of directory ``x``, leaving out names that start with a dot."""
    visible_entries = []
    for entry in os.listdir(x):
        if entry[0] == ".":
            # hidden file such as .DS_Store
            continue
        visible_entries.append(entry)
    return visible_entries

In [3]:
# copy images

def copy_image(from_image_path, to_folder):
    """Copy one raw image into ``to_folder`` under a flattened ``.png`` name.

    The output name is built from the last ``PATHOFF`` "/"-separated components
    of the source path (extension removed), joined with "_". SVG sources are
    converted with ``svg_to_png``; every other file is copied byte-for-byte,
    so it is given a ``.png`` name whatever its real format is.

    Parameters
    ----------
    from_image_path : str
        Source image path using "/" as separator.
    to_folder : str
        Existing destination folder.
    """
    # splitext only looks at the last path component, so dots in directory
    # names (e.g. "./Images" or "v1.2/") no longer truncate the stem or
    # confuse the extension check.
    stem, extension = os.path.splitext(from_image_path)

    flat_name = "_".join(stem.split("/")[-PATHOFF:])
    to_image_path = f"{to_folder}/{flat_name}.png"

    if extension.lower() == ".svg":
        svg_to_png(from_image_path, to_image_path)
    else:
        copyfile(from_image_path, to_image_path)

for dataset in IMAGE_DATASETS:
    # Raw input folder and the processed folder named after the preprocessing options.
    raw_folder = f"{IMAGES}/{dataset}_raw"
    processed_folder = f"{IMAGES}/{dataset}_{'stretched' if STRETCH else 'padded'}_{SKELETONISE_METHOD}"

    check_create_folder(processed_folder)

    # copy_image splits paths on "/", so turn backslashes into forward slashes first.
    image_paths = [
        raw_path.replace("\\", "/")
        for raw_path in get_all_image_paths(raw_folder)
    ]

    for normalised_path in image_paths:
        copy_image(normalised_path, processed_folder)
 


In [4]:
# clean images: ie delete images that have no black pixels


for dataset in IMAGE_DATASETS:
    processed_folder = f"{IMAGES}/{dataset}_{'stretched' if STRETCH else 'padded'}_{SKELETONISE_METHOD}"
    image_paths = get_all_image_paths(processed_folder)

    # Run clean_image on every copied image, one at a time.
    for copied_path in image_paths:
        clean_image(copied_path)

In [5]:
# process images

def process_image(image_path, stretch=STRETCH, skeletonise_method=SKELETONISE_METHOD, size=IMG_SIZE):
    """
    Universal image processing function; the file at ``image_path`` is
    overwritten with the result.

    1. Binarise image (every pixel that is not pure white counts as ink)
    2. Crop image to become square, scale to standardised dimensions
    3. Skeletonise image

    Parameters
    ----------
    image_path : str
        Image to read and then overwrite.
    stretch : bool
        Forwarded to ``rescale_and_skeletonise_image_variant``.
    skeletonise_method : str
        Forwarded to ``rescale_and_skeletonise_image_variant``.
    size : int
        Forwarded to ``rescale_and_skeletonise_image_variant``.

    Notes
    -----
    The defaults are bound when this ``def`` is executed, so re-run this cell
    after changing STRETCH, SKELETONISE_METHOD or IMG_SIZE.
    """

    image = img_as_ubyte(io.imread(image_path, as_gray=True))

    # Binarise image: vectorised comparison, same boolean array as testing
    # each pixel against 255 in a Python loop.
    image = image != 255

    # Crop and scale image (stretching not allowed yet)
    image = crop_and_scale_image(image, False)

    # Scale image to standard size and skeletonise
    image = rescale_and_skeletonise_image_variant(image, size, stretch, skeletonise_method)

    io.imsave(image_path, img_as_ubyte(image))
    

for dataset in IMAGE_DATASETS:

    processed_folder = f"{IMAGES}/{dataset}_{'stretched' if STRETCH else 'padded'}_{SKELETONISE_METHOD}"

    # Normalise path separators to "/" before processing.
    image_paths = [
        found_path.replace("\\", "/")
        for found_path in get_all_image_paths(processed_folder)
    ]

    # Each image is processed and overwritten in place.
    for normalised_path in image_paths:
        process_image(normalised_path)




In [6]:
# get data into CSV

def get_complexity_and_meta(image_path):
    """Measure one processed image and parse the metadata encoded in its path.

    Parameters
    ----------
    image_path : str
        "/"-separated path of the form ``.../<dataset>_<scale>_<skeleton>/<name>.png``.

    Returns
    -------
    tuple
        ``(ip_split, dataset, perimetric, pixel)`` where ``ip_split`` is the
        file name without extension split on "_", ``dataset`` is the parent
        folder name with its last two "_"-separated fields removed, and
        ``perimetric`` / ``pixel`` are the two values returned by
        ``perimetricComplexity`` in that order.
    """
    path_parts = image_path.split("/")
    file_name, folder_name = path_parts[-1], path_parts[-2]

    # Strip the extension from the file name only; splitting the whole path on
    # "." would cut it at the first dot of any directory name.
    ip_split = os.path.splitext(file_name)[0].split("_")
    dataset = "_".join(folder_name.split("_")[:-2])
    perimetric, pixel = perimetricComplexity(image_path)
    return ip_split, dataset, perimetric, pixel

def get_hanzi_image_complexity(image_path):
    """Build the CSV row for a hanziyuan image.

    Character, period and ID are the last three "_"-separated fields of the
    file name.
    """
    name_fields, dataset, perimetric, pixel = get_complexity_and_meta(image_path)
    character = name_fields[-3]
    period = name_fields[-2]
    image_id = name_fields[-1]
    if STRETCH:
        scale_method = 'stretch'
    else:
        scale_method = 'pad'
    return (character, period, image_id, scale_method, SKELETONISE_METHOD, dataset, perimetric, pixel)

def get_simph_image_complexity(image_path):
    """Build the CSV row for a simplified-handwritten image.

    Character and ID are the last two "_"-separated fields of the file name;
    the period is fixed to "Simplified".
    """
    name_fields, dataset, perimetric, pixel = get_complexity_and_meta(image_path)
    character = name_fields[-2]
    image_id = name_fields[-1]
    if STRETCH:
        scale_method = 'stretch'
    else:
        scale_method = 'pad'
    return (character, "Simplified", image_id, scale_method, SKELETONISE_METHOD, dataset, perimetric, pixel)

def get_tradh_image_complexity(image_path):
    """Build the CSV row for a traditional-handwritten image.

    Character and ID are the last two "_"-separated fields of the file name;
    the period is fixed to "Traditional".
    """
    name_fields, dataset, perimetric, pixel = get_complexity_and_meta(image_path)
    character = name_fields[-2]
    image_id = name_fields[-1]
    if STRETCH:
        scale_method = 'stretch'
    else:
        scale_method = 'pad'
    return (character, "Traditional", image_id, scale_method, SKELETONISE_METHOD, dataset, perimetric, pixel)

# Reference character list: only images whose character appears in the "Word"
# column of the CLD CSV are measured.
cld = pd.read_csv(CLD, index_col=0)
cld_characters = set(cld["Word"])

all_rows = []
columns = ["character", "period", "ID", "scale method", "skeletonise method", "dataset", "perimetric complexity", "pixel complexity"]


def _merge_into_csv(new_df, csv_path):
    """Write ``new_df`` to ``csv_path``, merging with rows already stored there.

    If the file exists, its rows are loaded, the new rows are appended, exact
    duplicate rows are dropped and the file is rewritten with a fresh 0..n-1
    index. Otherwise ``new_df`` is written as is.
    """
    if not os.path.exists(csv_path):
        new_df.to_csv(csv_path)
        return

    text_columns = ["character", "period", "ID", "scale method", "skeletonise method", "dataset"]
    existing = pd.read_csv(
        csv_path,
        index_col=0,
        # Keep identifiers as text: a numeric-looking ID would otherwise be
        # parsed as an integer and never equal the string taken from a file name.
        dtype={name: str for name in text_columns},
        # Parse floats exactly as written so unchanged measurements compare equal.
        float_precision="round_trip",
    )
    # Drop stray "index"/"level_0" columns written by earlier runs of this cell.
    existing = existing.reindex(columns=columns)

    merged = pd.concat([existing, new_df], ignore_index=True).drop_duplicates()
    # drop=True: do not write the old index back as a data column. Without it
    # every re-run adds a column, which defeats the de-duplication and finally
    # raises "cannot insert level_0, already exists".
    merged.reset_index(drop=True).to_csv(csv_path)


for dataset in IMAGE_DATASETS:

    folder = f"{IMAGES}/{dataset}_{'stretched' if STRETCH else 'padded'}_{SKELETONISE_METHOD}"
    images = listdir(folder)

    # Pick the row builder for this dataset and keep only images whose
    # character is in the CLD list.
    if dataset == "hanziyuan":
        func = get_hanzi_image_complexity
        cld_images = [f"{folder}/{image}" for image in images if image.split("_")[0] in cld_characters and image.split("_")[1] in PERIODS]
    elif dataset == "simplified_handwritten":
        func = get_simph_image_complexity
        cld_images = [f"{folder}/{image}" for image in images if image.split("_")[-2] in cld_characters]
    else:
        # dataset == "traditional_handwritten"
        # NOTE(review): any other dataset name also lands here and is treated as traditional handwriting.
        func = get_tradh_image_complexity
        # Characters are converted to simplified form before the CLD lookup.
        cld_images = [f"{folder}/{image}" for image in images if HanziConv.toSimplified(image.split("_")[-2]) in cld_characters]

    rows = list(map(func, cld_images))

    all_rows += rows
    df = pd.DataFrame(rows, columns=columns)
    csv_path = f"{DATA}/{dataset}_complexities.csv"
    _merge_into_csv(df, csv_path)

# Combined table over every dataset processed in this run.
df = pd.DataFrame(all_rows, columns=columns)
csv_path = f"{DATA}/all_complexities.csv"
_merge_into_csv(df, csv_path)