In [1]:
import cv2
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
import numpy as np
import json
import random
import os
import re
import unicodedata
from tqdm.auto import tqdm
import uuid
from imagedegrade import np as degrade
import math


In [2]:
# Helpers that load wine labels from JSON and render them as synthetic text images

def filter_label(label):
    """Tell whether *label* is clean enough to be kept.

    A label is rejected when it is ``None``, when it contains any non-ASCII
    character, or when it contains a line feed, carriage return, soft hyphen
    or non-breaking space. Everything else (including the empty string) is
    accepted.
    """
    # Guard clause: missing values and non-ASCII text are dropped outright.
    if label is None or not label.isascii():
        return False
    unwanted_chars = "\n\r\xad\xa0"
    return not any(ch in label for ch in unwanted_chars)

def degrade_image(image_array):
    """Return a degraded ``uint8`` copy of *image_array*.

    The array is passed through ``degrade.blur`` with a strength drawn
    uniformly from [0.5, 0.75], then through ``degrade.jpeg`` with an integer
    setting drawn from [85, 98]. Each intermediate result is cast to
    ``np.uint8``.
    """
    blur_amount = random.uniform(0.5, 0.75)
    blurred = degrade.blur(image_array, blur_amount).astype(np.uint8)
    # The additive-noise and salt-and-pepper passes are intentionally left
    # out; only blur and JPEG compression are applied.
    jpeg_setting = random.randint(85, 98)
    compressed = degrade.jpeg(blurred, jpeg_setting).astype(np.uint8)
    return compressed.astype(np.uint8)

def apply_wave_filter(img, sigma):
    """Shift every column of *img* vertically along one sine period.

    Column ``j`` is displaced by ``-int(sigma * sin(2 * pi * j / cols))``
    rows, so the displacement completes one full sine cycle across the image
    width. Pixels whose source row falls outside the image (above the top or
    below the bottom) are filled with the colour of the top-left pixel.

    Args:
        img: Image array of shape ``(rows, cols)`` or ``(rows, cols, channels)``.
        sigma: Wave amplitude in pixels; the sign flips the wave direction.

    Returns:
        A new array with the same shape and dtype as *img*.
    """
    rows, cols = img.shape[:2]
    if rows == 0 or cols == 0:
        # Nothing to displace, and there is no top-left pixel to sample.
        return img.copy()

    # The top-left pixel stands in for the background colour.
    fill_color = img[0][0]

    # The displacement depends only on the column, so compute it once per
    # column instead of once per pixel. int() truncates toward zero.
    col_offsets = np.array(
        [-int(sigma * math.sin(2 * math.pi * j / cols)) for j in range(cols)],
        dtype=np.intp,
    )

    # src_rows[i, j] is the row of the input that lands on output pixel (i, j).
    src_rows = np.arange(rows)[:, None] + col_offsets[None, :]
    in_bounds = (src_rows >= 0) & (src_rows < rows)

    # Clip so the gather is always valid, then overwrite the out-of-range
    # pixels. Negative source rows must not wrap around to the bottom rows.
    img_output = img[np.clip(src_rows, 0, rows - 1), np.arange(cols)[None, :]]
    img_output[~in_bounds] = fill_color
    return img_output

def generate_labels(n_key_map=None):
    """Build a shuffled list of text labels from the wine database.

    Reads ``./assets/vin_db.json`` (a list of objects carrying the keys
    ``title``, ``designation``, ``variety``, ``region_1``, ``province``,
    ``country`` and ``winery``), collects the values of each category, drops
    the ones rejected by ``filter_label`` and shuffles them. Designations are
    cut to their first three space-separated words; the remainder of a longer
    designation becomes an extra designation label.

    Args:
        n_key_map: Mapping from category name (``date``, ``designation``,
            ``variety``, ``region``, ``province``, ``country``, ``winery``)
            to the maximum number of labels taken from that category.
            Defaults to date=3000, designation=60000, variety=5000,
            region=2000, province=5000, country=1000, winery=9000.

    Returns:
        A shuffled list mixing the selected labels of all requested
        categories.

    Raises:
        KeyError: If *n_key_map* names an unknown category, or a database
            record lacks one of the expected keys.
    """
    if n_key_map is None:
        n_key_map = {
            "date": 3_000,
            "designation": 60_000,
            "variety": 5_000,
            "region": 2_000,
            "province": 5_000,
            "country": 1_000,
            "winery": 9_000,
        }

    # Context manager so the file handle is released deterministically.
    with open("./assets/vin_db.json", encoding="utf-8") as db_file:
        wine_data = json.load(db_file)

    key_list = ["title", "designation", "variety", "region_1", "province", "country", "winery"]
    wine_data = [{k: d[k] for k in key_list} for d in wine_data]
    for wine in wine_data:
        # Keep only the digits of the title: they carry the date.
        wine["date"] = re.sub("[^0-9]", "", wine.pop("title"))
        wine["region"] = wine.pop("region_1")  # Rename key

    # Fixed category order (the key order each record has after the renames
    # above), so shuffling is reproducible under a seed and an empty database
    # does not crash.
    categories = ("designation", "variety", "province", "country", "winery", "date", "region")
    labels = {}
    for key in categories:
        # Remove all odd values (None, non-ASCII, embedded line breaks, ...).
        labels[key] = [wine[key] for wine in wine_data if filter_label(wine[key])]
        random.shuffle(labels[key])

    # Keep the first three words of each designation and queue whatever is
    # left as an additional label (the remainder is not split again).
    remainders = []
    for i, designation in enumerate(labels["designation"]):
        words = designation.split(' ')
        labels["designation"][i] = ' '.join(words[:3])
        remainder = ' '.join(words[3:])
        if remainder:
            remainders.append(remainder)
    labels["designation"].extend(remainders)
    random.shuffle(labels["designation"])  # Shuffle again

    res_labels = []
    for key, limit in n_key_map.items():  # Cut to wanted length
        res_labels.extend(labels[key][:limit])
    random.shuffle(res_labels)
    return res_labels

def generate_images(labels, directory):
    """Render each label as a degraded, wavy text image and index it.

    For every label a PNG named after a fresh UUID is written to *directory*
    and a ``"<file name> <label>"`` line is appended to
    ``<directory>/data.txt``. Font, font size, text colour, background colour
    and text position are picked at random; colour pairs whose channel sums
    differ by 80 or less are re-drawn to keep the text readable.

    Args:
        labels: Iterable of text labels to render.
        directory: Output directory; created if it does not exist.

    A label that fails to render (for example an unreadable font file or an
    empty label) is skipped with a warning instead of aborting the run.
    """
    import warnings  # local import keeps this block self-contained

    font_dir = "./assets/fonts/"
    font_files = os.listdir(font_dir)
    font_colors = [
        (0, 0, 0), (135, 135, 135), (200, 200, 200), (255, 255, 255), (216, 35, 35), (172, 10, 10),
        (101, 20, 211), (211, 20, 69), (234, 169, 19), (255, 215, 0), (42, 0, 255), (125, 74, 164)
    ]
    bg_colors = [
        (255, 255, 255), (207, 207, 207), (243, 243, 243), (127, 127, 127), (199, 45, 45), (46, 6, 6),
        (100, 27, 27), (160, 38, 38), (23, 1, 1), (0, 0, 0)
    ]
    os.makedirs(directory, exist_ok=True)

    # Context manager so the index file is flushed and closed even when the
    # loop is interrupted.
    with open(os.path.join(directory, "data.txt"), "a+", encoding="utf-8") as data_file:
        for label in tqdm(labels):
            try:
                font_color = random.choice(font_colors)
                bg_color = random.choice(bg_colors)
                # Re-draw both colours until they contrast enough.
                while abs(sum(font_color) - sum(bg_color)) <= 80:
                    font_color = random.choice(font_colors)
                    bg_color = random.choice(bg_colors)
                file_name = f"{uuid.uuid1()}.png"
                font_size = random.randint(25, 33)
                # Canvas is sized from the label length so the text fits
                # with room left for the random offset and the wave.
                image_size = (int(len(label) * font_size * 1.5), int(7 * font_size))
                image = Image.new('RGB', image_size, bg_color)
                draw = ImageDraw.Draw(image)
                font = ImageFont.truetype(font_dir + random.choice(font_files), font_size)
                draw.text(
                    (image_size[0] / random.uniform(2, 10), image_size[1] / random.uniform(1.3, 10)),
                    str(label),
                    font=font,
                    fill=font_color
                )
                # Only the final image is saved: writing the clean render
                # first would leave an unindexed file behind whenever a later
                # step fails.
                image = degrade_image(np.asarray(image))
                image = apply_wave_filter(image, random.uniform(-25, 25))
                image = Image.fromarray(image)
                # Crop image to get rid of the side waves
                width, height = image.size
                image = image.crop((3, 3, width - 3, height - 3))
                image.save(os.path.join(directory, file_name))
                data_file.write(f"{file_name} {label}\n")
            except Exception as err:
                # Best effort: report the failure and carry on with the next
                # label. KeyboardInterrupt/SystemExit are not swallowed.
                warnings.warn(f"Skipping label {label!r}: {err!r}")

# Build the shuffled label list using the default per-category limits.
labels = generate_labels()
# Image generation is disabled here; uncomment to render the labels into ./data.
# generate_images(labels, './data')



