In [None]:
import json
import splitfolders

from PIL import Image
from pdf2image import convert_from_path
import shutil
import pathlib

import os
from os import listdir

from Trail import Trail
from image_preprocess import get_image_name, text_from_images_in_folder

# Base location of the data. It does not fit on the local disk, so this points
# at an external storage volume.
KNIHOVNIK = '/Volumes/Documents/BAR/skola/diplomka/image_classification/data/'

# Resized images are written here.
DATA_DIR = './data/resized_images'

# Labeled images are written here.
SORTED_DIR = f'{KNIHOVNIK}labeled_images'

# The train/test split places the training and validation sets here.
TRAIN_TEST_DIR = 'data/output'

# Puzzles pulled from the databank, manually sorted into easy/hard.
COURSES_DIR = f'{KNIHOVNIK}sifry_z_databanky'

# Puzzles pulled from the DNEM editions, manually sorted into easy/hard.
DNEM_DIR = f'{KNIHOVNIK}DNEM'

# Trail images (labeled automatically from game data).
TRAILS_DIR = f'{KNIHOVNIK}trails'

# All puzzles in their original size, for better text extraction.
ORIGINALS_DIR = f'{KNIHOVNIK}images'

# Class labels; also the names of the per-label sub-folders.
LABELS = ['easy', 'hard']

IMG_SIZE = 512
COUNTER = 0

In [None]:
# Lookup table read from JSON; get_label indexes it by trail id to obtain the
# name passed to Trail(...).
with open("data/trail_dict_shortcuts.json") as shortcuts_file:
    trail_dict = json.load(shortcuts_file)

# Cache of already constructed Trail objects, filled lazily by get_label.
my_trails = {}

## Image preprocessing

In [None]:
def get_label(img_name, threshold, print_table=False):
    """Label a trail image as 'easy' or 'hard' based on its task's peak value.

    The name returned by ``get_image_name(img_name)`` is split on "_": the
    first part is the trail id (a key of ``trail_dict``), the second part is
    the 1-based task number. ``Trail`` objects are cached in ``my_trails`` so
    each trail is constructed only once.

    Args:
        img_name: File name of the image.
        threshold: Peak value above which the task counts as 'hard'. It is
            compared directly with the values from ``Trail.get_peaks()``
            (presumably seconds, since the table prints ``peak / 60`` -- TODO
            confirm against Trail).
        print_table: When True, also print LaTeX table rows: a header when a
            trail is first seen, and one row per task with the task number,
            peak / 60, label and the split ('val' or 'train') the image is
            found in.

    Returns:
        'hard' if the task's peak exceeds ``threshold``, otherwise 'easy'.

    Raises:
        KeyError: If the trail id is not present in ``trail_dict``.
        ValueError: If the task-number part of the name is not an integer.
        IndexError: If the name has no task part or the task number is past
            the end of the peaks list.
    """
    # get game name and task number
    my_name, _ = get_image_name(img_name)
    name_parts = my_name.split("_")
    trail_id = name_parts[0]
    task_no = int(name_parts[1])

    if trail_id in my_trails:
        trail = my_trails[trail_id]
    else:
        trail = Trail(trail_dict[trail_id])
        if print_table:
            print("\\hline")
            print("\\hline")
            # Backslashes are escaped explicitly; "\m" and "\h" are invalid
            # escape sequences that newer Python versions warn about.
            print("\\multicolumn{4}{c}{hra " + trail.name + "}  \\\\ \\hline")
        my_trails[trail_id] = trail

    # Task numbers are 1-based, the peaks list is 0-based.
    my_peak = trail.get_peaks()[task_no - 1]
    label = 'hard' if my_peak > threshold else 'easy'

    if print_table:
        # The split is only needed for the table row, so the filesystem is
        # only probed when a row is actually printed.
        val_path = TRAIN_TEST_DIR + '/val' + '/' + label + '/' + img_name
        sada = 'val' if os.path.exists(val_path) else 'train'
        print(str(task_no) + ' & ' + str(round(my_peak/60, 1)) + ' & ' + label + ' & ' + sada + ' \\\\')
    return label

In [None]:
def resize_image(img_name, size, source_dir, threshold, print_table, counter=0, given_label='easy'):
    """Resize one image to ``size`` x ``size`` and save it under its label folder.

    The result is written as PNG to ``DATA_DIR/<label>/<img_name>``.

    Args:
        img_name: File name of the image inside ``source_dir``.
        size: Edge length in pixels of the square output image.
        source_dir: Directory containing the image.
        threshold: 0 means "use ``given_label``"; any other value is passed to
            ``get_label`` to compute the label.
        print_table: When True nothing is read or written; the flag is only
            forwarded to ``get_label``.
        counter: Running count of processed images.
        given_label: Label used when ``threshold`` is 0.

    Returns:
        ``counter + 1``.
    """
    if threshold == 0:
        label = given_label
    else:
        label = get_label(img_name, threshold, print_table)

    if not print_table:
        label_path = os.path.join(DATA_DIR, label)
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(label_path, exist_ok=True)

        # The context manager closes the source file handle deterministically,
        # which matters when this is called for many images in a loop.
        with Image.open(os.path.join(source_dir, img_name)) as image:
            resized_image = image.resize((size, size))
            resized_image.save(os.path.join(label_path, img_name), "PNG")

    return counter + 1

In [None]:
def label_image(img_name, source_dir, threshold, print_table, counter=0, given_label='easy'):
    """Save one image, unresized, under its label folder in ``SORTED_DIR``.

    The image is re-saved as PNG to ``SORTED_DIR/<label>/<img_name>``.

    Args:
        img_name: File name of the image inside ``source_dir``.
        source_dir: Directory containing the image.
        threshold: 0 means "use ``given_label``"; any other value is passed to
            ``get_label`` to compute the label.
        print_table: When True nothing is read or written; the flag is only
            forwarded to ``get_label``.
        counter: Running count of processed images.
        given_label: Label used when ``threshold`` is 0.

    Returns:
        ``counter + 1``.
    """
    if threshold == 0:
        label = given_label
    else:
        label = get_label(img_name, threshold, print_table)

    if not print_table:
        label_path = os.path.join(SORTED_DIR, label)
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(label_path, exist_ok=True)

        # The context manager closes the source file handle deterministically,
        # which matters when this is called for many images in a loop.
        with Image.open(os.path.join(source_dir, img_name)) as image:
            image.save(os.path.join(label_path, img_name), "PNG")

    return counter + 1

In [None]:
def convert_pdf_to_png(img_name, tmp_dir, source_dir, save_converted_image=True):
    """Render the first page of a PDF to a PNG file.

    The PNG is always written to ``tmp_dir`` and, when ``save_converted_image``
    is True, also to ``ORIGINALS_DIR``. Both directories are created if they
    do not exist.

    Args:
        img_name: File name of the PDF inside ``source_dir``.
        tmp_dir: Directory receiving the converted PNG.
        source_dir: Directory containing the PDF.
        save_converted_image: Also keep a copy of the PNG in ``ORIGINALS_DIR``.

    Returns:
        The file name (not the path) of the PNG: ``img_name`` with its final
        extension replaced by ``.png``.
    """
    img_path = os.path.join(source_dir, img_name)
    os.makedirs(tmp_dir, exist_ok=True)

    # Replace only the final extension. Splitting on the first "." would turn
    # "a.b.pdf" into "a.png" and could make different PDFs collide.
    new_image_name = os.path.splitext(img_name)[0] + ".png"
    tmp_path = os.path.join(tmp_dir, new_image_name)

    # Only the first page is used, so only the first page is rendered.
    pages = convert_from_path(img_path, first_page=1, last_page=1)
    first_page = pages[0]
    first_page.save(tmp_path, 'PNG')

    if save_converted_image:
        os.makedirs(ORIGINALS_DIR, exist_ok=True)
        originals_path = os.path.join(ORIGINALS_DIR, new_image_name)
        # When tmp_dir is ORIGINALS_DIR the file has just been written there;
        # do not encode and write it a second time.
        if os.path.abspath(originals_path) != os.path.abspath(tmp_path):
            first_page.save(originals_path, 'PNG')

    return new_image_name

In [None]:
def process_imgs_with_label(my_source_dir, counter, print_table):
    """Label manually sorted images found in one sub-folder per label.

    For every entry of ``LABELS`` the folder ``my_source_dir/<label>`` is
    handed to ``sort_images`` with that label and a threshold of 0.

    Args:
        my_source_dir: Directory containing one sub-folder per label.
        counter: Running count of processed images.
        print_table: Forwarded unchanged to ``sort_images``.

    Returns:
        The counter as returned by the last ``sort_images`` call.
    """
    # The threshold is fixed at 0 for every label folder.
    no_threshold = 0
    for label in LABELS:
        label_dir = my_source_dir + '/' + label
        counter = sort_images(label_dir, label, no_threshold, counter, print_table)
    return counter

In [None]:
def sort_images(source_dir, given_label, threshold=0, counter=0, print_table=False):
    """Label every image in ``source_dir``, converting PDFs to PNG first.

    Entries are processed in sorted order. Hidden entries (names starting with
    ".") are skipped. PDFs are converted into the scratch folder ``tmp`` via
    ``convert_pdf_to_png`` and the resulting PNG is labeled; every other entry
    is passed to ``label_image`` as it is.

    Args:
        source_dir: Directory whose entries are processed.
        given_label: Label forwarded to ``label_image``.
        threshold: Forwarded to ``label_image``.
        counter: Running count of processed images.
        print_table: Forwarded to ``label_image``.

    Returns:
        The updated counter; it is also printed once the directory is done.
    """
    tmp_dir = 'tmp'
    for image in sorted(os.listdir(source_dir)):
        # Hidden entries must be rejected before the extension test: macOS
        # creates "._name.pdf" sidecar files on external volumes, and those
        # are not valid PDFs.
        if image.startswith("."):
            continue
        # Case-insensitive so that "X.PDF" is converted rather than being
        # handed to the image loader.
        if image.lower().endswith(".pdf"):
            new_image_name = convert_pdf_to_png(image, tmp_dir, source_dir)
            counter = label_image(new_image_name, tmp_dir, threshold, print_table, counter, given_label)
        else:
            counter = label_image(image, source_dir, threshold, print_table, counter, given_label)
    print("-------- counter: " + str(counter) + " ----------")
    return counter

In [None]:
def process_images(img_size, source_dir, given_label, threshold=0, counter=0, print_table=False):
    """Resize and label every image in ``source_dir``, converting PDFs to PNG first.

    Entries are processed in sorted order. Hidden entries (names starting with
    ".") are skipped. PDFs are converted into the scratch folder ``tmp`` via
    ``convert_pdf_to_png`` and the resulting PNG is resized; every other entry
    is passed to ``resize_image`` as it is.

    Args:
        img_size: Edge length forwarded to ``resize_image``.
        source_dir: Directory whose entries are processed.
        given_label: Label forwarded to ``resize_image``.
        threshold: Forwarded to ``resize_image``.
        counter: Running count of processed images.
        print_table: Forwarded to ``resize_image``.

    Returns:
        The updated counter; it is also printed once the directory is done.
    """
    tmp_dir = 'tmp'
    for image in sorted(os.listdir(source_dir)):
        # Hidden entries must be rejected before the extension test: macOS
        # creates "._name.pdf" sidecar files on external volumes, and those
        # are not valid PDFs.
        if image.startswith("."):
            continue
        # Case-insensitive so that "X.PDF" is converted rather than being
        # handed to the image loader.
        if image.lower().endswith(".pdf"):
            new_image_name = convert_pdf_to_png(image, tmp_dir, source_dir)
            counter = resize_image(new_image_name, img_size, tmp_dir, threshold, print_table, counter, given_label)
        else:
            counter = resize_image(image, img_size, source_dir, threshold, print_table, counter, given_label)
    print("-------- counter: " + str(counter) + " ----------")
    return counter

In [None]:
# Label all images: PDFs are converted to PNG where necessary and every image
# is saved into its label folder by sort_images / label_image. No resizing
# happens in this cell.
counter=0
# Must match the scratch folder name hard-coded inside sort_images.
tmp_dir = 'tmp'
print_table=False

try:
    # Trail images have data from the games and can be labeled automatically.
    # time limit for a task to be easy => 18 minutes
    threshold = 18*60
    # Not used for trail images because the threshold is non-zero.
    given_label=None
    counter=sort_images(TRAILS_DIR, given_label, threshold, counter, print_table)

    # Manually labeled data: the label is taken from the easy/hard sub-folder.
    counter=process_imgs_with_label(COURSES_DIR, counter, print_table)
    counter=process_imgs_with_label(DNEM_DIR, counter, print_table)
finally:
    # Remove the scratch folder even when one of the steps above fails, so a
    # failed run does not leave converted PNGs behind.
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)


## Split folders to train/test

In [None]:
# Start from a clean output folder, then split the labeled images 80/20.
# NOTE(review): the previous comment said this yields three folders with an
# empty test folder; get_label only looks at the 'val' folder -- confirm the
# actual layout produced for a two-element ratio.
if os.path.exists(TRAIN_TEST_DIR):
    shutil.rmtree(TRAIN_TEST_DIR)
splitfolders.ratio(
    SORTED_DIR,
    output=TRAIN_TEST_DIR,
    ratio=(0.8, 0.2),
)

## Convert all images to png

In [None]:
# Convert every PDF in ORIGINALS_DIR to a PNG stored next to it.
source_dir=ORIGINALS_DIR
for image in os.listdir(source_dir):
    # Skip hidden entries such as the "._name.pdf" sidecar files macOS creates
    # on external volumes; they are not valid PDFs.
    if image.startswith("."):
        continue
    if image.lower().endswith(".pdf"):
        # The target folder is ORIGINALS_DIR itself, so the extra copy that
        # save_converted_image would write is the very same file.
        new_image_name=convert_pdf_to_png(image, source_dir, source_dir, save_converted_image=False)

## Extract text from images

In [None]:
import pytesseract
from pytesseract import Output

# OCR every image in ORIGINALS_DIR; results are keyed by the name returned by
# get_image_name.
texts=dict()

source_dir=ORIGINALS_DIR
# Sorted so that the order of keys in the result is reproducible between runs.
for image in sorted(os.listdir(source_dir)):
    # Only real images go to OCR. The original PDFs still sit next to their
    # PNG conversions, and hidden entries are not images at all.
    # NOTE(review): Tesseract is not expected to accept PDF input -- confirm.
    if image.startswith(".") or image.lower().endswith(".pdf"):
        continue
    my_name,_=get_image_name(image)
    img_path=source_dir+"/"+image
    texts[my_name]=pytesseract.image_to_string(img_path)

In [None]:
# Persist the extracted texts as JSON.
filename = 'data/extracted_texts.json'
with open(filename, "w") as json_file:
    json.dump(texts, json_file)