In [168]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import cv2
import pandas as pd
import json
from datetime import datetime

from PIL import Image

from pytesseract import Output
import pytesseract
import tensorflow as tf
import shutil
import os
from tqdm.notebook import tqdm

# Functions

In [179]:
def read_image(image_path,filename, pytesseract_path):
    """Run Tesseract character-box OCR on a preprocessed copy of one image.

    The image is converted to grayscale, blurred, morphologically opened with
    a 2x2 kernel and binarised with an inverted Otsu threshold before being
    handed to ``pytesseract.image_to_boxes``.

    Args:
        image_path: Directory prefix. It is concatenated directly with
            ``filename``, so it must already end with a path separator.
        filename: Name of the image file inside ``image_path``.
        pytesseract_path: Location of the Tesseract executable; assigned to
            ``pytesseract.pytesseract.tesseract_cmd`` on every call.

    Returns:
        The raw value returned by ``pytesseract.image_to_boxes`` for the
        thresholded image (``lang='eng'``).

    Raises:
        FileNotFoundError: If OpenCV could not read the image file.
    """
    pytesseract.pytesseract.tesseract_cmd = pytesseract_path
    full_path = image_path+filename
    img = cv2.imread(full_path)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising;
        # fail here with the offending path instead of deep inside cvtColor.
        raise FileNotFoundError(f"Could not read image: {full_path}")
    gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
    # NOTE(review): a (1, 1) kernel presumably leaves the image unchanged --
    # confirm whether a larger blur was intended.
    blurred = cv2.GaussianBlur(gray, (1, 1), 0)
    kernel = np.ones((2,2),np.uint8)
    opening = cv2.morphologyEx(blurred, cv2.MORPH_OPEN, kernel)
    # threshold() returns (retval, image); only the image is needed.
    img = cv2.threshold(opening, 0, 255, cv2.THRESH_BINARY_INV+ cv2.THRESH_OTSU)[1]
    data = pytesseract.image_to_boxes(img, lang='eng')
    return data
        
def find_cordes_from_data(data):
    """Parse multi-line OCR box text into a list of per-line entries.

    Each line of ``data`` is split on whitespace and only its first five
    tokens are kept. Tokens that parse as integers are converted to ``int``;
    every other token is kept as a string. Note that this also applies to the
    first token, so a recognised digit character is stored as an ``int``.

    Args:
        data: String with one box per line.

    Returns:
        A list with one list per input line, holding up to five values.
    """
    cordes = []
    for box in data.splitlines():
        cors = []
        for token in box.split()[0:5]:
            try:
                cors.append(int(token))
            except ValueError:
                # Not a number (normally the recognised character): keep as-is.
                cors.append(token)
        cordes.append(cors)
    return cordes

def find_phrase_from_cordes(cordes):
    """Build the recognised phrase and drop "~" / ":" entries from ``cordes``.

    The first element of every entry is treated as the recognised character.
    Entries whose character is "~" or ":" are removed from ``cordes`` itself
    (the caller's list is modified in place); the remaining characters are
    concatenated in order.

    Args:
        cordes: List of entries whose first element is the character.

    Returns:
        The concatenated characters as a string, with any "~" or ":" removed.
    """
    kept_entries = []
    letters = []
    for entry in cordes:
        symbol = entry[0]
        if symbol == "~" or symbol == ":":
            continue
        kept_entries.append(entry)
        letters.append(str(symbol))
    # Slice assignment keeps the same list object the caller is holding.
    cordes[:] = kept_entries
    return "".join(letters).replace("~","").replace(":", "")

def find_new_corde(phrase, identity, cordes):
    """Select the entries of ``cordes`` that correspond to the label text.

    The OCR phrase may begin with a printed header word ("PRENOM" or "NOM").
    If it does, and the rest of the phrase is exactly as long as ``identity``,
    the header's entries are dropped from the front of ``cordes``. Otherwise
    the phrase is accepted only when it is exactly as long as ``identity``.
    Matching is by length only; the characters themselves are not compared.

    Args:
        phrase: Recognised text, expected to have one character per entry in
            ``cordes``.
        identity: Ground-truth label text.
        cordes: Per-character entries aligned with ``phrase``.

    Returns:
        ``(new_corde, is_succesful)``: the selected entries and ``True`` on a
        match, or ``([], False)`` when the lengths cannot be reconciled.
    """
    # The header must be a prefix: slicing ``cordes`` from the front is only
    # valid when the header characters are the first ones recognised.
    for pre in ("PRENOM", "NOM"):
        if phrase.startswith(pre) and len(pre) + len(identity) == len(phrase):
            return cordes[len(pre):], True

    if len(identity) == len(phrase):
        return cordes, True

    return [], False

def create_coor_dict(new_corde, is_succesful,filename, identity, letter_coor_dict,image_path):
    """Store per-letter bounding boxes for one image in ``letter_coor_dict``.

    Nothing is stored when ``is_succesful`` is false. Otherwise each entry of
    ``new_corde`` is paired by position with a character of ``identity`` and
    written as a dict with keys "letter", "left", "upper", "right", "lower".
    The vertical values are subtracted from the image height, i.e. the y axis
    is flipped (presumably from a bottom-origin to a top-origin convention --
    confirm against the OCR output format).

    Args:
        new_corde: Entries indexed as ``[char, v1, v2, v3, v4]``; index 1 and
            3 become "left"/"right", index 4 and 2 become "upper"/"lower".
        is_succesful: Whether ``new_corde`` is valid for this image.
        filename: Image file name; with ".jpg" removed it becomes the key.
        identity: Label text supplying the "letter" values.
        letter_coor_dict: Dict updated in place and returned.
        image_path: Directory prefix, concatenated directly with ``filename``.

    Returns:
        ``letter_coor_dict`` (the same object that was passed in).
    """
    if not is_succesful:
        return letter_coor_dict

    # Only the height is needed; the context manager closes the file handle
    # instead of leaving it open for every processed image.
    with Image.open(image_path+filename) as img_pilow:
        height = img_pilow.size[1]

    temp_list = [
        {
            "letter": identity[idx],
            "left": corde[1],
            "upper": height-corde[4],
            "right": corde[3],
            "lower": height-corde[2],
        }
        for idx, corde in enumerate(new_corde)
    ]
    name = filename.replace('.jpg', '')
    letter_coor_dict[name] = temp_list
    return letter_coor_dict

    
def save_coor_dict(letter_coor_dict,data_path, type):
    """Dump ``letter_coor_dict`` as JSON into a timestamped file.

    The file is created as ``<data_path>/<type>_<dd_mm_YYYY_HH_MM_SS>.json``
    using the current local time.

    Args:
        letter_coor_dict: JSON-serialisable object to write.
        data_path: Directory in which the file is created.
        type: Prefix for the file name (e.g. the dataset split).
    """
    timestamp = datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
    target = f"{data_path}/{type}_{timestamp}.json"
    with open(target, "w") as handle:
        json.dump(letter_coor_dict, handle)
            

# Process

In [190]:
# Dataset split to process; it selects both the image folder and the label CSV.
# NOTE(review): `type` (and `dict` below) shadow Python builtins. They are kept
# because the processing loop and the save call refer to these exact names.
type = "train"
image_path = f"C:/Users/Intel PC G5900/Desktop/{type}_v2/{type}/"
csv_path = f"data/Csv/written_name_{type}_v2.csv"
data_path = "data/"
pytesseract_path = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Load the labels, discard rows without an IDENTITY value and renumber the rows.
df = pd.read_csv(csv_path).dropna(subset=['IDENTITY']).reset_index(drop=True)
# One record (column name -> value) per remaining row.
dict = df.to_dict(orient='records')

In [191]:
# Letter boxes per image, keyed by the file name without ".jpg".
letter_coor_dict = {}
counter_succ = 0  # images whose OCR phrase could be aligned with the label
counter = 0       # images processed so far
for pair in tqdm(dict[30001:60000]):
    filename, identity = pair['FILENAME'], pair['IDENTITY']
    data = read_image(image_path, filename, pytesseract_path)
    cordes = find_cordes_from_data(data)
    # Also strips the "~" / ":" entries from `cordes` in place.
    phrase = find_phrase_from_cordes(cordes)
    new_corde, is_succesful = find_new_corde(phrase, identity, cordes)
    if is_succesful:
        counter_succ += 1
    counter += 1
    # Running success rate is the last field of each log line.
    print(filename, "///", identity, "///", is_succesful, "///", phrase, "///", round(counter_succ/counter, 3))

    letter_coor_dict = create_coor_dict(new_corde, is_succesful, filename, identity, letter_coor_dict, image_path)

  0%|          | 0/29999 [00:00<?, ?it/s]

TRAIN_30054.jpg /// DORIL /// True /// POETT /// 1.0
TRAIN_30055.jpg /// BOMRGOIN /// True /// Beungoif /// 1.0
TRAIN_30056.jpg /// MIKAILCAN /// False /// ?m;ﬂIKAILCHtl /// 0.667
TRAIN_30057.jpg /// LAURIANE /// True /// LAURYANE /// 0.75
TRAIN_30058.jpg /// EDOUARD /// True /// EDOYARD /// 0.8
TRAIN_30059.jpg /// ELLIOT /// True /// ELLiOT /// 0.833
TRAIN_30060.jpg /// DELCROIX /// False /// DtNOMDECCROTX /// 0.714
TRAIN_30061.jpg /// WISPINE /// False ///  /// 0.625
TRAIN_30062.jpg /// PLOUZENNEC /// False /// DNLNOMPLOUZeNNEC /// 0.556
TRAIN_30063.jpg /// JULIETTE /// True /// TULTETTE /// 0.6
TRAIN_30064.jpg /// ZIRAK /// False /// zZ10AK /// 0.545
TRAIN_30065.jpg /// JOM /// False ///  /// 0.5
TRAIN_30066.jpg /// CLEMENT /// True /// tLidenw /// 0.538
TRAIN_30067.jpg /// PAULINE /// False /// SORENOMTAU.1NF /// 0.5
TRAIN_30068.jpg /// RAPHAEL /// True /// RADH&KL /// 0.533
TRAIN_30069.jpg /// JADE /// True /// JADE /// 0.562
TRAIN_30070.jpg /// MIGUEL MANUEL /// False /// HIGUELM

In [192]:
# Number of images for which letter coordinates were stored.
len(letter_coor_dict)

14135

In [193]:
# Write the collected coordinates to a timestamped JSON file under data_path.
save_coor_dict(letter_coor_dict,data_path, type)

In [None]:
# Load the contents of sample.json into json_object.
# NOTE(review): no visible cell writes 'sample.json' (save_coor_dict uses a
# timestamped file name) -- confirm which file is meant to be read here.
with open('sample.json', 'r') as openfile:
 
    # Reading from json file
    json_object = json.load(openfile)