#### Convert annotations in VIA format to the JSON label format that the detection training code can read.

In [13]:
# import packages
from tqdm import tqdm
import json
import pandas as pd
import numpy as np
import os, sys
from typing import Any, Dict, List, Optional, Tuple, Union
import hashlib
import cv2
import random
# seed Python's `random` module, which the train/valid shuffle further down uses
random.seed(7)

In [None]:
'''
Example format of the labels.json file:

{
    "sample_img_01.png": {
        'img_dimensions': (900, 600),
        'img_hash': "theimagedumpmyhash",
        'polygons': [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], ...]
     },
     "sample_img_02.png": {
        'img_dimensions': (900, 600),
        'img_hash': "thisisahash",
        'polygons': [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], ...]
     }
     ...
}
'''

In [4]:
# helper functions

# via2df on ocr tokens
def via2df(via_tokens: Dict[str, Any], exclude_list: Optional[List[str]] = None) -> pd.DataFrame:
    """Flatten the annotated regions of a VIA project into a token table.

    Each entry of ``via_tokens['_via_img_metadata']`` describes one page
    image. Its ``filename`` is split on the last ``'+'`` into a document id
    and a page file name; the last four characters of the page file name are
    dropped to obtain the page number (this assumes a four-character suffix
    such as ``'.png'`` -- TODO confirm for other image types).

    Args:
        via_tokens: Parsed VIA project dict containing ``_via_img_metadata``.
        exclude_list: Document ids whose pages are skipped. ``None`` (the
            default) excludes nothing.

    Returns:
        A DataFrame with one row per region and the columns ``doc_ids``,
        ``page_ids``, ``page_num``, ``x``, ``y``, ``width``, ``height`` and
        ``text``. It is empty (but has these columns) when no region is kept.

    Raises:
        IndexError: If a filename contains no ``'+'`` separator.
        KeyError: If a region lacks one of the rectangle attributes or the
            ``text`` region attribute.
    """
    # A None default avoids sharing one mutable list object across calls.
    excluded = [] if exclude_list is None else exclude_list
    columns = ['doc_ids','page_ids','page_num','x','y','width','height','text']
    rows = []

    # Only the metadata values are needed; the VIA keys are not used.
    for value in tqdm(via_tokens['_via_img_metadata'].values()):
        name_parts = value['filename'].rsplit('+', maxsplit=1)
        doc_id = name_parts[0]
        page_num = name_parts[1][:-4]
        page_id = doc_id + '+' + page_num

        if doc_id in excluded:
            continue

        for region in value['regions']:
            shape = region['shape_attributes']
            rows.append((
                doc_id,
                page_id,
                page_num,
                shape['x'],
                shape['y'],
                shape['width'],
                shape['height'],
                region['region_attributes']['text'],
            ))

    return pd.DataFrame(rows, columns=columns)


# column name -> Python type for standardizing the token table's datatypes
# (not referenced in the visible cells; presumably meant for DataFrame.astype)
convert_dict = dict(
    x=int,
    y=int,
    width=int,
    height=int,
    page_num=str,
)


In [2]:
# input and output files and directories

# TODO: change to the path to the via file
# (despite its name, `via_dir` points at the VIA project JSON file itself)
via_dir = '/home/mzhao/Data/work/DocAI/src/cba/data/us_dl/output_manually_annotated.json'

# read the whole export and parse it into a Python dict
with open(via_dir) as data_file:
    via_file = json.loads(data_file.read())

In [29]:
# directory holding the page images; convert_labels looks up
# '<page_id>.png' inside it
# TODO: change to the path to the image file directory
img_dir = '/home/mzhao/Data/work/DocAI/src/cba/data/us_dl/images'
# directory the '<label_type>_labels.json' files are written to
label_dir = '/home/mzhao/Data/work/DocAI/src/cba/data/us_dl/'


In [5]:
# flatten the VIA annotations into a DataFrame with one row per annotated region
ocr_df = via2df(via_file)

100%|██████████| 125/125 [00:00<00:00, 38681.42it/s]


In [21]:
# collect the unique page ids found in the annotation file
# (note: these are page ids, i.e. '<doc_id>+<page_num>', not document ids)
doc_list = ocr_df['page_ids'].unique().tolist()

In [26]:
# shuffle the pages in place, then cut the list into train / valid parts
train_valid_split = 0.9
random.shuffle(doc_list)
_split_at = int(len(doc_list) * train_valid_split)  # first index of the valid part
train_list, valid_list = doc_list[:_split_at], doc_list[_split_at:]


In [38]:
# format the label file

def convert_labels(img_dir: str, doc_list: List[str], ocr_df: pd.DataFrame, output_dir: str, label_type: str = 'train') -> None:
    """Write the detection label file for one split.

    For every page id in ``doc_list`` the image ``<img_dir>/<page_id>.png``
    is hashed and decoded, and the page's boxes in ``ocr_df`` are turned into
    four-point polygons. The resulting mapping is written to
    ``<output_dir>/<label_type>_labels.json`` and the first entry is printed
    as a sanity check.

    Args:
        img_dir: Directory containing the ``.png`` page images.
        doc_list: Page ids (values of ``ocr_df.page_ids``) to include.
        ocr_df: Token table with ``page_ids``, ``x``, ``y``, ``width`` and
            ``height`` columns.
        output_dir: Directory the JSON file is written to.
        label_type: Prefix of the output file name, e.g. ``'train'``.

    Raises:
        FileNotFoundError: If a page image does not exist.
        ValueError: If a page image exists but OpenCV cannot decode it.
    """
    labels: Dict[str, Any] = {}

    for filename in doc_list:
        img_name = filename + '.png'
        img_path = os.path.join(img_dir, img_name)

        # Hash the raw file bytes. Opening the file first also turns a
        # missing image into a clear FileNotFoundError.
        with open(img_path, 'rb') as f:
            img_hash = hashlib.sha256(f.read()).hexdigest()

        # cv2.imread signals failure by returning None instead of raising.
        img = cv2.imread(img_path)
        if img is None:
            raise ValueError(f'Could not decode image: {img_path}')

        boxes = ocr_df.loc[ocr_df.page_ids == filename, ['x', 'y', 'width', 'height']].values.tolist()

        # Corners in the order: top-left, top-right, bottom-right, bottom-left.
        polygons = [
            [[x, y], [x + w, y], [x + w, y + h], [x, y + h]]
            for x, y, w, h in boxes
        ]

        labels[img_name] = {
            # first two entries of the array shape: (rows, cols) = (height, width)
            'img_dimensions': img.shape[:2],
            'img_hash': img_hash,
            'polygons': polygons,
        }

    # print 1 example of the label file; an empty split has nothing to show
    if labels:
        print(next(iter(labels.values())))

    # dump the json file
    with open(os.path.join(output_dir, f'{label_type}_labels.json'), 'w') as json_file:
        json.dump(labels, json_file)

In [39]:
# write one label file per split: train_labels.json and valid_labels.json
for _label_type, _page_list in (('train', train_list), ('valid', valid_list)):
    convert_labels(img_dir, _page_list, ocr_df, label_dir, label_type=_label_type)

{'img_dimensions': (1200, 1600), 'img_hash': 'ba9548eed30e45c5321b5098c5480bdf5f3b702439e35a2ef90262b33bacb4f5', 'polygons': [[[802, 290], [1119, 290], [1119, 351], [802, 351]], [[132, 550], [217, 550], [217, 591], [132, 591]], [[185, 599], [437, 599], [437, 645], [185, 645]], [[404, 698], [468, 698], [468, 740], [404, 740]], [[114, 789], [195, 789], [195, 847], [114, 847]], [[209, 790], [318, 790], [318, 844], [209, 844]], [[336, 791], [473, 791], [473, 839], [336, 839]], [[627, 795], [734, 795], [734, 841], [627, 841]], [[751, 796], [881, 796], [881, 842], [751, 842]], [[204, 885], [302, 885], [302, 943], [204, 943]], [[311, 889], [448, 889], [448, 936], [311, 936]], [[579, 891], [678, 891], [678, 937], [579, 937]], [[438, 279], [784, 279], [784, 347], [438, 347]], [[516, 346], [757, 346], [757, 389], [516, 389]], [[766, 350], [1053, 350], [1053, 394], [766, 394]], [[1111, 395], [1267, 395], [1267, 435], [1111, 435]], [[1261, 393], [1321, 393], [1321, 437], [1261, 437]], [[1101, 431]