In [None]:
#default_exp textbox_dataset_sroie2019

# SROIE 2019

> Creates a DataFrame of image file paths and a dict mapping each image filename to `(bboxes, labels)`, where every bbox is `(y1, x1, y2, x2)`.

Download the SROIE 2019 dataset from https://rrc.cvc.uab.es/ and arrange the files as follows:
1. split txt, jpg files from dir `0325updated.task1train(626p)` -> `./data/sroie2019/train_gt`, `./data/sroie2019/train_img`
2. move dir `text.task1_2-test（361p)` -> `./data/sroie2019/test_gt`
3. move dir `task1_2_test(361p)` -> `./data/sroie2019/test_img`
4. split txt, jpg files from dir `0325updated.task2train(626p)` -> `./data/sroie2019/task3_train_gt`, `./data/sroie2019/task3_train_img`
5. move dir `task3-test（347p)` -> `./data/sroie2019/task3_test_img`
6. Make sure that every image has exactly one ground-truth (`.txt`) file with the same base name!

In [None]:
#export
from ocr.core import save_dict, read_dict, plot
from fastai import *
from fastai.vision import *
import pandas as pd
import numpy as np
import cv2
from tqdm.notebook import tqdm

In [None]:
#export
class sroie_textbox_config:
    ''' Filesystem locations used by the SROIE 2019 textbox dataset code. '''
    # Dataset root; the '<mode>_img' and '<mode>_gt' sub-directories are joined onto it.
    MAIN_DIR = "../data/sroie2019/"
    # Path of the saved filename -> (bboxes, labels) dict (a file path, despite the name).
    FILE_DIR = "../data/textbox/sroie2019bbox.pickle"

In [None]:
#export
def read_data(csv_path='images/X00016469670.txt'):
    ''' returns [([4,2], str),...]

    Parses an annotation file whose lines look like
    `x1,y1,x2,y2,x3,y3,x4,y4,text`.

    csv_path: path to the utf8-encoded annotation file.
    returns: list of `[points, label]`; `points` is an int array of shape
        [4,2] with the corners in file order, `label` is everything after
        the eighth comma, taken verbatim (it may contain commas or quotes).
    raises: ValueError if one of the first eight fields of a kept line is
        not an integer.

    Lines with fewer than nine comma-separated fields (e.g. blank lines)
    are skipped.
    '''
    out = []
    with open(csv_path, encoding="utf8") as csv_file:
        for raw_line in csv_file:
            # Split by hand instead of using csv.reader: with its default
            # quotechar a label starting with `"` opens a quoted field that
            # keeps consuming the following lines, merging annotations.
            fields = raw_line.rstrip('\n').split(',', 8)
            if len(fields) > 8:
                x1, y1, x2, y2, x3, y3, x4, y4 = map(int, fields[:8])
                label = fields[8]
                points = np.array([ [x1, y1], [x2, y2], [x3, y3], [x4, y4] ])
                out.append([points, label])
    return out

In [None]:
# Build {image filename: (bboxes, labels)} for the train and test splits and
# store it with save_dict at sroie_textbox_config.FILE_DIR.
filename2bbs = {}
total = 0

for mode in ['train', 'test']:
    img_dir = os.path.join(sroie_textbox_config.MAIN_DIR, mode + '_img')
    gt_dir = os.path.join(sroie_textbox_config.MAIN_DIR, mode + '_gt')
    filenames = os.listdir(img_dir)
    for fn in tqdm(filenames, total=len(filenames)):
        # Derive the annotation name from the real extension instead of
        # assuming the extension is exactly three characters long.
        stem = os.path.splitext(fn)[0]
        gt = read_data(os.path.join(gt_dir, stem + '.txt'))
        im_path = os.path.join(img_dir, fn)
        im = cv2.imread(im_path)
        if im is None:
            # cv2.imread reports failure by returning None rather than raising,
            # which would otherwise surface as an AttributeError on `.shape`.
            raise OSError('could not read image: ' + im_path)
        height, width = im.shape[:2]
        bboxes = []
        for points, _ in gt:
            # Axis-aligned box around the 4 corner points, clamped to the image.
            x1, y1 = np.maximum(points.min(axis=0), 0)
            x2, y2 = np.minimum(points.max(axis=0), (width, height))
            bboxes.append(( y1, x1, y2, x2 )) # [y1,x1, y2,x2]
        total += len(bboxes)

        # every box gets the same class label
        filename2bbs[fn] = (bboxes, ['line'] * len(bboxes))

print('total bboxes:', total)
save_dict(filename2bbs, sroie_textbox_config.FILE_DIR)

HBox(children=(IntProgress(value=0, max=703), HTML(value='')))




HBox(children=(IntProgress(value=0, max=344), HTML(value='')))


total bboxes: 55922


In [None]:
#export
def get_filename2bboxes_dict():
    ''' returns whatever read_dict loads from sroie_textbox_config.FILE_DIR
    (presumably the filename -> (bboxes, labels) dict saved by this notebook) '''
    saved_path = sroie_textbox_config.FILE_DIR
    filename2bboxes = read_dict(saved_path)
    return filename2bboxes

In [None]:
# Reload the saved dict and look at one image's entry as a sanity check.
filename2bbs = get_filename2bboxes_dict()
bbs, labels = filename2bbs['X51006327978.jpg']
# Show the first five boxes (stored as (y1, x1, y2, x2)) with their labels.
bbs[:5], labels[:5]

([(249, 55, 286, 472),
  (290, 54, 324, 641),
  (332, 54, 367, 502),
  (372, 55, 405, 312),
  (414, 55, 445, 432)],
 ['line', 'line', 'line', 'line', 'line'])

In [None]:
#export
def create_df():
    ''' returns DataFrame[image_path, valid, dataset] with one row per file
    found in the train_img and test_img directories; `valid` is True only
    for files of the test split and `dataset` is always 'sroie2019' '''
    rows = []
    for split in ('train', 'test'):
        img_dir = os.path.join(sroie_textbox_config.MAIN_DIR, split + '_img')
        is_valid = split == 'test'
        rows.extend(
            (os.path.join(img_dir, name), is_valid, 'sroie2019')
            for name in os.listdir(img_dir)
        )
    return pd.DataFrame(rows, columns=['image_path', 'valid', 'dataset'])

In [None]:
# Preview the frame: one row per image file, `valid` is True for the test split.
create_df()

Unnamed: 0,image_path,valid,dataset
0,../data/sroie2019/train_img/X51006327978.jpg,False,sroie2019
1,../data/sroie2019/train_img/X51006619550.jpg,False,sroie2019
2,../data/sroie2019/train_img/X51005361883.jpg,False,sroie2019
3,../data/sroie2019/train_img/X51008123451.jpg,False,sroie2019
4,../data/sroie2019/train_img/X51005361900.jpg,False,sroie2019
...,...,...,...
1042,../data/sroie2019/test_img/X51006619564.jpg,True,sroie2019
1043,../data/sroie2019/test_img/X51005605287.jpg,True,sroie2019
1044,../data/sroie2019/test_img/X51005568894.jpg,True,sroie2019
1045,../data/sroie2019/test_img/X51005677333.jpg,True,sroie2019
