In [None]:
#default_exp ocr_dataset_brno

# BRNO

> [image] -> [text]

Download the dataset from https://pero.fit.vutbr.cz/brno_mobile_ocr_dataset (cropped text lines).  
Expected directory structure:  
```
./data/brno/
    | lines/
        | test.easy
        ...
        lines/
            | ....jpg
            ...
```
1. copy files from `./data/brno/lines/lines/` to `./data/ocr/brno_lines/`

In [None]:
#export
from ocr.core import *
from ocr.core import save_dict, read_dict
from fastai import *
from fastai.vision import *
import pandas as pd
import numpy as np
import cv2
from tqdm.notebook import tqdm
from pathlib import PosixPath

In [None]:
#export
class brno_ocr_config:
    """Constants and filesystem locations for the Brno OCR line dataset."""

    # --- filesystem layout ---------------------------------------------
    # Dataset root; the annotation files are read from MAIN_DIR/'lines'.
    MAIN_DIR = PosixPath('../data/brno/')
    # Directory that image file names from the annotation files are joined onto.
    LINES_DIR = PosixPath('../data/ocr/brno_lines/')
    # NOTE(review): presumably the location of the pickled ground truth — confirm against users.
    FILE_PATH = PosixPath('../data/ocr/brno_gt.pickle')

    # --- image geometry ------------------------------------------------
    # NOTE(review): presumably the pixel height text-line images are resized to — confirm.
    LINE_HEIGHT = 48
    # Padding applied only for training.
    PAD = 0
    # Padding for the test phase; optional, can be set to zero.
    TEST_PAD = 0

In [None]:
# One path per entry found in the line-image directory (lazy, consumed once by the DataFrame).
paths = (
    brno_ocr_config.LINES_DIR/entry
    for entry in os.listdir(brno_ocr_config.LINES_DIR)
)
filenames_df = pd.DataFrame(paths, columns=['image_path'])
# Show only the first few rows as a sanity check.
filenames_df.head()

Unnamed: 0,image_path
0,../data/ocr/brno_lines/385ff8bcbdd78e3d0dfdeac...
1,../data/ocr/brno_lines/59cd3ea8d5ce8a8dc68f3b2...
2,../data/ocr/brno_lines/5124eae8ab02cc2969d8e60...
3,../data/ocr/brno_lines/9926efeb1d632fd62906383...
4,../data/ocr/brno_lines/efb03b607fb7519922009e9...


In [None]:
#export
def create_df():
    """Build the ground-truth table for the Brno line dataset.

    Reads the annotation files ``MAIN_DIR/lines/<split>.<difficulty>`` for the
    ``train`` and ``valid`` splits and the ``easy``/``medium``/``hard``
    difficulties. The ``test`` split is intentionally not loaded. Every
    non-empty line is parsed as ``<image file name> <transcription>``, split at
    the first space.

    Returns:
        pd.DataFrame: one row per annotated line with the columns
            ``image_path`` (``LINES_DIR`` joined with the file name),
            ``string`` (the transcription, empty if the line has no text),
            ``valid`` (True for rows from the ``valid`` split) and
            ``dataset`` (``'brno_'`` + difficulty).

    Raises:
        OSError: if one of the annotation files cannot be opened.
    """
    rows = []
    for data_split in ('train', 'valid'):
        is_valid = data_split == 'valid'
        for difficulty in ('easy', 'medium', 'hard'):
            data_info = data_split + '.' + difficulty
            # Explicit encoding so decoding does not depend on the platform locale.
            # NOTE(review): assumes the annotation files are UTF-8 — confirm.
            with open(brno_ocr_config.MAIN_DIR/'lines'/data_info, "r", encoding="utf-8") as f:
                for line in f:
                    # Strip only the newline: slicing off the last character would
                    # truncate a final line that has no trailing newline.
                    line = line.rstrip('\n')
                    if not line:
                        # A blank line carries no file name; skip it instead of
                        # emitting a row that points at LINES_DIR itself.
                        continue
                    # Everything after the first space is the transcription, which
                    # may itself contain (repeated) spaces.
                    fn, _, string = line.partition(' ')
                    rows.append((brno_ocr_config.LINES_DIR/fn, string, is_valid, 'brno_'+difficulty))
    return pd.DataFrame(rows, columns=['image_path', 'string', 'valid', 'dataset'])

In [None]:
# Build the ground-truth table (train + valid rows for every difficulty) and display it.
create_df()

Unnamed: 0,image_path,string,valid,dataset
0,../data/ocr/brno_lines/18256865b4b14a458cd50e1...,on-board the USS CORONADO; a deployable CMOC,False,brno_easy
1,../data/ocr/brno_lines/d2ac2da7d29d2384f6ff36e...,unemployment insurance.,False,brno_easy
2,../data/ocr/brno_lines/55ef7096ba457a8990e05da...,bound was calculated. The upper bound was calc...,False,brno_easy
3,../data/ocr/brno_lines/145c6a2d996117b64538681...,"process elements, so that the requestor agent",False,brno_easy
4,../data/ocr/brno_lines/30b35d49bf5250d675bdb8e...,we have to skolemise both using the same skole...,False,brno_easy
...,...,...,...,...
455830,../data/ocr/brno_lines/fcd4b3e2e158cbe48bc20f8...,"cured in Milwaukee, WI, in 1993 where a harmfu...",True,brno_hard
455831,../data/ocr/brno_lines/fb8c3be8a0f030f8f23db3a...,"F-measure (Hatzivassiloglou and McKeown, 1993;",True,brno_hard
455832,../data/ocr/brno_lines/c0e71aab6644bc428f6b1cb...,Gene ontology (GO),True,brno_hard
455833,../data/ocr/brno_lines/d5584385f9a98562055cc23...,buffered at pH 8.2 with borate buffer through 100,True,brno_hard
