In [1]:
import numpy as np
import cv2
import os
import pandas as pd
import h5py

In [2]:
# Dataset directories, one per split, given relative to the working directory.
train_folder, test_folder, extra_folder = "./train", "./test", "./extra"
# Target image size; in the visible code it is only referenced by a
# commented-out cv2.resize call.
resize_size = (64, 64)

In [3]:
def collapse_col(row):
    """Collapse the per-digit bounding boxes of one image into a single record.

    Intended to be used with ``DataFrame.groupby('img_name').apply(...)``.

    Parameters
    ----------
    row : pandas.DataFrame
        All rows belonging to one image (one row per digit). Must provide the
        columns 'img_name', 'label', 'top', 'left', 'bottom' and 'right'.

    Returns
    -------
    pandas.Series
        Keys, in order: 'img_name', 'labels' (the digit labels joined with
        '_'), 'top', 'left', 'bottom', 'right' (the box enclosing every digit,
        with 'top' and 'left' clamped to be non-negative), 'width', 'height'
        and 'num_digits'.
    """
    # Enclosing box of all digits; negative top/left coordinates are clamped to 0.
    top = max(int(row['top'].min()), 0)
    left = max(int(row['left'].min()), 0)
    bottom = int(row['bottom'].max())
    right = int(row['right'].max())

    return pd.Series({
        'img_name': row['img_name'].iloc[0],
        # Builtin ``str`` instead of the ``np.str`` alias, which was removed
        # in NumPy 1.24 and raises AttributeError there.
        'labels': row['label'].astype(str).str.cat(sep='_'),
        'top': top,
        'left': left,
        'bottom': bottom,
        'right': right,
        # Width/height are derived from the clamped coordinates.
        'width': right - left,
        'height': bottom - top,
        'num_digits': len(row),
    })

In [4]:
def get_name(index, hdf5_data):
    """Return the image file name stored at position ``index``.

    Parameters
    ----------
    index : int
        Position of the entry inside '/digitStruct/name'.
    hdf5_data : h5py.File (or any mapping with the same layout)
        Open annotation file. ``hdf5_data['/digitStruct/name'][index][0]`` is
        used as a key back into ``hdf5_data``; the data found there is read
        row by row, the first element of each row being one character code.

    Returns
    -------
    str
        The characters joined into a single string.
    """
    name_refs = hdf5_data['/digitStruct/name']
    # ``Dataset.value`` was removed in h5py 3.0; ``dataset[()]`` is the
    # supported way to read the whole dataset into memory.
    char_codes = hdf5_data[name_refs[index][0]][()]
    return ''.join(chr(code[0]) for code in char_codes)

In [5]:
def get_bbox(index, hdf5_data):
    """Return the bounding-box attributes of the image at position ``index``.

    Parameters
    ----------
    index : int
        Position of the entry inside ``hdf5_data['digitStruct']['bbox']``.
    hdf5_data : h5py.File (or any mapping with the same layout)
        Open annotation file.

    Returns
    -------
    dict
        Maps each of 'label', 'left', 'top', 'width' and 'height' to a list
        with one value per entry of that attribute.
    """
    bbox_ref = hdf5_data['digitStruct']['bbox'][index].item()
    attrs = {}
    for key in ['label', 'left', 'top', 'width', 'height']:
        attr = hdf5_data[bbox_ref][key]
        # ``Dataset.value`` was removed in h5py 3.0; ``dataset[()]`` reads the
        # whole dataset. Read it once instead of once per element.
        data = attr[()]
        if len(attr) > 1:
            # Several entries: each row holds a reference to a separate
            # dataset whose [0][0] element is the value.
            values = [hdf5_data[data[i].item()][()][0][0]
                      for i in range(len(attr))]
        else:
            # Single entry: the value is stored inline.
            values = [data[0][0]]
        attrs[key] = values
    return attrs

In [6]:
def getFilePathInfo(filepath):
    """Split ``filepath`` into (directory, file name without extension, extension).

    The extension keeps its leading dot; any part that is absent comes back
    as an empty string.
    """
    folder, base = os.path.split(filepath)
    stem, extension = os.path.splitext(base)
    return folder, stem, extension

In [7]:
def img_boundingbox_data_constructor(mat_file):
    """Build (or load from cache) the per-digit bounding-box table.

    The table is cached as 'bbox.csv' next to ``mat_file``. When that CSV
    exists it is read and returned without opening ``mat_file`` at all.

    Parameters
    ----------
    mat_file : str
        Path to the annotation file; it is opened with ``h5py.File``.

    Returns
    -------
    pandas.DataFrame
        One row per digit with the columns 'height', 'img_name', 'label',
        'left', 'top', 'width', 'bottom' and 'right', where
        ``bottom = top + height`` and ``right = left + width``.
    """
    bbox_csv_filepath = os.path.join(os.path.dirname(mat_file), 'bbox.csv')
    print(bbox_csv_filepath)

    if os.path.isfile(bbox_csv_filepath):
        print("bounding box file: {} exists, read from file".format(bbox_csv_filepath))
        bbox_df = pd.read_csv(bbox_csv_filepath)
        print('finished image bounding box data construction...')
        return bbox_df

    print('image bounding box data construction starting...')
    columns = ['height', 'img_name', 'label', 'left', 'top', 'width']
    frames = []

    # The context manager guarantees the HDF5 file is closed, even on error.
    with h5py.File(mat_file, 'r') as f:
        for j in range(f['/digitStruct/bbox'].shape[0]):
            row_dict = get_bbox(j, f)
            # The scalar name is broadcast over every digit row of this image.
            row_dict['img_name'] = get_name(j, f)
            if (j % 1000) == 0:
                print(row_dict)
            frames.append(pd.DataFrame.from_dict(row_dict, orient='columns'))

    # Concatenate once: growing a frame inside the loop copies all earlier
    # rows on every iteration (quadratic). ignore_index gives the same
    # 0..n-1 index that the cached-CSV branch returns.
    if frames:
        bbox_df = pd.concat(frames, ignore_index=True)[columns]
    else:
        bbox_df = pd.DataFrame([], columns=columns)

    bbox_df['bottom'] = bbox_df['top'] + bbox_df['height']
    bbox_df['right'] = bbox_df['left'] + bbox_df['width']
    bbox_df.to_csv(bbox_csv_filepath, index=False)
    print(bbox_df)

    print('finished image bounding box data construction...')
    return bbox_df

In [8]:
def construct_all_data(img_folder, mat_file_name, h5_name, save_merged=False):
    """Build the bounding-box, grouped bounding-box and image tables for one split.

    Intermediate results are cached inside ``img_folder``: the grouped table
    is written to 'bbox_groupby_name.csv' and read back from there when that
    file already exists.

    Parameters
    ----------
    img_folder : str
        Directory holding the images and the annotation file.
    mat_file_name : str
        File name of the annotation file inside ``img_folder``.
    h5_name : str
        File name (inside ``img_folder``) for the merged table. Only used
        when ``save_merged`` is true.
    save_merged : bool, default False
        The merge/export step used to sit after a bare ``return`` and so never
        ran. The default keeps that behaviour (stop after building the
        tables). Pass True to merge the grouped boxes with the image data and
        write the result with ``DataFrame.to_hdf``.
        NOTE(review): ``to_hdf`` presumably needs PyTables installed — confirm.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``save_merged`` is true but ``image_data_constuctor`` returned None.
    """
    img_bbox_data = img_boundingbox_data_constructor(os.path.join(img_folder, mat_file_name))
    print("\nimg_bbox_data:\n{}".format(img_bbox_data))

    bbox_groupby_name_csv_filepath = os.path.join(img_folder, 'bbox_groupby_name.csv')
    if os.path.isfile(bbox_groupby_name_csv_filepath):
        print("bounding box groupby name file: {} exists, read from file".format(bbox_groupby_name_csv_filepath))
        img_bbox_data_grouped = pd.read_csv(bbox_groupby_name_csv_filepath)
    else:
        # groupby().apply() leaves 'img_name' as the index *and* as a column.
        # Dropping the index matches what the cached-CSV branch returns and
        # avoids an ambiguous 'img_name' key when merging below.
        img_bbox_data_grouped = (
            img_bbox_data.groupby('img_name').apply(collapse_col).reset_index(drop=True)
        )
        img_bbox_data_grouped.to_csv(bbox_groupby_name_csv_filepath, index=False)
    print("\nimg_bbox_data groupby name:\n{}".format(img_bbox_data_grouped))
    print("\nimg_bbox_data groupby name ===========END===========")

    img_data = image_data_constuctor(img_folder, img_bbox_data_grouped)
    print('done constructing main dataframes...starting grouping')

    if not save_merged:
        return None

    if img_data is None:
        raise ValueError('image_data_constuctor returned no data; cannot build the merged table')

    df1 = img_bbox_data_grouped.merge(img_data, on='img_name', how='left')
    print('grouping done')
    # ``key`` is passed by keyword: recent pandas versions deprecate passing
    # it positionally.
    df1.to_hdf(os.path.join(img_folder, h5_name), key='table')
    return None

In [9]:
def image_data_constuctor(img_folder, img_bbox_data):
    """Build (or load from cache) the per-image table with full and cropped images.

    The table is cached as 'image_data.csv' inside ``img_folder``. When that
    CSV exists it is read and returned, and ``img_bbox_data`` is not used.

    Parameters
    ----------
    img_folder : str
        Directory that is scanned for '*.png' files.
    img_bbox_data : pandas.DataFrame
        One row per image with at least the columns 'img_name', 'top',
        'left', 'width' and 'height', describing the region to crop.

    Returns
    -------
    pandas.DataFrame
        Columns 'img_name', 'img_height', 'img_width', 'img' and 'cut_img'.
        Images that could not be read, or that have no row in
        ``img_bbox_data``, are skipped with a printed notice.
    """
    image_data_csv_filepath = os.path.join(img_folder, 'image_data.csv')
    if os.path.isfile(image_data_csv_filepath):
        print("image data file: {} exists, read from file".format(image_data_csv_filepath))
        # Bug fix: the cached frame used to be read and then discarded by a
        # bare ``return``, so callers received None.
        return pd.read_csv(image_data_csv_filepath)

    print('image data construction starting...')
    imgs = []
    png_files = [name for name in os.listdir(img_folder) if name.endswith('.png')]
    for i, img_file in enumerate(png_files):
        imgs.append((img_file, cv2.imread(os.path.join(img_folder, img_file))))
        if (i % 1000) == 0:
            print(img_file)

    print('finished loading images...starting image processing...')

    columns = ['img_name', 'img_height', 'img_width', 'img', 'cut_img']
    frames = []
    for i, (img_name, full_img) in enumerate(imgs):
        if (i % 1000) == 0:
            print(img_name)

        matches = img_bbox_data[img_bbox_data['img_name'] == img_name]
        # cv2.imread yields None for files it cannot decode; an image may also
        # lack annotations. Neither can be cropped, so skip it instead of crashing.
        if full_img is None or matches.empty:
            print('skipping {}: unreadable image or no bounding box entry'.format(img_name))
            continue

        box = matches.iloc[0]
        top, left = int(box['top']), int(box['left'])
        bottom = int(box['top'] + box['height'])
        right = int(box['left'] + box['width'])
        # Copy only the crop, so cut_img does not keep a second full-size
        # image alive.
        cut_img = full_img[top:bottom, left:right, ...].copy()

        row_dict = {
            'img_name': [img_name],
            'img_height': [full_img.shape[0]],
            'img_width': [full_img.shape[1]],
            'img': [full_img],
            'cut_img': [cut_img],
        }
        frames.append(pd.DataFrame.from_dict(row_dict, orient='columns'))

    # Concatenate once; growing the frame inside the loop is quadratic.
    if frames:
        img_data = pd.concat(frames, ignore_index=True)[columns]
    else:
        img_data = pd.DataFrame([], columns=columns)

    print('finished image processing...')
    print(img_data)
    # NOTE(review): to_csv stores the 'img'/'cut_img' arrays as text, so the
    # cached CSV presumably cannot reproduce the pixel data — confirm whether
    # a binary format is needed.
    img_data.to_csv(image_data_csv_filepath, index=False)

    return img_data

In [10]:
# Process the training split. The test and extra splits can be processed the
# same way by passing test_folder / extra_folder together with
# 'test_data_processed.h5' / 'extra_data_processed.h5' (the extra split takes
# a long time).
construct_all_data(
    img_folder=train_folder,
    mat_file_name='digitStruct.mat',
    h5_name='train_data_processed.h5',
)

./train\bbox.csv
bounding box file: ./train\bbox.csv exists, read from file
finished image bounding box data construction...

img_bbox_data:
       height   img_name  label   left   top  width  bottom  right
0       219.0      1.png    1.0  246.0  77.0   81.0   296.0  327.0
1       219.0      1.png    9.0  323.0  81.0   96.0   300.0  419.0
2        32.0      2.png    2.0   77.0  29.0   23.0    61.0  100.0
3        32.0      2.png    3.0   98.0  25.0   26.0    57.0  124.0
4        15.0      3.png    2.0   17.0   5.0    8.0    20.0   25.0
...       ...        ...    ...    ...   ...    ...     ...    ...
73252    40.0  33401.png    2.0   34.0   6.0   25.0    46.0   59.0
73253    40.0  33401.png    2.0   61.0   4.0   25.0    44.0   86.0
73254    25.0  33402.png    1.0   35.0  10.0    7.0    35.0   42.0
73255    25.0  33402.png    6.0   44.0   8.0   15.0    33.0   59.0
73256    25.0  33402.png    9.0   62.0   9.0   17.0    34.0   79.0

[73257 rows x 8 columns]
bounding box groupby name fil