In [4]:
import numpy as np
import cv2
import os
import pandas as pd
import h5py

In [5]:
train_folder = "./train"
test_folder = "./test"
extra_folder = "./extra"
resize_size = (64,64)

In [6]:
def collapse_col(row):
    global resize_size
    new_row = {}
    new_row['img_name'] = list(row['img_name'])[0]
    new_row['labels'] = row['label'].astype(np.str).str.cat(sep='_')
    new_row['top'] = max(int(row['top'].min()),0)
    new_row['left'] = max(int(row['left'].min()),0)
    new_row['bottom'] = int(row['bottom'].max())
    new_row['right'] = int(row['right'].max())
    new_row['width'] = int(new_row['right'] - new_row['left'])
    new_row['height'] = int(new_row['bottom'] - new_row['top'])
    new_row['num_digits'] = len(row['label'].values)
    return pd.Series(new_row,index=None)

In [7]:
def image_data_constuctor(img_folder, img_bbox_data):
    print('image data construction starting...')
    imgs = []
    for img_file in os.listdir(img_folder):
        if img_file.endswith('.png'):
            imgs.append([img_file,cv2.imread(os.path.join(img_folder,img_file))])
    img_data = pd.DataFrame([],columns=['img_name','img_height','img_width','img','cut_img'])
    print('finished loading images...starting image processing...')
    for img_info in imgs:
        row = img_bbox_data[img_bbox_data['img_name']==img_info[0]]
        full_img = img_info[1] #cv2.normalize(cv2.cvtColor(cv2.resize(img_info[1],resize_size), cv2.COLOR_BGR2GRAY).astype(np.float64), 0, 1, cv2.NORM_MINMAX)
        cut_img = full_img.copy()[int(row['top']):int(row['top']+row['height']),int(row['left']):int(row['left']+row['width']),...]
        row_dict = {'img_name':[img_info[0]],'img_height':[img_info[1].shape[0]],'img_width':[img_info[1].shape[1]],'img':[full_img],'cut_img':[cut_img]}
        img_data = pd.concat([img_data,pd.DataFrame.from_dict(row_dict,orient = 'columns')])
    print('finished image processing...')
    return img_data

In [8]:
def get_name(index, hdf5_data):
    name = hdf5_data['/digitStruct/name']
    return ''.join([chr(v[0]) for v in hdf5_data[name[index][0]].value])

In [9]:
def get_bbox(index, hdf5_data):
    attrs = {}
    item = hdf5_data['digitStruct']['bbox'][index].item()
    for key in ['label', 'left', 'top', 'width', 'height']:
        attr = hdf5_data[item][key]
        values = [hdf5_data[attr.value[i].item()].value[0][0]
                  for i in range(len(attr))] if len(attr) > 1 else [attr.value[0][0]]
        attrs[key] = values
    return attrs

In [10]:
def img_boundingbox_data_constructor(mat_file):
    f = h5py.File(mat_file,'r') 
    all_rows = []
    print('image bounding box data construction starting...')
    bbox_df = pd.DataFrame([],columns=['height','img_name','label','left','top','width'])
    i = 0
    for j in range(f['/digitStruct/bbox'].shape[0]):
        img_name = get_name(j, f)
        row_dict = get_bbox(j, f)
        row_dict['img_name'] = img_name
        if (i % 1000) == 0:
            print(row_dict)
        i += 1
        all_rows.append(row_dict)
        bbox_df = pd.concat([bbox_df,pd.DataFrame.from_dict(row_dict,orient = 'columns')])
    
    bbox_df['bottom'] = bbox_df['top']+bbox_df['height']
    bbox_df['right'] = bbox_df['left']+bbox_df['width']
    print('finished image bounding box data construction...')
    return bbox_df

In [11]:
def construct_all_data(img_folder,mat_file_name,h5_name):
    img_bbox_data = img_boundingbox_data_constructor(os.path.join(img_folder,mat_file_name))
    print("\nimg_bbox_data:\n{}".format(img_bbox_data))
    img_bbox_data_grouped = img_bbox_data.groupby('img_name').apply(collapse_col) 
    img_data = image_data_constuctor(img_folder, img_bbox_data_grouped)
    print('done constructing main dataframes...starting grouping')
    df1 = img_bbox_data_grouped.merge(img_data,on='img_name',how='left')
    print('grouping done')
    #df1.to_csv(os.path.join(img_folder,csv_name), index = False)
    df1.to_hdf(os.path.join(img_folder,h5_name),'table')

In [None]:
construct_all_data(train_folder,'digitStruct.mat','train_data_processed.h5')
# construct_all_data(test_folder,'digitStruct.mat','test_data_processed.h5')
# construct_all_data(extra_folder,'digitStruct.mat','extra_data_processed.h5') #takes a long time

image bounding box data construction starting...
{'label': [1.0, 9.0], 'left': [246.0, 323.0], 'top': [77.0, 81.0], 'width': [81.0, 96.0], 'height': [219.0, 219.0], 'img_name': '1.png'}


  return ''.join([chr(v[0]) for v in hdf5_data[name[index][0]].value])
  values = [hdf5_data[attr.value[i].item()].value[0][0]
  for i in range(len(attr))] if len(attr) > 1 else [attr.value[0][0]]


{'label': [6.0], 'left': [36.0], 'top': [4.0], 'width': [14.0], 'height': [31.0], 'img_name': '1001.png'}
{'label': [3.0], 'left': [37.0], 'top': [5.0], 'width': [14.0], 'height': [34.0], 'img_name': '2001.png'}
{'label': [3.0, 9.0], 'left': [66.0, 90.0], 'top': [13.0, 14.0], 'width': [28.0, 31.0], 'height': [72.0, 72.0], 'img_name': '3001.png'}
{'label': [6.0, 1.0], 'left': [23.0, 38.0], 'top': [3.0, 4.0], 'width': [14.0, 7.0], 'height': [25.0, 25.0], 'img_name': '4001.png'}
{'label': [3.0, 4.0, 7.0], 'left': [65.0, 78.0, 87.0], 'top': [32.0, 32.0, 27.0], 'width': [16.0, 12.0, 14.0], 'height': [30.0, 30.0, 30.0], 'img_name': '5001.png'}
{'label': [5.0, 6.0], 'left': [56.0, 74.0], 'top': [21.0, 21.0], 'width': [19.0, 17.0], 'height': [39.0, 39.0], 'img_name': '6001.png'}
{'label': [6.0, 10.0], 'left': [39.0, 63.0], 'top': [1.0, 6.0], 'width': [23.0, 25.0], 'height': [43.0, 43.0], 'img_name': '7001.png'}
{'label': [3.0], 'left': [63.0], 'top': [13.0], 'width': [41.0], 'height': [62.0], 