### Imports

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from tqdm import tqdm

### Read the synset, word, and description tables

In [2]:
def _read_two_column_tsv(path, columns):
    """Read a header-less, tab-separated file and label its columns."""
    table = pd.read_csv(path, delimiter='\t', header=None)
    table.columns = columns
    return table


# Three lookup tables, each pairing a code with a piece of text.
# The 'sythnames' spelling is kept as-is because the next cell indexes by it.
synthdata = _read_two_column_tsv('../data/csv/synthnet.csv', ['code', 'sythnames'])
worddata = _read_two_column_tsv('../data/csv/words.csv', ['code', 'words'])
descriptiondata = _read_two_column_tsv('../data/csv/descriptions.csv', ['code', 'description'])

### Extract the code and text columns as lists

In [3]:
# Unpack every table into two parallel Python lists: its codes and the text
# stored next to each code. Position i of both lists refers to the same row.
synthcodes, synthword = (
    synthdata[column].values.tolist() for column in ('code', 'sythnames')
)
wordcodes, wordword = (
    worddata[column].values.tolist() for column in ('code', 'words')
)
descriptioncodes, descriptionword = (
    descriptiondata[column].values.tolist() for column in ('code', 'description')
)

### Find all training image directories that match the synset data

In [4]:
# Build one metadata record per class directory under the training image root.
#
# Reads (from earlier cells): synthcodes/synthword, wordcodes/wordword and
# descriptioncodes/descriptionword -- parallel lists of codes and their text.
# Produces the parallel lists codes, shortnames, names, descriptions, paths,
# nImages and isPresents, with one entry per class directory.


def _first_match_lookup(keys, values):
    """Map each key to the value at its first position, mirroring list.index()."""
    lookup = {}
    for key, value in zip(keys, values):
        lookup.setdefault(key, value)
    return lookup


def _count_files(directory):
    """Count the files in `directory` and in every directory nested below it."""
    return sum(len(files) for _, _, files in os.walk(directory))


isPresent = []  # never filled in this cell; retained so the name stays defined
basepath = "../data/orig/train/"
shortnames = []
names = []
paths = []
nImages = []
isPresents = []
descriptions = []
codes = []

# Dictionaries are built once so each directory needs a constant-time lookup
# instead of scanning the full code lists twice (`in` followed by `.index`).
synth_lookup = _first_match_lookup(synthcodes, synthword)
word_lookup = _first_match_lookup(wordcodes, wordword)
description_lookup = _first_match_lookup(descriptioncodes, descriptionword)

# os.listdir returns entries in arbitrary order; sorting makes the row order
# reproducible. Passing a list (not an enumerate object) lets tqdm show a total.
for mydir in tqdm(sorted(os.listdir(basepath))):
    classdir = basepath + mydir

    # Only directories can be class folders; this also skips '.DS_Store'.
    if not os.path.isdir(classdir):
        continue

    #Get Name
    path = '/'
    name = ''
    description = ''
    nImage = 0
    ispresent = False
    # The words table is consulted last so that its name wins when a code is
    # present in both tables.
    for lookup in (synth_lookup, word_lookup):
        if mydir in lookup:
            name = lookup[mydir].lower()
            path = classdir
            ispresent = True
    if ispresent:
        # Total over the whole tree; keeping only the count of the last
        # directory visited by os.walk under-reports nested class folders.
        nImage = _count_files(classdir)

    #Get Description
    if mydir in description_lookup:
        description = description_lookup[mydir].lower()

    #Add to list
    # The short name is the first of the comma-separated synonyms.
    shortname = name.split(',')[0]
    names.append(name)
    shortnames.append(shortname)
    paths.append(path)
    descriptions.append(description)
    isPresents.append(ispresent)
    nImages.append(nImage)
    codes.append(mydir)

1001it [00:08, 113.35it/s]


### Create the class DataFrame

In [5]:
# Assemble the per-class lists into one table with a row per class directory.
# The column order is spelled out explicitly so it never depends on dict order.
class_fields = [
    ('code', codes),
    ('shortname', shortnames),
    ('name', names),
    ('description', descriptions),
    ('path', paths),
    ('nImages', nImages),
    ('isPresent', isPresents),
]
C = pd.DataFrame(dict(class_fields), columns=[label for label, _ in class_fields])

### Store the class table to CSV

In [6]:
# Persist the per-class table; index=False keeps the row index out of the file.
C.to_csv(path_or_buf='../data/csv/imagenet.csv', index=False)

### Find all Images and Bounding Boxes (if present)

In [7]:
# Build one row per training image: the metadata of its class (copied from C)
# plus the image path, the annotation path and the first bounding box, if any.
#
# Reads (from earlier cells): C (per-class table) and codes (its class codes).
# Produces: I, a DataFrame with C's columns followed by imgfiles, xmlfiles,
# hasboundingbox, xmins, ymins, xmaxs and ymaxs.


def _read_first_bndbox(xml_path):
    """Return (xmin, ymin, xmax, ymax) of the first <object>/<bndbox>, or None.

    Elements are looked up by tag name rather than by child position, so the
    result does not depend on the order of sibling elements in the file.
    Only the first matching box is read; further objects in the same file are
    ignored. Returns None when the file contains no <object>/<bndbox> element.
    """
    bndbox = ET.parse(xml_path).getroot().find('object/bndbox')
    if bndbox is None:
        return None
    return tuple(int(bndbox.findtext(tag)) for tag in ('xmin', 'ymin', 'xmax', 'ymax'))


# NOTE: this rebinds `basepath` to the annotation root; the class-metadata cell
# above uses the same name for the image root.
basepath = "../data/annotations/train/"
imagepath = "../data/orig/train/"

# Every entry is considered and filtered below. Slicing off the first entry is
# unsafe because os.listdir order is arbitrary and '.DS_Store' may not exist.
mydirs = sorted(os.listdir(imagepath))
known_codes = set(codes)
image_columns = ['imgfiles', 'xmlfiles', 'hasboundingbox', 'xmins', 'ymins', 'xmaxs', 'ymaxs']
records = []
for mydir in tqdm(mydirs):

    #Get Files
    if mydir not in known_codes or not os.path.isdir(imagepath + mydir):
        continue

    # Index by column label: attribute access would make `dp.name` return the
    # Series' own name (the row's index label) instead of the 'name' column.
    dp = C[C['code'] == mydir].iloc[0]
    class_values = tuple(dp[column] for column in C.columns)

    for myfile in sorted(os.listdir(imagepath + mydir)):
        imgfile = imagepath + mydir + '/' + myfile
        # Swap only the extension; str.replace would also rewrite a matching
        # substring elsewhere in the file name.
        xmlfile = basepath + mydir + '/' + os.path.splitext(myfile)[0] + '.xml'

        box = _read_first_bndbox(xmlfile) if os.path.exists(xmlfile) else None
        hasbox = box is not None
        if not hasbox:
            # No usable annotation: blank path and an all-zero box.
            xmlfile = ''
            box = (0, 0, 0, 0)
        records.append(class_values + (imgfile, xmlfile, hasbox) + box)

# Passing the columns up front keeps this valid even when no rows were found.
I = pd.DataFrame(records, columns=list(C.columns) + image_columns)

100%|██████████| 1000/1000 [03:50<00:00,  4.33it/s]


### Store the per-image table to CSV

In [8]:
# Persist the per-image table; index=False keeps the row index out of the file.
I.to_csv(path_or_buf='../data/csv/imagenet_files.csv', index=False)