In [50]:
from bs4 import BeautifulSoup
import glob
import os
import shutil

In [9]:
def consildate_labels(bs_xml, ptf, mergedic, dellist):
    """
    Rewrite annotation files with merged and pruned labels.

    For every file in ``bs_xml`` the text of each <name> tag is compared
    against ``mergedic`` and ``dellist``; the edited document is then written
    into ``ptf`` under the same base file name.

    :param bs_xml: list of paths to xml annotation files
    :param ptf: path to output directory (where new files are written);
                created if it does not exist yet
    :param mergedic: dictionary of what to merge (mergedlabel: ['list', 'of', 'merge'])
    :param dellist: list of classes to delete; the element enclosing a
                    matching <name> tag is removed from the document
    """
    # a fresh run may not have the output directory yet
    if ptf:
        os.makedirs(ptf, exist_ok=True)

    for xml in bs_xml:

        # read in the annotations (the with-block closes the file)
        with open(xml, 'r') as ff:
            data = ff.read()

        # make it into a soup object
        bs_data = BeautifulSoup(data, 'xml')

        # find_all returns a list up front, so removing elements below does
        # not disturb the iteration itself
        for name_tag in bs_data.find_all('name'):

            # rename the label if it appears in any merge list; every key is
            # tried in turn, so a freshly written label can be matched again
            # by a later key
            for merged_label, old_labels in mergedic.items():
                if name_tag.text in old_labels:
                    name_tag.contents[0].replace_with(merged_label)

            # deletion is checked after merging, i.e. against the merged label
            if name_tag.text in dellist:
                name_tag.parent.decompose()

        # delete any existing entry first so a fresh file is created
        # (rather than, e.g., writing through an existing symlink)
        out_name = os.path.join(ptf, os.path.basename(xml))
        if os.path.exists(out_name):
            os.remove(out_name)

        # save it
        with open(out_name, 'w') as ff:
            ff.write(str(bs_data))

In [2]:
def unique_labels(bs_xml):
    """
    Collect the distinct <name> labels found in a directory of annotations.

    :param bs_xml: path to the directory holding the xml annotation files
    :return cls: list of unique labels from all files, in first-seen order
                 (files are visited in sorted path order, so the result is
                 reproducible between runs)
    """
    cls = []
    seen = set()  # constant-time membership test alongside the ordered list

    # glob gives no ordering guarantee; sort for a stable result
    for xml in sorted(glob.glob(os.path.join(bs_xml, '*.xml'))):

        # read in the annotations (the with-block closes the file)
        with open(xml, 'r') as ff:
            data = ff.read()

        # make it into a soup object
        bs_data = BeautifulSoup(data, 'xml')

        # add each label to the master list the first time it is seen
        for item in bs_data.find_all('name'):
            label = item.text
            if label not in seen:
                seen.add(label)
                cls.append(label)

    return cls

## OP19 Merging
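Choose which camera side to process: leave the matching `parent`/`outdir` pair active below and keep the other pair commented out.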

In [4]:
# Exactly one parent/outdir pair should be active at a time.
# parent: directory the source annotations are read from
# outdir: directory the consolidated annotations are written to

# starboard
#parent = 'VOCstarboardMASTER/OP19/Annotations'
#outdir = 'VOCstarboardMERGE/OP19/Annotations'

# port
parent = 'VOCportMASTER/OP19/Annotations'
outdir = 'VOCportMERGE/OP19/Annotations'

Grab all the xml files for the annotations

In [5]:
# every xml annotation file directly inside the source directory
xml_pattern = os.path.join(parent, '*.xml')
xmls = glob.glob(xml_pattern)

# report how many files were found
print(len(xmls))

205


The rules for merging.

In [6]:
# merge rules: each key is the label that gets written; a <name> whose text
# appears in the key's list is renamed to that key by consildate_labels
to_merge = {'misc_fish': ['oreo', 'shark', 'whiptail']}
# delete rules: the element enclosing a <name> listed here is removed
to_del = ['worm','coral', 'shrimp']

Now go through and replace names that need replacing. Save output to the new Annotations directory

In [8]:
# write the consolidated annotations into the output directory
consildate_labels(bs_xml=xmls, ptf=outdir, mergedic=to_merge, dellist=to_del)

# read the output back and list the labels that remain
zz = unique_labels(bs_xml=outdir)
print('total consolidated labels:', len(zz))
print(zz)

['orange_roughy_edge', 'sea_anemone', 'orange_roughy', 'sea_urchin', 'misc_fish', 'eel', 'mollusc', 'sea_star']


## OP16 Merging
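Choose which camera side to process: leave the matching `parent`/`outdir` pair active below and keep the other pair commented out.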

In [17]:
# Exactly one parent/outdir pair should be active at a time.
# parent: directory the source annotations are read from
# outdir: directory the consolidated annotations are written to

# starboard
parent = 'VOCstarboardMASTER/OP16/Annotations'
outdir = 'VOCstarboardMERGE/OP16/Annotations'

# port
#parent = 'VOCportMASTER/OP16/Annotations'
#outdir = 'VOCportMERGE/OP16/Annotations'

Grab all the xml files for the annotations

In [18]:
# every xml annotation file directly inside the source directory
xml_pattern = os.path.join(parent, '*.xml')
xmls = glob.glob(xml_pattern)

# report how many files were found
print(len(xmls))

130


The rules for merging.

In [19]:
# merge rules: each key is the label that gets written; a <name> whose text
# appears in the key's list is renamed to that key by consildate_labels
to_merge = {'misc_fish': ['oreo', 'shark', 'whiptail'], 
            'orange_roughy': ['orange_roughy_1', 'orange_roughy_2'], 
            'orange_roughy_edge': ['orange_roughy_2_edge'], 
            'cnidaria': ['cnidaria_2'], 
            'sea_anemone': ['anemone']
           }

# delete rules: the element enclosing a <name> listed here is removed
to_del = ['worm','coral', 'shrimp','spot_reflect']

Now go through and replace names that need replacing. Save output to the new Annotations directory

In [20]:
# write the consolidated annotations into the output directory
consildate_labels(bs_xml=xmls, ptf=outdir, mergedic=to_merge, dellist=to_del)

# read the output back and list the labels that remain
zz = unique_labels(bs_xml=outdir)
print('total consolidated labels:', len(zz))
print(zz)

total consolidated labels: 11
['orange_roughy_edge', 'orange_roughy', 'eel', 'misc_fish', 'cnidaria', 'mollusc', 'sea_anemone', 'sea_star', 'brittle_star', 'sea_feather', 'sea_urchin']


## OP12 Merging
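Choose which camera side to process: leave the matching `parent`/`outdir` pair active below and keep the other pair commented out.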

In [25]:
# Exactly one parent/outdir pair should be active at a time.
# parent: directory the source annotations are read from
# outdir: directory the consolidated annotations are written to

# starboard
#parent = 'VOCstarboardMASTER/OP12/Annotations'
#outdir = 'VOCstarboardMERGE/OP12/Annotations'

# port
parent = 'VOCportMASTER/OP12/Annotations'
outdir = 'VOCportMERGE/OP12/Annotations'

Grab all the xml files for the annotations

In [26]:
# every xml annotation file directly inside the source directory
xml_pattern = os.path.join(parent, '*.xml')
xmls = glob.glob(xml_pattern)

# report how many files were found
print(len(xmls))

125


The rules for merging.

In [27]:
# merge rules: each key is the label that gets written; a <name> whose text
# appears in the key's list is renamed to that key by consildate_labels
to_merge = {'misc_fish': ['oreo', 'shark', 'whiptail','chimera'], 
            'orange_roughy': ['orange_roughy_1', 'orange_roughy_2','orange_roughy_3'], 
            'orange_roughy_edge': ['orange_roughy_1_edge','orange_roughy_2_edge'], 
            'cnidaria': ['cnidaria_2'], 
            'sea_anemone': ['anemone']
           }

# delete rules: the element enclosing a <name> listed here is removed
to_del = ['worm','coral', 'shrimp','spot_reflect','sea_weed']

Now go through and replace names that need replacing. Save output to the new Annotations directory

In [28]:
# write the consolidated annotations into the output directory
consildate_labels(bs_xml=xmls, ptf=outdir, mergedic=to_merge, dellist=to_del)

# read the output back and list the labels that remain
zz = unique_labels(bs_xml=outdir)
print('total consolidated labels:', len(zz))
print(zz)

total consolidated labels: 10
['sea_feather', 'orange_roughy_edge', 'orange_roughy', 'cnidaria', 'sea_star', 'sea_anemone', 'mollusc', 'sea_urchin', 'misc_fish', 'brittle_star']


## Make necessary symlinks in new directories

In [45]:
# Exactly one parent/outdir pair should be active at a time.
# parent: directory holding the source images
# outdir: directory that will receive the symlinks

#parent = 'VOCstarboardMASTER/OP19/JPEGImages'
#outdir = 'VOCstarboardMERGE/OP19/JPEGImages'

parent = 'VOCportMASTER/OP12/JPEGImages'
outdir = 'VOCportMERGE/OP12/JPEGImages'

# makedirs also creates any missing intermediate directories, and with
# exist_ok=True it does nothing when the directory is already there
os.makedirs(outdir, exist_ok=True)

In [46]:
# collect every .jpg directly inside the source image directory
jpg_pattern = os.path.join(parent, '*.jpg')
img_list = glob.glob(jpg_pattern)

In [47]:
# Link every source image into the output directory under the same file name.
for img in img_list:
    link_name = os.path.join(outdir, os.path.basename(img))

    if os.path.islink(link_name):
        # refresh a link left over from an earlier run instead of failing
        os.remove(link_name)
    elif os.path.exists(link_name):
        # a real file is already in place; do not clobber it
        continue

    # os.symlink stores the target string verbatim, and a relative target is
    # resolved against the directory containing the link, not the working
    # directory, so a relative `img` would give a dangling link. An absolute
    # target always resolves (at the cost of the tree no longer being
    # relocatable).
    os.symlink(os.path.abspath(img), link_name)

## Copy the lists

In [73]:
# Exactly one parent/outdir pair should be active at a time.
# parent: directory holding the source image-set lists
# outdir: directory the lists are copied into

parent = 'VOCstarboardMASTER/OP12/ImageSets/Main'
outdir = 'VOCstarboardMERGE/OP12/ImageSets/Main'

#parent = 'VOCportMASTER/OP19/ImageSets/Main'
#outdir = 'VOCportMERGE/OP19/ImageSets/Main'

# outdir is nested two levels (ImageSets/Main); os.mkdir would fail if the
# intermediate directory is missing, makedirs creates the whole chain and
# does nothing when it already exists
os.makedirs(outdir, exist_ok=True)

In [74]:
# collect the .txt list files found directly inside the parent dir
txt_pattern = os.path.join(parent, '*.txt')
set_list = glob.glob(txt_pattern)

# report how many list files were found
print(len(set_list))

2


In [75]:
# copy each list file into the output directory under the same file name
for slist in set_list:
    destination = os.path.join(outdir, os.path.basename(slist))
    shutil.copyfile(slist, destination)