- When using the custom datasets, it was necessary to keep only the relevant annotation labels and ignore the rest.
- All datasets were normalized to a common label scheme (see the table below) so that the various datasets can be merged.

|digit| annotation label |
|-----|------------------|
| 0   | 10               |
| 1   | 1                |
| 2   | 2                |
| 3   | 3                |
| 4   | 4                |
| 5   | 5                |
| 6   | 6                |
| 7   | 7                |
| 8   | 8                |
| 9   | 9                |


In [448]:
# Per-dataset translation tables: raw annotation label -> normalized label.
# Normalized labels follow the digit table above: digits 1-9 keep their own
# value and digit 0 is written as '10'.
config = {
    'phils-workspace/digits-coi4f': {
        # '1'..'9' map to themselves; '90' is translated to '10'
        # (presumably this dataset's raw label for the digit 0 -- confirm).
        'label_map': {
            **{str(digit): str(digit) for digit in range(1, 10)},
            '90': '10',
        },
    },
    'energy-meter': {
        # Raw labels listed in the order of the normalized labels
        # '1'..'10' they translate to.
        'label_map': {
            raw: str(normalized)
            for normalized, raw in enumerate(
                ('1', '6', '2', '3', '10', '4', '9', '7', '8', '5'),
                start=1,
            )
        },
    },
}

In [432]:
from toolz.curried import *
import os
import string
import xml.etree.cElementTree as ET
from xml.dom import minidom


In [440]:
def prettify(elem):
    """Return *elem* serialized as an indented XML string.

    The element is serialized to UTF-8 bytes, re-parsed with ``minidom`` and
    pretty-printed with a two-space indent.
    """
    serialized = ET.tostring(elem, encoding='utf-8')
    return minidom.parseString(serialized).toprettyxml(indent="  ")

def normalize_annots(path, label_map, override=False):
    """Lazily relabel every ``.xml`` annotation file directly inside *path*.

    The directory is listed as soon as this function is called; each file is
    parsed and relabelled only when the returned iterator reaches it.

    Args:
        path: Directory containing the annotation files.
        label_map: Mapping from raw label text to normalized label text,
            passed unchanged to ``normalize_annot_et``.
        override: Accepted but not used by this function.

    Returns:
        An iterator of ``(tree, file_path)`` pairs, where ``tree`` is the
        relabelled ``ElementTree`` and ``file_path`` is *path* joined with
        the file name.
    """
    xml_files = (
        os.path.join(path, entry)
        for entry in os.listdir(path)  # evaluated right away, not lazily
        if entry.endswith('.xml')
    )
    return (
        (normalize_annot_et(ET.parse(xml_file), label_map), xml_file)
        for xml_file in xml_files
    )

def normalize_annot_et(annot_et, label_map):
    """Rewrite the ``<name>`` of every ``<object>`` in place using *label_map*.

    Only ``<object>`` elements that are direct children of the root are
    visited.

    Args:
        annot_et: Parsed annotation tree (an object exposing ``getroot()``).
        label_map: Mapping from the current label text to its replacement.

    Returns:
        The same *annot_et* object, after its labels have been replaced.

    Raises:
        KeyError: If a label's text is not a key of *label_map*.
        ValueError: If an ``<object>`` has no ``<name>`` child.
    """
    for obj in annot_et.getroot().findall('object'):
        # Look the label element up by tag instead of assuming it is the
        # first child of <object>, so child order does not matter.
        name = obj.find('name')
        if name is None:
            raise ValueError('<object> element has no <name> child')
        name.text = label_map[name.text]
    return annot_et

In [442]:
def normalize(path, config_key, override=False):
    """Relabel the annotation files in *path* using a map from ``config``.

    Args:
        path: Directory containing the ``.xml`` annotation files.
        config_key: Key into ``config``; its ``'label_map'`` entry is used.
        override: When true, each relabelled tree is written back over its
            source file. Otherwise nothing is written and each file path is
            printed, followed by its pretty-printed relabelled XML.

    Note:
        The map is applied to whatever labels the files currently contain, so
        running this twice with ``override=True`` translates already
        normalized labels a second time.
    """
    label_map = config[config_key]['label_map']
    # Relabel every file in memory before the first write, so that a label
    # missing from the map aborts the run before any file has been changed.
    relabelled = list(normalize_annots(path, label_map))
    for tree, annot_path in relabelled:
        if override:
            tree.write(annot_path)
        else:
            print(annot_path)
            print(prettify(tree.getroot()))

In [444]:
# Rewrites the training annotations in place (override=True). This dataset's
# map has no entry for '10', so a second run over files that already contain
# a normalized '10' raises KeyError.
normalize(
    path='/home/jupyter/ds/phils-workspace_digits-coi4f/train',
    config_key='phils-workspace/digits-coi4f',
    override=True,
)

In [445]:
# Rewrites the test annotations in place (override=True). This dataset's map
# has no entry for '10', so a second run over files that already contain a
# normalized '10' raises KeyError.
normalize(
    path='/home/jupyter/ds/phils-workspace_digits-coi4f/test',
    config_key='phils-workspace/digits-coi4f',
    override=True,
)

In [450]:
# Rewrites the training annotations in place (override=True). Run once only:
# the energy-meter map sends labels to other labels that are themselves keys,
# so a second run would translate the normalized labels again.
normalize(
    path='/home/jupyter/ds/energy-meter/train',
    config_key='energy-meter',
    override=True,
)

In [452]:
# Rewrites the test annotations in place (override=True). Run once only: the
# energy-meter map sends labels to other labels that are themselves keys, so
# a second run would translate the normalized labels again.
normalize(
    path='/home/jupyter/ds/energy-meter/test',
    config_key='energy-meter',
    override=True,
)