- When working with the custom datasets, only the annotation labels relevant to the task were kept; all other labels were ignored.
- All datasets were normalized so that they could be merged more easily.

|digit| annotation label |
|-----|------------------|
| 0   | 10               |
| 1   | 1                |
| 2   | 2                |
| 3   | 3                |
| 4   | 4                |
| 5   | 5                |
| 6   | 6                |
| 7   | 7                |
| 8   | 8                |
| 9   | 9|


In [468]:
# Per-dataset label translation tables, keyed by dataset identifier.
# In each 'label_map' the key is the label text found in the dataset's
# annotation files and the value is the label text that replaces it.
config = {
    'phils-workspace/digits-coi4f': {
        'label_map': {
            # '1'..'9' map to themselves; zero is stored as '90'.
            **{str(d): str(d) for d in range(1, 10)},
            '90': '0',
        },
    },
    'energy-meter': {
        # Source labels of this dataset are not in digit order.
        'label_map': {
            '1': '1', '6': '2', '2': '3', '3': '4', '10': '5',
            '4': '6', '9': '7', '7': '8', '8': '9', '5': '0',
        },
    },
    'noop': {
        # Identity mapping for every digit label.
        'label_map': {
            **{str(d): str(d) for d in range(1, 10)},
            '0': '0',
        },
    },
}

In [469]:
from toolz.curried import *
import os
import string
import xml.etree.cElementTree as ET
from xml.dom import minidom


In [615]:
def prettify(elem):
    """Return *elem* serialized as an indented XML string.

    The element is serialized to UTF-8 bytes with ElementTree and then
    re-parsed with minidom, whose pretty-printer emits the result with a
    two-space indent.
    """
    document = minidom.parseString(ET.tostring(elem, encoding='utf-8'))
    return document.toprettyxml(indent="  ")

def apply_annots(path, func):
    """Apply *func* to every parsed ``.xml`` file directly inside *path*.

    Args:
        path: Directory to scan (its direct entries only, no recursion).
        func: Callable that receives the parsed ``ElementTree`` of one file.

    Returns:
        A lazy iterator of ``(func(tree), file_path)`` tuples, one per
        ``.xml`` file, in sorted file-name order. A file is parsed only
        when the iterator reaches it.

    Raises:
        OSError: Immediately, if *path* cannot be listed.
    """
    # The directory is listed eagerly (as before) so a bad path fails at
    # call time. Names are sorted because os.listdir returns entries in
    # arbitrary order, which made the processing order unpredictable.
    xml_paths = [
        os.path.join(path, name)
        for name in sorted(os.listdir(path))
        if name.endswith('.xml')
    ]
    return ((func(ET.parse(xml_path)), xml_path) for xml_path in xml_paths)

def set_element_text(element, value) -> None:
    """Set ``element.text`` to *value* in place.

    A function form of the assignment so it can be called from inside a
    lambda, where an assignment statement is not allowed.
    """
    element.text = value

def normalize_annot_et(annot_et, label_map):
    """Rewrite the ``<name>`` label of every ``<object>`` via *label_map*.

    Args:
        annot_et: Parsed annotation ``ElementTree``; modified in place.
        label_map: Mapping from the label text currently in the tree to the
            label text that should replace it.

    Returns:
        The same *annot_et* object.

    Raises:
        KeyError: If an object's label has no entry in *label_map*. The
            tree is left unmodified in that case.
        ValueError: If an ``<object>`` element has no ``<name>`` child.
    """
    # Pass 1: resolve every replacement before touching the tree, so an
    # unmapped label cannot leave the tree half-relabelled.
    pending = []
    for obj in annot_et.getroot().findall('object'):
        # Find the label element by tag instead of assuming it is the
        # first child of <object>.
        name = obj.find('name')
        if name is None:
            raise ValueError('<object> element has no <name> child')
        try:
            new_label = label_map[name.text]
        except KeyError:
            raise KeyError(
                'label %r has no entry in label_map' % (name.text,)
            ) from None
        pending.append((name, new_label))

    # Pass 2: every label resolved, apply the changes.
    for name, new_label in pending:
        name.text = new_label
    return annot_et

def normalize_path_et(annot_et):
    """Rewrite the annotation's ``<filename>`` to a bare, renamed file name.

    Any directory part of the stored value is dropped and every ``_`` in
    the remaining base name is replaced by ``_2``.

    The rewrite is not idempotent: applying it twice turns ``_`` into
    ``_22``.
    NOTE(review): the ``_2`` suffixing presumably keeps file names distinct
    when datasets are merged - confirm.

    Args:
        annot_et: Parsed annotation ``ElementTree``; modified in place.

    Returns:
        The same *annot_et* object.

    Raises:
        ValueError: If the root has no ``<filename>`` child or it is empty.
    """
    filename = annot_et.getroot().find('filename')
    if filename is None or filename.text is None:
        # Previously surfaced as an opaque AttributeError / TypeError.
        raise ValueError('annotation has no <filename> value')
    base_name = os.path.basename(filename.text)
    filename.text = base_name.replace('_', '_2')
    return annot_et
    

def normalize_annots(path, label_map):
    """Relabel every annotation file found in the directory *path*.

    Passes each parsed tree through ``normalize_annot_et`` with
    *label_map* and returns whatever ``apply_annots`` produces for it.
    """
    def relabel(tree):
        return normalize_annot_et(tree, label_map)

    return apply_annots(path, relabel)

def normalize_path(path):
    """Rewrite the stored file name of every annotation file in *path*.

    Passes each parsed tree through ``normalize_path_et`` and returns
    whatever ``apply_annots`` produces for it.
    """
    def fix_filename(tree):
        return normalize_path_et(tree)

    return apply_annots(path, fix_filename)


In [616]:
def normalize(path, config_key, override=False):
    """Relabel all annotation files in *path* using a map from ``config``.

    Args:
        path: Directory containing the ``.xml`` annotation files.
        config_key: Key into ``config``; its ``'label_map'`` is applied.
        override: If true, write each relabelled tree back over its source
            file. If false, only print each file path and the
            pretty-printed result.

    Raises:
        KeyError: If *config_key* is unknown, or (propagated from the
            relabelling step) a file contains a label missing from the map.
    """
    label_map = config[config_key]['label_map']
    # Process every file in memory before writing any of them. When files
    # were written while iterating, a failure in a later file left the
    # earlier ones already overwritten; a re-run then failed on those for
    # maps whose outputs are not also keys (e.g. '90' -> '0').
    results = list(normalize_annots(path, label_map))
    for tree, xml_path in results:
        if override:
            tree.write(xml_path)
        else:
            print(xml_path)
            print(prettify(tree.getroot()))
            
def normalize_path_ds(path, override=False):
    """Rewrite the stored file name in all annotation files in *path*.

    Args:
        path: Directory containing the ``.xml`` annotation files.
        override: If true, write each modified tree back over its source
            file. If false, only print each file path and the
            pretty-printed result.
    """
    # Process every file in memory before writing any of them. When files
    # were written while iterating, a failure in a later file left the
    # earlier ones already rewritten on disk, and a re-run would transform
    # those a second time.
    results = list(normalize_path(path))
    for tree, xml_path in results:
        if override:
            tree.write(xml_path)
        else:
            print(xml_path)
            print(prettify(tree.getroot()))


In [611]:
# Relabel the digits-coi4f training annotations, writing the files back.
normalize(
    '/home/jupyter/ds/phils-workspace_digits-coi4f/train',
    'phils-workspace/digits-coi4f',
    override=True,
)

KeyError: '0'

In [463]:
# Relabel the digits-coi4f test annotations, writing the files back.
normalize(
    '/home/jupyter/ds/phils-workspace_digits-coi4f/test',
    'phils-workspace/digits-coi4f',
    override=True,
)

In [466]:
# Relabel the energy-meter training annotations, writing the files back.
normalize(
    '/home/jupyter/ds/energy-meter/train',
    'energy-meter',
    override=True,
)

In [467]:
# Relabel the energy-meter test annotations, writing the files back.
normalize(
    '/home/jupyter/ds/energy-meter/test',
    'energy-meter',
    override=True,
)

In [617]:
# Rewrite the stored file names in the energy-meter training annotations,
# writing the files back.
normalize_path_ds(
    '/home/jupyter/ds/energy-meter/train',
    override=True,
)

In [618]:
# Rewrite the stored file names in the energy-meter test annotations,
# writing the files back.
normalize_path_ds(
    '/home/jupyter/ds/energy-meter/test',
    override=True,
)