# Data to Taxonomy Pipeline

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from string import join, lowercase, uppercase

### Useful Functions

In [3]:
def taxonomy_from_paths(P):
    """Build a nested-dict taxonomy (a trie) from a collection of paths.

    Parameters
    ----------
    P : iterable of iterables
        Each element is one path: a sequence of hashable keys ordered from
        the root of the taxonomy downwards.

    Returns
    -------
    dict
        Nested dictionaries in which every key maps to the dictionary of its
        children; a key without children maps to an empty dictionary. Paths
        that share a prefix share the corresponding nodes.
    """
    T = {}
    for p in P:
        t = T
        for k in p:
            # setdefault performs the lookup and the insertion as one hashed
            # operation; testing `k in t.keys()` builds and scans a list of
            # all keys on Python 2.
            t = t.setdefault(k, {})
    return T

In [4]:
def paths_from_taxonomy(T):
    """List every root-to-leaf path of a nested-dict taxonomy.

    The tree is walked depth first and the children of each node are visited
    in sorted key order. A node counts as a leaf when it equals ``{}``; an
    empty taxonomy therefore yields ``[[]]`` (the single empty path).

    Parameters
    ----------
    T : dict
        Nested dictionaries, each key mapping to the dict of its children.

    Returns
    -------
    list of list
        One list of keys per leaf.
    """
    paths = []
    # Explicit stack of (node, keys leading to that node) pairs.
    pending = [(T, [])]
    while pending:
        node, prefix = pending.pop()
        if node == {}:
            paths.append(prefix)
            continue
        # Push the children in reverse sorted order so that the smallest key
        # is popped, and therefore expanded, first.
        for key in reversed(sorted(node.keys())):
            pending.append((node[key], prefix + [key]))
    return paths

In [5]:
def read_taxonomy(f):
    """Read a taxonomy file into a list of normalised paths.

    The file holds one path per line; levels are separated by '/' and the
    words inside a level by '-'. Every word is lower-cased and stripped of
    every character other than the ASCII letters a-z (which also removes the
    trailing newline), then the words of a level are re-joined with '-'.

    Parameters
    ----------
    f : str
        Path of the taxonomy file.

    Returns
    -------
    list of list of str
        One list of level names per line of the file.
    """
    # Imported locally so the function does not rely on `string.join` and
    # `string.lowercase`: both exist on Python 2 only, and `lowercase` is
    # locale-dependent whereas `ascii_lowercase` is always a-z.
    from string import ascii_lowercase

    def fix(word):
        # Builds a real string on Python 2 and 3 alike (on Python 3
        # `filter` would return an iterator instead).
        return ''.join(c for c in word.lower() if c in ascii_lowercase)

    # Text mode: the lines are split on str separators below. The handle is
    # not called `file` to avoid shadowing the Python 2 builtin.
    with open(f) as handle:
        lines = handle.readlines()

    return [['-'.join(fix(word) for word in level.split('-'))
             for level in line.split('/')]
            for line in lines]

### Clean Taxonomy

In [6]:
def remove_leaves(T):
    """Strip the current leaves from a nested-dict taxonomy, in place.

    Every key whose value is an empty dict is deleted, at every depth. A
    node is tested before its children are pruned, so a node that only had
    leaf children is kept and becomes a leaf itself.

    Parameters
    ----------
    T : dict
        Nested-dict taxonomy; modified in place.

    Returns
    -------
    None
    """
    # Iterate over a snapshot of the keys: entries are deleted inside the
    # loop, and on Python 3 `T.keys()` is a live view that raises
    # RuntimeError when the dict changes size during iteration.
    for k in list(T):
        if T[k] == {}:
            del T[k]
        else:
            remove_leaves(T[k])

In [7]:
# Load the raw taxonomy: one '/'-separated path per line.
with open('taxonomy_messy.txt', 'rb') as file:
    P = file.readlines()

# Keep only the lines that contain no uppercase character at all.
P = [s for s in P if not any(c in uppercase for c in s)]

# Reduce every '-'-separated word to the characters found in `lowercase`
# and turn each line into the list of its '/'-separated levels.
fix = lambda s: filter(lambda c: c in lowercase, s.lower())
P = [['-'.join([fix(w) for w in k.split('-')]) for k in p.split('/')]
     for p in P]

# Build the tree, strip its leaves, and flatten it back into paths.
T = taxonomy_from_paths(P)
remove_leaves(T)
P = paths_from_taxonomy(T)

# Write the cleaned paths, one per line.
with open('taxonomy.txt', 'wb') as file:
    for p in P:
        file.write('{}\n'.format('/'.join(p)))

### Sanity Check

In [8]:
# Round trip: paths -> nested dict -> paths must give back the paths read
# from the cleaned file.
P = read_taxonomy('taxonomy.txt')
rebuilt = paths_from_taxonomy(taxonomy_from_paths(P))
rebuilt == P

True

### Read Labels

In [271]:
# For every line keep the second '|'-separated field, split on whitespace
# into a list of words.
with open('visualdx/labels.txt') as file:
    labels = [label.split('|')[1].split() for label in file]

In [272]:
# Lower-case every word of every label and keep only the characters found
# in `lowercase`, the name imported from `string` at the top of the
# notebook (the cell must not rely on a module object `string` being
# present in the kernel).
fix = lambda s: filter(lambda c: c in lowercase, s.lower())
labels = [map(fix, label) for label in labels]

In [273]:
# Distinct labels, their words re-joined with single spaces, in sorted
# order. str.join replaces string.join, so the cell does not need the
# `string` module in scope.
sorted({' '.join(label) for label in labels})

['abscess',
 'abscess of the newborn',
 'acanthoma epidermolytic',
 'acanthoma large cell',
 'acanthosis nigricans',
 'accessory tragus',
 'acne conglobata',
 'acne excorie',
 'acne fulminans',
 'acne infantile',
 'acne keloidalis nuchae',
 'acne neonatal',
 'acne steroid',
 'acne vulgaris',
 'acquired acrodermatitis enteropathica',
 'acral erythema',
 'acroangiodermatitis',
 'acrochordon',
 'acrokeratoelastoidosis',
 'acrokeratosis paraneoplastica',
 'acrokeratosis verruciformis',
 'acroosteolysis',
 'acropustulosis of infancy',
 'actinic keratosis',
 'actinic prurigo',
 'actinomycosis',
 'acute generalized exanthematous pustulosis',
 'acute hemorrhagic edema of infancy',
 'acute myelomonocytic leukemia',
 'adamsoliver syndrome',
 'addison disease',
 'adiposis dolorosa',
 'adrenogenital syndrome',
 'adult mastocytosis',
 'adultonset stills disease',
 'aflatoxicosis',
 'african tick bite fever',
 'africanized bee sting',
 'ainhum',
 'albinism',
 'alopecia areata',
 'alopecia drug induc