# Parsing ICD 10 Codes

In [29]:
import json
import csv
import pandas as pd
import pprint
from collections import defaultdict

# Pretty-printer (2-space indent) used to dump tree nodes while debugging.
pp = pprint.PrettyPrinter(indent=2)

# Directory prefix for the input CSV/JSON files and for the generated JSON.
DATA_PATH = '../../data/'

## Get codes from CSV files and their levels

In [30]:
# Flat table of codes: one row per code, read straight from the CSV file.
low_level_icd10_filename = f'{DATA_PATH}icd10cm_order_2016.csv'

icd10_df = pd.read_csv(low_level_icd10_filename)

In [3]:
# Preview the first rows of the raw CSV table.
icd10_df.head()

Unnamed: 0,nr,NAME,leaf,DESC,DESCRIPTION
0,1,A00,0,Cholera,Cholera
1,2,A000,1,"Cholera due to Vibrio cholerae 01, biovar chol...","Cholera due to Vibrio cholerae 01, biovar chol..."
2,3,A001,1,"Cholera due to Vibrio cholerae 01, biovar eltor","Cholera due to Vibrio cholerae 01, biovar eltor"
3,4,A009,1,"Cholera, unspecified","Cholera, unspecified"
4,5,A01,0,Typhoid and paratyphoid fevers,Typhoid and paratyphoid fevers


In [4]:
# Split every code name into positional components: the first character, the
# next two characters, and then one character per deeper position. A deeper
# column holds None when the code is too short to have that position.
code_names = icd10_df['NAME']
icd10_df['zero_level'] = code_names.apply(lambda name: name[0])
icd10_df['first_level'] = code_names.apply(lambda name: name[1:3])
deeper_level_columns = ['second_level', 'third_level', 'fourth_level', 'fifth_level']
for char_pos, column in enumerate(deeper_level_columns, start=3):
    icd10_df[column] = code_names.apply(
        lambda name, char_pos=char_pos: name[char_pos] if len(name) > char_pos else None)

In [5]:
def get_level(code):
    """Return how many of the per-level entries of `code` are populated.

    `code` is any row-like object indexable by the five ``*_level`` keys.
    An entry counts towards the level when its value is not None.
    """
    level_columns = ('first_level', 'second_level', 'third_level',
                     'fourth_level', 'fifth_level')
    return sum(code[column] is not None for column in level_columns)

# Depth of each code = number of populated level columns in its row.
icd10_df['level'] = icd10_df.apply(get_level, axis=1)


In [6]:
# Preview the table again, now including the derived level columns.
icd10_df.head()

Unnamed: 0,nr,NAME,leaf,DESC,DESCRIPTION,zero_level,first_level,second_level,third_level,fourth_level,fifth_level,level
0,1,A00,0,Cholera,Cholera,A,0,,,,,1
1,2,A000,1,"Cholera due to Vibrio cholerae 01, biovar chol...","Cholera due to Vibrio cholerae 01, biovar chol...",A,0,0.0,,,,2
2,3,A001,1,"Cholera due to Vibrio cholerae 01, biovar eltor","Cholera due to Vibrio cholerae 01, biovar eltor",A,0,1.0,,,,2
3,4,A009,1,"Cholera, unspecified","Cholera, unspecified",A,0,9.0,,,,2
4,5,A01,0,Typhoid and paratyphoid fevers,Typhoid and paratyphoid fevers,A,1,,,,,1


In [7]:
# One DataFrame per code depth (levels 1 through 5).
(icd10_first_levels_df,
 icd10_second_levels_df,
 icd10_third_levels_df,
 icd10_fourth_levels_df,
 icd10_fifth_levels_df) = (icd10_df[icd10_df['level'] == depth] for depth in range(1, 6))

## Get JSON ranges

In [8]:
# JSON file describing the high-level code ranges (the upper part of the tree).
high_level_icd10_filename = f'{DATA_PATH}ICD10_fixed.json'

with open(high_level_icd10_filename, 'r') as json_file:
    icd10_dict = json.load(json_file)

In [9]:
# Give every range node comparable 'lower'/'upper' bounds. Mid-level nodes keep
# their range under the key ' name' (with a leading space) in the JSON, so it is
# also copied to 'name', and each mid-level node records its parent range.
for high_level in icd10_dict['ICD']:
    high_lower, high_upper = high_level['name'].split('-')
    # Appending 'z' is a hack so that e.g. A192 still sorts under the A19 bound.
    high_level.update(lower=high_lower, upper=high_upper + 'z')
    for mid_level in high_level['children']:
        mid_lower, mid_upper = mid_level[' name'].split('-')
        mid_level.update(lower=mid_lower,
                         upper=mid_upper + 'z',
                         name=mid_level[' name'],
                         parent=high_level['name'])
        

## <span style="background-color: #FFFF00"> A few fixes and special cases:  </span>

* M05-M1A - This range can also be written as M05-M14, which is what we have in the JSON file, so we handle it by placing M1A under M05-M14.
* C43-C4A - Same case as M05-M1A
* Z30-Z3A - Very similar to M05-M1A, we have Z30-Z39 instead, and Z3A goes under there.
* J96-J99 - Added to the JSON file
* O00-O08 and O00-O09 - were under N instead of O, moved them to the correct places in the JSON file.
* R00-R09 - was under Q instead of R, I moved it to the correct place in the JSON file 

#### Other non-trivial cases that were handled:
* 'X' as a placeholder in codes, so we can have H410->H410X1, O32->O32XX1 and even a deeper level with XXX
* Nested ranges:  A00-B99->A00-A09

## <span style="background-color: #FFFF00"> Special cases that still need to be handled: </span>
* C7A-C7A
* C7B-C7B
* D3A-D3A

## <span style="background-color: #FFFF00"> Other issues: </span>
* M04-M04 - M04 is not present in the CSV file. This is a new code (more recent than the provided data).
* M97-M97 - The [codes](https://www.icd10data.com/ICD10CM/Codes/M00-M99/M97-M97/M97-) are not present in the CSV file. This is a new code (more recent than the provided data).
* X50-X50 - X50 is not present in the CSV file. This is a new code (more recent than the provided data).
* Z19-Z19 - Z19 is not present in the CSV file. This is a new code (more recent than the provided data).

In [10]:
def populate_code_aux(code, curr_level):   
    """Insert `code` into the tree somewhere below the nodes of `curr_level`.

    Every node of `curr_level` is tested for whether `code` belongs under it.
    When the node turns out to be the direct parent, a new child node is
    appended to it and the function returns. Otherwise the search recurses
    into that node's children and afterwards continues with the remaining
    nodes of `curr_level`.

    Args:
        code: Row-like object indexable by 'NAME' (the code string) and
            'DESCRIPTION'.
        curr_level: List of node dicts. Each node has 'name' and 'children';
            nodes may additionally carry 'lower' and 'upper' string bounds.

    Returns:
        None. The tree is modified in place.
    """
    code_name = code['NAME']
    for c in curr_level:
        c_name = c['name']
        # The code belongs under `c` when any of these holds:
        #   * `c` has bounds and the code lies between them (plain string comparison),
        #   * the code starts with the name of `c`,
        #   * it is one of the special cases C4A / M1A / Z3A, which are placed
        #     under C43-C44 / M05-M14 / Z30-Z39 respectively.
        is_in_child = ((('lower' in c and 'upper' in c) and (code_name >= c['lower'] and code_name <= c['upper'])) or 
                       (code_name[:len(c_name)] == c_name) or 
                       (code_name[:3] == 'C4A' and c_name == 'C43-C44') or
                       (code_name[:3] == 'M1A' and c_name == 'M05-M14') or
                       (code_name[:3] == 'Z3A' and c_name == 'Z30-Z39'))
        if is_in_child:
            # This is the correct child.
            # Inspect the children `c` already has: does it hold nested ranges,
            # and does it already hold codes of the next level?
            does_have_range_child = False
            does_have_next_level_child = False
            for grndchild in c['children']:
                if '-' in grndchild['name']:
                    does_have_range_child = True
                elif ((grndchild['name'][:-1] == c_name) or 
                      ('-' in c_name and len(grndchild['name']) == len(c['lower']))):
                    does_have_next_level_child = True
                    
            # `c` is the direct parent of the code when any of these holds:
            #   1. Regular case: `c` is not a range and the code is the name of
            #      `c` plus one character (e.g. A4030->A40301).
            #   2. The special cases C43-C44->C4A, M05-M14->M1A, Z30-Z39->Z3A.
            #   3. Placeholder cases: `c` has no next-level child and the code is
            #      the name of `c` plus 2, 3 or 4 characters while containing
            #      'X', 'XX' or 'XXX' (e.g. H410->H410X1, O32->O32XX1).
            #   4. Innermost range: `c` has bounds and no range child, and the
            #      code is as long as the lower bound (e.g. A00-B99->A00-A09->A00,
            #      specifically A00-A09->A00).
            # NOTE: the placeholder conditions won't work if we have something
            # like A00-A10->A00X1->A00X12, so we need to verify there are no
            # such cases in ICD 10.
            if (('-' not in c_name and code_name[:-1] == c_name) or
                (code_name == 'C4A' and c_name == 'C43-C44') or
                (code_name == 'M1A' and c_name == 'M05-M14') or
                (code_name == 'Z3A' and c_name == 'Z30-Z39') or
                (not does_have_next_level_child and code_name[:-2] == c_name and 'X' in code_name) or 
                (not does_have_next_level_child and code_name[:-3] == c_name and 'XX' in code_name) or
                (not does_have_next_level_child and code_name[:-4] == c_name and 'XXX' in code_name) or
                ('-' not in code_name and 'lower' in c and 'upper' in c and len(code_name) == len(c['lower']) and not does_have_range_child)): 
                # current child is the parent level. Let's populate and return
                c['children'].append({'children': [],
                                      'name': code_name,
                                      'description': code['DESCRIPTION'], 
                                      'parent': c['name']})
                return
            else:
                # `c` is only an ancestor: keep searching one level deeper.
                populate_code_aux(code, c['children'])

def populate_code_in_tree(code):
    """Insert one code row into the tree held in the global `icd10_dict`."""
    top_level_nodes = icd10_dict['ICD']
    populate_code_aux(code, top_level_nodes)

In [11]:
# Insert the level-1 codes; deeper codes are attached to these nodes afterwards.
_ = icd10_first_levels_df.apply(populate_code_in_tree, axis=1)

In [12]:
# Insert the level-2 codes (their parent nodes must already be in the tree).
_ = icd10_second_levels_df.apply(populate_code_in_tree, axis=1)

In [13]:
# Insert the level-3 codes (their parent nodes must already be in the tree).
_ = icd10_third_levels_df.apply(populate_code_in_tree, axis=1)

In [14]:
# Insert the level-4 codes (their parent nodes must already be in the tree).
_ = icd10_fourth_levels_df.apply(populate_code_in_tree, axis=1)

In [15]:
# Insert the level-5 codes (their parent nodes must already be in the tree).
_ = icd10_fifth_levels_df.apply(populate_code_in_tree, axis=1)

In [16]:
# Wrap the top-level nodes in a single root node named 'ICD10'.
icd10 = dict(name='ICD10', children=icd10_dict['ICD'])

In [17]:
def remove_redundant_items(tree):
    """Recursively drop the keys ' name', 'nr', 'level0' and ' level1'.

    The keys are removed in place from `tree` and from every descendant
    reachable through 'children'. Keys that are absent are ignored.
    """
    for stale_key in (' name', 'nr', 'level0', ' level1'):
        tree.pop(stale_key, None)
    for subtree in tree['children']:
        remove_redundant_items(subtree)

In [18]:
# Strip the leftover keys from every node of the tree, in place.
remove_redundant_items(icd10)

In [25]:
def set_nodes_indices(tree, levels):
    """Store on every descendant of `tree` its index path from the root.

    Each child receives a 'child_idx' list made of `levels` followed by the
    child's position among its siblings; the same path is then used as the
    prefix for that child's own children.
    """
    for position, node in enumerate(tree['children']):
        path_to_node = levels + [position]
        node['child_idx'] = path_to_node
        set_nodes_indices(node, path_to_node)

In [26]:
# Annotate every node with its index path, starting from an empty path at the root.
set_nodes_indices(icd10, [])

In [27]:
# Inspect the nodes stored under the first child of the first top-level node.
icd10['children'][0]['children'][0]['children']

[{'children': [{'children': [],
    'name': 'A000',
    'description': 'Cholera due to Vibrio cholerae 01, biovar cholerae',
    'parent': 'A00',
    'child_idx': [0, 0, 0, 0]},
   {'children': [],
    'name': 'A001',
    'description': 'Cholera due to Vibrio cholerae 01, biovar eltor',
    'parent': 'A00',
    'child_idx': [0, 0, 0, 1]},
   {'children': [],
    'name': 'A009',
    'description': 'Cholera, unspecified',
    'parent': 'A00',
    'child_idx': [0, 0, 0, 2]}],
  'name': 'A00',
  'description': 'Cholera',
  'parent': 'A00-A09',
  'child_idx': [0, 0, 0]},
 {'children': [{'children': [{'children': [],
      'name': 'A0100',
      'description': 'Typhoid fever, unspecified',
      'parent': 'A010',
      'child_idx': [0, 0, 1, 0, 0]},
     {'children': [],
      'name': 'A0101',
      'description': 'Typhoid meningitis',
      'parent': 'A010',
      'child_idx': [0, 0, 1, 0, 1]},
     {'children': [],
      'name': 'A0102',
      'description': 'Typhoid fever with heart invol

In [28]:
# Verify these are the correct indices: the index path used here should equal
# the 'child_idx' stored on the node that is returned.
icd10['children'][0]['children'][0]['children'][2]['children'][2]['children'][0]

{'children': [],
 'name': 'A0220',
 'description': 'Localized salmonella infection, unspecified',
 'parent': 'A022',
 'child_idx': [0, 0, 2, 2, 0]}

In [31]:
# Add value=1 to all leaves
def add_default_values(tree):
    """Set ``value = 1`` on every node of `tree` whose 'children' list is empty.

    Nodes that have children are left without a 'value' key. The tree is
    modified in place.
    """
    pending = [tree]
    while pending:
        node = pending.pop()
        subtrees = node['children']
        if subtrees == []:
            node['value'] = 1
        pending.extend(subtrees)

In [32]:
# Give every leaf of the tree the default value 1, in place.
add_default_values(icd10)

In [33]:
# Write the finished tree to icd10_full.json under DATA_PATH.
output_path = f'{DATA_PATH}icd10_full.json'
with open(output_path, 'w') as out_file:
    json.dump(icd10, out_file)

# Verifications

In [None]:
 def operate_on_leaves(tree, do_fn):
     """Apply `do_fn` to every leaf below `tree` and report any truthy result.

     A node is a leaf when its 'children' list is empty. For a leaf, the
     return value of `do_fn` is passed through unchanged. For an inner node
     the result is True when `do_fn` was truthy for at least one leaf below
     it, otherwise False. Every leaf is always visited.
     """
     if tree['children'] == []:
         return do_fn(tree)
     # Build the complete list first so no subtree is skipped by short-circuiting.
     subtree_results = [operate_on_leaves(subtree, do_fn) for subtree in tree['children']]
     return any(subtree_results)

In [None]:
 def operate_on_all_nodes(tree, do_fn):
     """Call `do_fn` on `tree` and then on every descendant, parents first.

     The return values of `do_fn` are ignored and nothing is returned.
     """
     do_fn(tree)
     for subtree in tree['children']:
         operate_on_all_nodes(subtree, do_fn)

In [None]:
def verify_leaf(node):
    """Check that a leaf of the tree is also flagged as a leaf in the CSV.

    Nodes whose name contains '-' are ranges and are skipped. For any other
    node the row with the same 'NAME' is looked up in the global `icd10_df`
    and its 'leaf' flag must equal 1.

    Args:
        node: Tree node dict with at least a 'name' key.

    Raises:
        AssertionError: If the code is missing from `icd10_df` or its 'leaf'
            flag is not 1. The node name and the matching rows are printed
            first to help with debugging.
    """
    node_name = node['name']
    if '-' in node_name:
        return
    matching_rows = icd10_df[icd10_df['NAME'] == node_name]
    leaf_flags = matching_rows['leaf']
    # An empty selection means the code does not exist in the CSV at all.
    if leaf_flags.empty or leaf_flags.iloc[0] != 1:
        print(f'The current node is {node_name}')
        print(matching_rows)
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        raise AssertionError(f'The current node leaf series is {leaf_flags}')

In [None]:
# Run the leaf check on every leaf of the tree.
operate_on_leaves(icd10, verify_leaf)

In [None]:
def count_leaves(leaf):
    """Tally `leaf` in the globals `num_of_leaves` and `all_leaves`.

    `num_of_leaves` is incremented by one and the per-name counter
    `all_leaves[leaf['name']]` is incremented by one.
    """
    global all_leaves, num_of_leaves
    num_of_leaves = num_of_leaves + 1
    all_leaves[leaf['name']] += 1

In [None]:
# Reset the counters that count_leaves updates, then visit every leaf once.
all_leaves = defaultdict(int)  # leaf name -> number of times it was seen
num_of_leaves = 0              # total number of leaves visited
operate_on_leaves(icd10, count_leaves)
print(f'Number of leaves in the tree: {num_of_leaves}')

In [None]:
# Verify each leaf code only appears once
for leaf, occurrences in all_leaves.items():
    if occurrences == 1:
        continue
    # Report any leaf name that was counted more (or less) than exactly once.
    print(leaf)
    print(occurrences)
    print('====')

In [None]:
# Compare the leaf names found in the tree with the codes flagged as leaves in the CSV.
leaves_names = set(all_leaves)
csv_leaves = set(icd10_df.loc[icd10_df['leaf'] == 1, 'NAME'])

unexpected_leaves = leaves_names - csv_leaves
missing_leaves = csv_leaves - leaves_names
print(f'The following nodes are leaves in the tree, but shouldnt be:\n{unexpected_leaves}\n')
print(f'The following nodes are not leaves in the tree, but should be:\n{missing_leaves}')

In [None]:
# Show the CSV rows whose code sorts in ['D3A', 'D3B'). Both conditions are
# combined into one mask: chaining two boolean selections makes pandas reindex
# the second mask against the already-filtered frame and emit a warning.
icd10_df[(icd10_df['NAME'] >= 'D3A') & (icd10_df['NAME'] < 'D3B')]

In [None]:
# Name that find_leaf looks for; it is rebound before each search.
node_name = ''


def find_leaf(node):
    """Return True when `node` carries the name held in the global `node_name`.

    Returns None (not False) when the names differ.
    """
    return True if node['name'] == node_name else None

In [None]:
# Search the leaves for the name currently stored in `node_name`.
operate_on_leaves(icd10, find_leaf)


In [None]:
def find_node(node):
    """Pretty-print `node` and return True when its name is 'D3A'.

    Any other node is ignored and None is returned.
    """
    if node['name'] != 'D3A':
        return None
    pp.pprint(node)
    return True

In [None]:
# Look through every node (not only leaves) and print the 'D3A' node if present.
operate_on_all_nodes(icd10, find_node)

In [None]:
# Check that every code flagged as a leaf in the CSV is also a leaf of the tree.
# The tree's leaf names are gathered once into a set, so each CSV code costs a
# single hash lookup instead of a full traversal of the tree.
tree_leaf_names = set()
operate_on_leaves(icd10, lambda tree_leaf: tree_leaf_names.add(tree_leaf['name']))
for leaf in icd10_df[icd10_df['leaf'] == 1]['NAME']:
    # `node_name` and `node_found` are still assigned so later cells that
    # inspect them see the same values as before.
    node_name = leaf
    node_found = node_name in tree_leaf_names
    if not node_found:
        print(f'Node {node_name} was not found in tree!')

In [None]:
# Check whether the code M04 is present in the CSV.
icd10_df[icd10_df['NAME'] == 'M04']

In [None]:
# Check whether a row named exactly 'K' is present in the CSV.
icd10_df[icd10_df['NAME'] == 'K']

In [None]:
# Show the CSV rows whose code sorts strictly between 'A00' and 'A01'. Both
# conditions are combined into one mask: chaining two boolean selections makes
# pandas reindex the second mask against the filtered frame and emit a warning.
icd10_df[(icd10_df['NAME'] > 'A00') & (icd10_df['NAME'] < 'A01')]