In [109]:
# Converts JSON file to a dataframe format outlined by the research paper.

In [110]:
import os
import pandas as pd
import numpy as np
import textwrap
import datetime
import pytz
import json
import re

DATA_FILE = "data_storage.json"

# Number of paired ICDs and probabilities we want to capture
PAIRS = 5



In [111]:
# F(x): Initialize the data storage dictionary

def load_data(filename=DATA_FILE):
    
    if os.path.exists(filename):
        print(f"{filename} found. Loading data...")
        with open(filename, 'r') as file:
            data = json.load(file)
        return data
    else:
        print(f"{filename} not found. Initializing empty dictionary...")
        return {}

def save_data(data, filename=DATA_FILE):
    # Save data to a file    
    with open(filename, 'w') as file:
        json.dump(data, file)

In [None]:
# # BACKUP
# # F(x): Extract ICD probabilities from tokens

# def extract_icd_probabilities(logprobs, output="simple"):
#     parsed_icd = []
#     parsed_icd_logprobs = []
#     for pos in range(len(logprobs)):
#         print(logprobs[pos: pos+4])
#         temp_df = pd.DataFrame(logprobs[pos: pos+4])
#         display(temp_df)
#         temp_df = temp_df[temp_df[0].notna() & (temp_df[0].str.strip() != '')]
#         temp_df = temp_df[temp_df[0].str.strip() != '\n']
#         temp_concat = ''.join(temp_df.iloc[:, 0]).strip()
#         if len(temp_concat) > 9:
#             continue
#         # pattern = r'^[A-Z]\d{0,4}(\.\d{0,4})?$'
#         pattern_4part = r'^[A-Z]\d{0,4}(\.\d{1,4})?$'
#         match = re.match(pattern_4part, temp_concat)

#         if match:
#             # print(f"{temp_concat} - valid ICD {np.round((np.exp(temp_df.iloc[:, 1]).mean())*100,2)}%")
#             # print(f"**** {temp_concat} - VALID 2-parts ICD ****")
#             parsed_icd.append((temp_concat, (np.exp(temp_df.iloc[:, 1]).mean())))
#             # display(temp_df)
#             parsed_icd_logprobs.append(temp_df.rename(columns={0: 'token', 1:'logprob'}).to_dict(orient='list'))
#         else:
#             # print(f"{temp_concat} - invalid 4-parts.")
            
#             #trying 2-parts
#             temp_df = pd.DataFrame(logprobs[pos: pos+2])
#             temp_concat = ''.join(temp_df.iloc[:, 0]).strip()
#             pattern_2part = r'^[A-Z]\d{1,4}$'
#             match = re.match(pattern_2part, temp_concat)
#             if match:
#                 # print(f"**** {temp_concat} - VALID 2-parts ICD ****")
#                 parsed_icd.append((temp_concat, (np.exp(temp_df.iloc[:, 1]).mean())))
#                 parsed_icd_logprobs.append(temp_df.rename(columns={0: 'token', 1:'logprob'}).to_dict(orient='list'))
#             else:
#                 # print(f"{temp_concat} - invalid 2-parts.")
#                 pass
#             pass
#     if output == "logprobs":
#         return parsed_icd, parsed_icd_logprobs
#     else:
#         return parsed_icd

In [227]:
# F(x): Extract ICD probabilities from tokens

test = [['A', -0.63648945],  ['09', -1.4643841], ['\n', -0.9866263], ['R', -0.6599979], ['50', -1.5362289],
 ['.', -0.05481864],  ['9', -0.002321772], ['\n', -0.3524723], ['R', -0.56709456], ['11', -1.263591],
 ['.', -0.05834798], ['0', -0.73551023], ['\n', -0.5051807], ['R', -0.65759194], ['63', -1.0282977],
 ['.', -0.0006772888], ['4', -0.71002203]]

def extract_icd_probabilities(logprobs, debug=False):
    parsed_icds = []
    tmp_df = pd.DataFrame(logprobs)
    if debug > 0:
        print(repr(''.join(tmp_df.iloc[:,0])))
    tmp_df_limit = len(tmp_df)
    for pos in range(tmp_df_limit):
        temp_concat_ANN = ''.join(tmp_df.iloc[pos:pos+2, 0]) # concatenate 2 parts

        temp_concat_ANN_NNN = ''.join(tmp_df.iloc[pos:pos+4, 0]) # concatenate 4 parts
        temp_concat_ANN_NNN_A = ''.join(tmp_df.iloc[pos:pos+5, 0]) # concatenate 4 parts
        
        # Reference: https://www.webpt.com/blog/understanding-icd-10-code-structure
        pattern_ANN = r"^[A-Z]\d{2}$" # pattern for ANN
        pattern_ANN_NNN = r"^[A-Z]\d{2}\.\d{1,3}$" # pattern for ANN.NNN
        pattern_ANN_NNN_A = r"^[A-Z]\d{2}\.\d{3}[A-Z]$" # pattern for ANN.NNNA. For the last alphabet to be valid, there must be 6 previous characters
        
        match_ANN = re.match(pattern_ANN, temp_concat_ANN)
        match_ANN_NNN = re.match(pattern_ANN_NNN, temp_concat_ANN_NNN)
        match_ANN_NNN_A = re.match(pattern_ANN_NNN_A, temp_concat_ANN_NNN_A)
        
        # print result for debug
        if debug == 2:
            print(str(pos).ljust(4), repr(temp_concat_ANN).ljust(10), ('yes' if match_ANN else 'no').ljust(15), repr(temp_concat_ANN_NNN).ljust(10), ('yes' if match_ANN_NNN else 'no').ljust(15), repr(temp_concat_ANN_NNN_A).ljust(10), ('yes' if match_ANN_NNN_A else 'no').ljust(5))
        
        # Assumption: assumes a valid ICD would span 2, 4, or 5 tokens
        if match_ANN_NNN_A:
            winning_df = pd.DataFrame(logprobs[pos:pos+5])
            winning_icd = temp_concat_ANN_NNN_A            
        elif match_ANN_NNN:
            winning_df = pd.DataFrame(logprobs[pos:pos+4])
            winning_icd = temp_concat_ANN_NNN            
        elif match_ANN:
            winning_df = pd.DataFrame(logprobs[pos:pos+2])
            winning_icd = temp_concat_ANN            
        else:
            continue
        
        if debug == 2:
            print(f"**** {winning_icd} - VALID ICD ****")
            display(winning_df)
        
        winning_mean = np.exp(winning_df.iloc[:, 1]).mean()
        
        winning_package = {
            'icd': winning_icd,
            'icd_linprob_mean': winning_mean,
            'logprobs': winning_df.rename(columns={0: 'token', 1:'logprob'}).to_dict(orient='list')
        }
        
        parsed_icds.append(winning_package)
        
    if debug > 0:
        display(parsed_icds) 
    return parsed_icds
    


test_output = extract_icd_probabilities(test)
test_output


[{'icd': 'A09',
  'icd_linprob_mean': 0.3801835598412292,
  'logprobs': {'token': ['A', '09'], 'logprob': [-0.63648945, -1.4643841]}},
 {'icd': 'R50.9',
  'icd_linprob_mean': 0.6690953098673259,
  'logprobs': {'token': ['R', '50', '.', '9'],
   'logprob': [-0.6599979, -1.5362289, -0.05481864, -0.002321772]}},
 {'icd': 'R11.0',
  'icd_linprob_mean': 0.5680976690938437,
  'logprobs': {'token': ['R', '11', '.', '0'],
   'logprob': [-0.56709456, -1.263591, -0.05834798, -0.73551023]}},
 {'icd': 'R63.4',
  'icd_linprob_mean': 0.5916672403792442,
  'logprobs': {'token': ['R', '63', '.', '4'],
   'logprob': [-0.65759194, -1.0282977, -0.0006772888, -0.71002203]}}]

In [170]:
# Load JSON data and convert to dataframe
data_storage = load_data()
df = pd.DataFrame(data_storage).T

# to speed up processing, we will sample 100 rows
df = df.loc[['14004747', '14002839', 
             '14002323', '14001355', '14000201', '14005633',
       '24000550', '24002181', '24000721', '24000129', '24000117', '24000186',
       '14002203', '14006139', '24003520',
       '14002421', '14009193', '24002598',
       ]]
# df = df.sample(500)

data_storage.json found. Loading data...


In [95]:
# # Finding extract long outputs
# df[df.icds.apply(lambda x: len(x)) > 2].index

Index(['14004747', '14002839', '14002323', '14001355', '14000201', '14005633',
       '24000550', '24002181', '24000721', '24000129', '24000117', '24000186',
       '14002203', '14006139', '24003520'],
      dtype='object')

In [218]:
# # Quickly test the mean of logprobs
# np.mean(np.exp(pd.DataFrame([
#     ["V", -0.80707335],
#       ["89", -0.5674744],
#       [".", -0.07485282],
#       ["2", -0.049951375],
#     ]).iloc[:,1]))

0.7230682932105017

In [225]:
# extract information from logprobs
df['output'] = df['logprobs'].apply(extract_icd_probabilities)

In [114]:
# # Showing `output_msg` that exceeds ICD length
# abnormal_output_df = df[df['output_msg'].apply(lambda x:len(x) > 8)][['output_msg']]
# print(f"{abnormal_output_df.shape[0]} rowids with output_msg exceeding normal ICD length")
# print("Example:")
# print(abnormal_output_df.head(5))
# # df[df['output_msg'].apply(lambda x:len(x) > 8)][['output_msg','icds','best_icd']]

In [None]:
# # Old function to extract icds from output, and save the best icd to a new column
# df[['icds', 'parsed_icd_logprobs']] = df.apply(lambda x: pd.Series(extract_icd_probabilities(x['logprobs'], output="logprobs")), axis=1)
# df['best_icd'] = df.apply(lambda x: pd.DataFrame(x['icds']).sort_values(by=1, ascending=False).iloc[0,0], axis=1)

In [107]:
# F(x): Given a list of ICDs in form of a list of tuples, convert each ICD into 1-dimension Series

# e.g. 24000721, 14002323
# df.loc[['24000721','24002173','14000052']]


def explode_icds(value, pairs=PAIRS):
    tmp = pd.DataFrame(value) # convert list of tuples to dataframe
    tmp = tmp.sort_values(by=1, ascending=False) # sort by descending probability
    tmp = tmp.stack().reset_index(drop=True) # convert to 1 row
    tmp = tmp.reindex(range(pairs*2), axis=1) # pad to fill PAIRS*2 columns
    return tmp
    # return pd.DataFrame(pd.DataFrame(value)[0].apply(pd.Series).stack().reset_index(drop=True)).T
    
icd_column_names_mapping = {i: f"cause{i}_icd10" if i % 2 == 0 else f"cause{i}_icd10_prob" for i in range(PAIRS*2)}


df.icds.apply(explode_icds) #.rename(columns=icd_column_names_mapping)
# df.join(df.icds.apply(explode_icds).rename(columns=icd_column_names_mapping))


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
14004747,R50.9,0.669095,R63.4,0.591667,R11.0,0.568098,A09,0.380184,,
14002839,A,0.98989,X5,0.903661,T43.6,0.900937,,,,
14002323,N17.9,0.689487,T79.3,0.668288,T88.9,0.57493,,,,
14001355,D62,0.811189,D62,0.811189,K59,0.50993,,,,
14000201,R57.0,0.761464,J18.9,0.738739,K92.1,0.685314,R11.2,0.589249,K29.5,0.53433
14005633,K65,0.401702,B50,0.355547,B50,0.355547,,,,
24000550,B50,0.612482,B50,0.612482,K65,0.533676,,,,
24002181,A,0.995012,S36.8,0.786977,X9,0.341037,,,,
24000721,J45.909,0.746794,V89.2,0.723068,T79.6,0.59259,,,,
24000129,R60.9,0.94135,I63.9,0.728296,H54,0.490848,,,,


In [238]:
# F(x): Given a list of ICDs in form of a list of tuples, convert each ICD into 1-dimension Series

def output_icds_to_cols(value, pairs=PAIRS):
    tmp = pd.DataFrame(value) # convert list of tuples to dataframe
    tmp = tmp.sort_values(by="icd_linprob_mean", ascending=False) # sort by descending probability
    tmp = tmp.drop(columns=['logprobs'])
    tmp = tmp.stack().reset_index(drop=True) # convert to 1 row
    tmp = tmp.reindex(range(pairs*2), axis=1) # pad to fill PAIRS*2 columns
    return tmp

# Test
# output_icds_to_cols(test_output)

Unnamed: 0,cause0_icd10,cause1_icd10_prob,cause2_icd10,cause3_icd10_prob,cause4_icd10,cause5_icd10_prob,cause6_icd10,cause7_icd10_prob,cause8_icd10,cause9_icd10_prob
14004747,R50.9,0.669095,R63.4,0.591667,R11.0,0.568098,A09,0.380184,,
14002839,T43.6,0.900937,,,,,,,,
14002323,N17.9,0.689487,T79.3,0.668288,T88.9,0.57493,,,,
14001355,D62,0.811189,K59,0.50993,,,,,,
14000201,R57.0,0.761464,J18.9,0.738739,K92.1,0.685314,R11.2,0.589249,K29.5,0.53433
14005633,K65,0.401702,B50,0.355547,,,,,,
24000550,B50,0.612482,K65,0.533676,,,,,,
24002181,S36.8,0.786977,,,,,,,,
24000721,J45.909,0.746794,V89.2,0.723068,T79.6,0.59259,,,,
24000129,R60.9,0.94135,I63.9,0.728296,H54,0.490848,,,,


In [None]:
# Generate column names for the exploded ICDs in cause{n}_icd10 and cause{n}_icd10_prob format
# This will be used in conjunction with the `output_icds_to_cols` fx
icd_column_names_mapping = {i: f"cause{i}_icd10" if i % 2 == 0 else f"cause{i}_icd10_prob" for i in range(PAIRS*2)}

df.output.apply(output_icds_to_cols).rename(columns=icd_column_names_mapping)

In [None]:
df[['cause1_icd10', 
    'cause1_icd10_prob', 
    'cause2_icd10', 
    'cause2_icd10_prob', 
    'cause3_icd10', 
    'cause3_icd10_prob', 
    'cause4_icd10', 
    'cause4_icd10_prob', 
    'cause6_icd10', 
    'cause6_icd10_prob']] = np.nan


In [None]:
df