In [51]:
# Converts JSON file to a dataframe format outlined by the research paper.

In [52]:
import os
import pandas as pd
import numpy as np
import textwrap
import datetime
import pytz
import json
import re

DATA_FILE = "data_storage.json"


In [53]:
# F(x): Initialize the data storage dictionary

def load_data(filename=DATA_FILE):
    
    if os.path.exists(filename):
        print(f"{filename} found. Loading data...")
        with open(filename, 'r') as file:
            data = json.load(file)
        return data
    else:
        print(f"{filename} not found. Initializing empty dictionary...")
        return {}

def save_data(data, filename=DATA_FILE):
    # Save data to a file    
    with open(filename, 'w') as file:
        json.dump(data, file)

In [90]:
# F(x): Extract ICD probabilities from tokens

def extract_icd_probabilities(logprobs, output="simple"):
    parsed_icd = []
    parsed_icd_logprobs = []
    for pos in range(len(logprobs)):
        temp_df = pd.DataFrame(logprobs[pos: pos+4])
        temp_df = temp_df[temp_df[0].notna() & (temp_df[0].str.strip() != '')]
        temp_df = temp_df[temp_df[0].str.strip() != '\n']
        temp_concat = ''.join(temp_df.iloc[:, 0]).strip()
        if len(temp_concat) > 9:
            continue
        # pattern = r'^[A-Z]\d{0,4}(\.\d{0,4})?$'
        pattern_4part = r'^[A-Z]\d{0,4}(\.\d{1,4})?$'
        match = re.match(pattern_4part, temp_concat)

        if match:
            # print(f"{temp_concat} - valid ICD {np.round((np.exp(temp_df.iloc[:, 1]).mean())*100,2)}%")
            # print(f"**** {temp_concat} - VALID 2-parts ICD ****")
            parsed_icd.append((temp_concat, (np.exp(temp_df.iloc[:, 1]).mean())))
            # display(temp_df)
            parsed_icd_logprobs.append(temp_df.rename(columns={0: 'token', 1:'logprob'}).to_dict())
        else:
            # print(f"{temp_concat} - invalid 4-parts.")
            
            #trying 2-parts
            temp_df = pd.DataFrame(logprobs[pos: pos+2])
            temp_concat = ''.join(temp_df.iloc[:, 0]).strip()
            pattern_2part = r'^[A-Z]\d{1,4}$'
            match = re.match(pattern_2part, temp_concat)
            if match:
                # print(f"**** {temp_concat} - VALID 2-parts ICD ****")
                parsed_icd.append((temp_concat, (np.exp(temp_df.iloc[:, 1]).mean())))
                parsed_icd_logprobs.append(temp_df.rename(columns={0: 'token', 1:'logprob'}).to_dict())
            else:
                # print(f"{temp_concat} - invalid 2-parts.")
                pass
            pass
    if output == "logprobs":
        return parsed_icd, parsed_icd_logprobs
    else:
        return parsed_icd

In [60]:
# Load JSON data and convert to dataframe
data_storage = load_data()
df = pd.DataFrame(data_storage).T

# to speed up processing, we will sample 100 rows
df = df.sample(5)

data_storage.json found. Loading data...


In [61]:
# Showing `output_msg` that exceeds ICD length
abnormal_output_df = df[df['output_msg'].apply(lambda x:len(x) > 8)][['output_msg']]
print(f"{abnormal_output_df.shape[0]} rowids with output_msg exceeding normal ICD length")
print("Example:")
print(abnormal_output_df.head(5))
# df[df['output_msg'].apply(lambda x:len(x) > 8)][['output_msg','icds','best_icd']]

0 rowids with output_msg exceeding normal ICD length
Example:
Empty DataFrame
Columns: [output_msg]
Index: []


In [91]:
# df['icds'] = df.apply(lambda x: extract_icd_probabilities(x['logprobs'], output="logprobs"), axis=1)
df[['icds', 'parsed_icd_logprobs']] = df.apply(lambda x: pd.Series(extract_icd_probabilities(x['logprobs'], output="logprobs")), axis=1)
df['best_icd'] = df.apply(lambda x: pd.DataFrame(x['icds']).sort_values(by=1, ascending=False).iloc[0,0], axis=1)

df

Unnamed: 0,rowid,model,system_prompt,user_prompt,output_msg,logprobs,usage,timestamp,icds,parsed_icd_logprobs,best_icd
14006397,14006397,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",J06.9,"[[J, -0.00024180108], [06, -2.1696966], [., -0...","[[completion_tokens, 4], [prompt_tokens, 375],...",2024-02-15T16:50:48.691238-05:00,"[(J06.9, 0.7680458261659215)]","[{'token': {0: 'J', 1: '06', 2: '.', 3: '9'}, ...",J06.9
24003445,24003445,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",P76,"[[P, -0.25440565], [76, -0.11827476]]","[[completion_tokens, 2], [prompt_tokens, 448],...",2024-02-15T17:27:13.288819-05:00,"[(P76, 0.8319145572542279)]","[{'token': {0: 'P', 1: '76'}, 'logprob': {0: -...",P76
24000143,24000143,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",G40,"[[G, -0.24650367], [40, -0.4446733]]","[[completion_tokens, 2], [prompt_tokens, 448],...",2024-02-15T17:29:03.091295-05:00,"[(G40, 0.7112810804238539)]","[{'token': {0: 'G', 1: '40'}, 'logprob': {0: -...",G40
24003233,24003233,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",B05,"[[B, -0.023817662], [05, -0.008071461]]","[[completion_tokens, 2], [prompt_tokens, 426],...",2024-02-15T17:31:10.919369-05:00,"[(B05, 0.9842123828746623)]","[{'token': {0: 'B', 1: '05'}, 'logprob': {0: -...",B05
14005597,14005597,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",P95,"[[P, -0.009849582], [95, -0.15384461]]","[[completion_tokens, 2], [prompt_tokens, 378],...",2024-02-15T17:39:57.503452-05:00,"[(P95, 0.9238020045635259)]","[{'token': {0: 'P', 1: '95'}, 'logprob': {0: -...",P95


In [92]:
pd.DataFrame(df.parsed_icd_logprobs[0][0])

  pd.DataFrame(df.parsed_icd_logprobs[0][0])


Unnamed: 0,token,logprob
0,J,-0.000242
1,06,-2.169697
2,.,-0.038737
3,9,-0.003798


In [74]:
df.parsed_icd_logprobs[0][0]

  df.parsed_icd_logprobs[0][0]


'0,1\nJ,-0.00024180108\n06,-2.1696966\n.,-0.038737383\n9,-0.0037977037\n'

In [16]:
df[['cause1_icd10', 
    'cause1_icd10_prob', 
    'cause2_icd10', 
    'cause2_icd10_prob', 
    'cause3_icd10', 
    'cause3_icd10_prob', 
    'cause4_icd10', 
    'cause4_icd10_prob', 
    'cause6_icd10', 
    'cause6_icd10_prob']] = np.nan


In [25]:
df

Unnamed: 0,rowid,model,system_prompt,user_prompt,output_msg,logprobs,usage,timestamp,icds,best_icd
14002421,14002421,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",I10,"[[I, -0.21676947], [10, -1.5235081]]","[[completion_tokens, 2], [prompt_tokens, 375],...",2024-02-15T14:20:38.369528-05:00,"[(I10, 0.5115307596141437)]",I10
14005966,14005966,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",S06.5,"[[S, -0.05755939], [06, -0.111066304], [., -0....","[[completion_tokens, 4], [prompt_tokens, 294],...",2024-02-15T14:20:38.913306-05:00,"[(S06.5, 0.8143644855324024)]",S06.5
14001514,14001514,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",B54,"[[B, -0.0049721203], [54, -0.46672902]]","[[completion_tokens, 2], [prompt_tokens, 363],...",2024-02-15T14:20:39.477378-05:00,"[(B54, 0.8110451028130556)]",B54
14009193,14009193,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",I10,"[[I, -0.0077422047], [10, -0.15347986]]","[[completion_tokens, 2], [prompt_tokens, 539],...",2024-02-15T14:20:39.993528-05:00,"[(I10, 0.9250028637177969)]",I10
14002210,14002210,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",I64,"[[I, -0.0280739], [64, -0.6882239]]","[[completion_tokens, 2], [prompt_tokens, 366],...",2024-02-15T14:20:40.480268-05:00,"[(I64, 0.7373921099328329)]",I64
...,...,...,...,...,...,...,...,...,...,...
24002039,24002039,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",P91.0,"[[P, -0.0043068035], [91, -0.22353165], [., -0...","[[completion_tokens, 4], [prompt_tokens, 537],...",2024-02-15T21:35:43.111606-05:00,"[(P91.0, 0.6976464789585074)]",P91.0
24002598,24002598,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",P22.9,"[[P, -0.00081963453], [22, -0.07494948], [., -...","[[completion_tokens, 4], [prompt_tokens, 414],...",2024-02-15T21:35:43.618831-05:00,"[(P22.9, 0.7536376732273786)]",P22.9
24001849,24001849,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",P91,"[[P, -0.00037526153], [91, -1.1379366]]","[[completion_tokens, 2], [prompt_tokens, 277],...",2024-02-15T21:35:44.080042-05:00,"[(P91, 0.6600522132800508)]",P91
24000702,24000702,gpt-3.5-turbo-0125,You are a physician with expertise in determin...,"With the highest certainty, determine the unde...",P02.1,"[[P, -0.000562327], [02, -0.31716514], [., -0....","[[completion_tokens, 4], [prompt_tokens, 463],...",2024-02-15T21:35:45.138545-05:00,"[(P02.1, 0.8840350763999058)]",P02.1
