In [1]:
"""
# parse_json.ipynb

This code parses JSON data created by the file that generates responses using the OpenAI API. 
Its primary function is to extract ICD-10 codes and their associated probabilities from the JSON data 
and format the output into a pandas DataFrame.

The JSON data contains the following fields:

- `rowid`: A unique identifier for each row, similar to the identifier used in HEALSL data.
- `cause1_icd10`: The primary ICD-10 code.
- `cause1_icd10_prob`: The probability associated with the primary ICD-10 code (range: 0-1).
- `cause2_icd10`: The secondary ICD-10 code (optional).
- `cause2_icd10_prob`: The probability associated with the secondary ICD-10 code (optional).
- `cause3_icd10`: The tertiary ICD-10 code (optional).
- `cause3_icd10_prob`: The probability associated with the tertiary ICD-10 code (optional).
- `cause4_icd10`, `cause4_icd10_prob`, `cause5_icd10`, `cause5_icd10_prob`: 
        Additional ICD-10 codes and their associated probabilities (optional).
- `output_created`: The timestamp when the output was generated.
- `output_model`: The model used to generate the output.
- `output_system_prompt`: The system prompt used to generate the output.
- `output_user_prompt`: The user prompt used to generate the output.
- `output_usage_completion_tokens`: The number of completion tokens used.
- `output_usage_prompt_tokens`: The number of prompt tokens used.
- `output_msg`: The raw output returned by the OpenAI API.

The `output_probs` field is a list of dictionaries. Each dictionary represents an ICD-10 code and associated data 
extracted from `output_msg`. Here's a breakdown of the dictionary structure:

- `icd`: An extracted ICD-10 code.
- `icd_linprob_mean`: The mean linear probability of the extracted ICD-10 code.
        This is calculated by converting the log probabilities of all tokens that compose the ICD-10 code 
        back to linear probabilities and then taking the mean.
- `logprobs`: A dictionary of all tokens that compose the ICD-10 code and their log probabilities.
        - `token`: The token itself.
        - `logprob`: The log probability of the token.

For example, given the following `output_msg`:
J18.9\nR60.9\nR10.4

The `output_probs` produced is:
[{'icd': 'J18.9',
  'icd_linprob_mean': 0.7436831741851213,
  'logprobs': {
        'token': ['J', '18', '.', '9'],
        'logprob': [-0.09348458, -0.4651886, -0.8219949, -0.0035963869]}},
 {'icd': 'R60.9',
  'icd_linprob_mean': 0.760383776928103,
  'logprobs': {
        'token': ['R', '60', '.', '9'],
        'logprob': [-0.88291967, -0.20200534, -0.0005493374, -0.20896938]}},
 {'icd': 'R10.4',
  'icd_linprob_mean': 0.6912214480428616,
  'logprobs': {
        'token': ['R', '10', '.', '4'],
        'logprob': [-0.69898874, -1.1874909, -5.6769813e-06, -0.03789068]}}]

The first dictionary in the list represents the ICD-10 code 'J18.9'. 
The mean linear probability of this code is approximately 0.744. 
The log probabilities of the individual tokens 'J', '18', '.', and '9' 
are -0.093, -0.465, -0.822, and -0.004, respectively.
"""
pass

In [2]:
import os
import pandas as pd
import numpy as np
import json
import re
from datetime import datetime

# Return the current date and time as a compact, filename-safe string
def get_datetime_string():
    """Return the current local date and time formatted as 'YYYYMMDD_HHMM'."""
    return datetime.now().strftime('%Y%m%d_%H%M')


# Define the name of the data file
# DATA_FILE = "data_storage.json"
DATA_FILE = "response_validation_data_storage.json"

# Take the timestamp once so the JSON and CSV exports always share the same
# suffix, even when this cell happens to run across a minute boundary.
_EXPORT_TIMESTAMP = get_datetime_string()

# NOTE(review): "resopnse" looks like a typo for "response". It is kept as-is
# because changing it would rename the files this notebook produces.
JSON_EXPORT_FILE = f"resopnse_validation_parsed_{_EXPORT_TIMESTAMP}.json"
CSV_EXPORT_FILE = f"resopnse_validation_parsed_{_EXPORT_TIMESTAMP}.csv"

# Define the number of paired ICDs and probabilities we want to capture
PAIRS = 5


In [3]:
# F(x): Initialize the data storage dictionary

def load_data(filename=DATA_FILE):
    """
    Loads data from a JSON file.

    If a file with the given name exists, its JSON content is parsed and returned.
    Otherwise a message is printed and an empty dictionary is returned, so callers
    can start from a blank storage without special-casing the first run.

    Args:
        filename (str, optional): The name of the file to load data from. Defaults to DATA_FILE.

    Returns:
        dict: The loaded data if the file exists, otherwise an empty dictionary.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    if not os.path.exists(filename):
        print(f"{filename} not found. Initializing empty dictionary...")
        return {}

    print(f"{filename} found. Loading data...")
    # Read as UTF-8 explicitly: without it, open() falls back to the platform's
    # locale encoding, which can mis-decode non-ASCII text on some systems.
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_data(data, filename=DATA_FILE):
    """
    Serializes `data` as JSON and writes it to `filename`.

    The target file is opened in write mode, so any previous content of the
    file is replaced by the new JSON document.

    Args:
        data (dict): The object to serialize.
        filename (str, optional): Path of the file to write. Defaults to DATA_FILE.
    """
    with open(filename, 'w') as out_handle:
        json.dump(data, out_handle)

In [4]:
# F(x): Extract ICD probabilities from tokens

def extract_icd_probabilities(logprobs, debug=False):
    """
    Extracts ICD-10 codes and their associated probabilities from a list of tokens and log probabilities.

    The token list is scanned from left to right. At every token that has visible
    (non-whitespace) content, windows of 5 down to 1 consecutive tokens are joined,
    stripped, and tested against the ICD-10 patterns; the longest matching window wins.
    Whitespace-only tokens at the end of the winning window (for example the newline
    separating two codes) are removed before the statistics are computed, and the scan
    resumes after the matched tokens. As a result every code is reported exactly once
    and separator tokens never contribute to its mean probability.

    Recognized code shapes (the third character may be a digit or a letter):
        - 'ANN'      (e.g., 'A10')
        - 'ANN.NNN'  (e.g., 'A10.001', one to three digits after the dot)
        - 'ANN.NNNA' (e.g., 'A10.001A', the trailing letter requires three digits before it)

    Args:
        logprobs (list): A list of lists, where each inner list contains a token and its associated log probability.
        debug (bool or int, optional): Any value > 0 prints the concatenated tokens and displays the
            final result; the value 2 additionally prints one line per scanned position and displays
            each winning token table. Defaults to False.

    Returns:
        list: A list of dictionaries in order of appearance, each with the keys
              `icd` (the code), `icd_linprob_mean` (mean of the exponentiated token log
              probabilities) and `logprobs` (a dictionary with the lists `token` and `logprob`).

    Raises:
        ValueError: If no ICD-10 code could be parsed from `logprobs`.
    """
    # Reference: https://www.webpt.com/blog/understanding-icd-10-code-structure
    # Compiled once per call instead of once per token position.
    icd_patterns = (
        re.compile(r"^[A-Z]\d[0-9A-Z]\.\d{3}[A-Z]$"),  # 'ANN.NNNA'
        re.compile(r"^[A-Z]\d[0-9A-Z]\.\d{1,3}$"),     # 'ANN.NNN'
        re.compile(r"^[A-Z]\d[0-9A-Z]$"),              # 'ANN'
    )
    # Longest run of tokens that is considered as a single code.
    max_window = 5

    tmp_df = pd.DataFrame(logprobs)
    # An empty input produces a frame without columns; treat it as "no tokens".
    tokens = [str(token) for token in tmp_df.iloc[:, 0]] if tmp_df.shape[1] else []
    n_tokens = len(tokens)
    if debug > 0:
        print(repr(''.join(tokens)))

    parsed_icds = []
    pos = 0
    while pos < n_tokens:
        # A code can only start on a token with visible content. Starting on a
        # whitespace-only token would match the same code a second time once the
        # window is stripped, and would drag the separator into its statistics.
        if not tokens[pos].strip():
            pos += 1
            continue

        # Check match from longest to shortest window
        winning_icd = None
        end = pos
        for width in range(min(max_window, n_tokens - pos), 0, -1):
            candidate = ''.join(tokens[pos:pos + width]).strip()
            if any(pattern.match(candidate) for pattern in icd_patterns):
                winning_icd = candidate
                end = pos + width
                break

        # [debug] Each line shows the widest window at this position and whether a code was found
        if debug == 2:
            print(
                str(pos).ljust(4),
                repr(''.join(tokens[pos:pos + max_window])).ljust(20),
                ('yes' if winning_icd else 'no').ljust(5)
                )

        if winning_icd is None:
            pos += 1
            continue

        # Drop whitespace-only tokens at the end of the window: they are separators,
        # not part of the code, and must not influence the mean probability.
        while end - 1 > pos and not tokens[end - 1].strip():
            end -= 1

        winning_df = tmp_df.iloc[pos:end].reset_index(drop=True)

        # [debug] Display the winning ICD-10 code and its associated data
        if debug == 2:
            print(f"**** {winning_icd} - VALID ICD ****")
            display(winning_df)

        # Convert log probabilities to linear probabilities and calculate the mean
        winning_mean = np.exp(winning_df.iloc[:, 1]).mean()

        # Package the ICD-10 code and associated data
        parsed_icds.append({
            'icd': winning_icd,
            'icd_linprob_mean': winning_mean,
            'logprobs': winning_df.rename(columns={0: 'token', 1: 'logprob'}).to_dict(orient='list')
        })

        # Continue after the tokens that were just consumed so they cannot be reused.
        pos = end

    # [debug] Display the parsed ICD-10 codes
    if debug > 0:
        display(parsed_icds)

    # Check if parsed_icds is empty
    if not parsed_icds:
        # If it is, raise an error and show the logprobs in question
        raise ValueError(f"No ICD-10 codes could be parsed from the provided logprobs: {logprobs}")

    return parsed_icds

# # Uncomment the following lines to test the function. 
# # `test` is an example of the `logprobs` field from the JSON data.
# test = [['A', -0.63648945],  ['09', -1.4643841], ['\n', -0.9866263], ['R', -0.6599979], ['50', -1.5362289],
#  ['.', -0.05481864],  ['9', -0.002321772], ['\n', -0.3524723], ['R', -0.56709456], ['11', -1.263591],
#  ['.', -0.05834798], ['0', -0.73551023], ['\n', -0.5051807], ['R', -0.65759194], ['63', -1.0282977],
#  ['.', -0.0006772888], ['4', -0.71002203]]

# test_output = extract_icd_probabilities(test)
# test_output

# # Uncomment to test a specific case
# extract_icd_probabilities(df.loc['24000015', 'logprobs'])


In [5]:
# Load JSON data and convert to dataframe.
# `load_data` returns an empty dict when DATA_FILE is missing. The transpose turns
# the top-level keys of the loaded dict into the row index — presumably one row
# per rowid; verify against the notebook that writes DATA_FILE.
data_storage = load_data()
df = pd.DataFrame(data_storage).T

response_validation_data_storage.json found. Loading data...


In [6]:
# # Finding extra-long outputs
# df[df.icds.apply(lambda x: len(x)) > 2].index

# # To speed up testing, we can limit rows with known abnormal output data
# df = df.loc[['14004747', '14002839', 
#              '14002323', '14001355', '14000201', '14005633',
#        '24000550', '24002181', '24000721', '24000129', '24000117', '24000186',
#        '14002203', '14006139', '24003520',
#        '14002421', '14009193', '24002598',
#        ]]
# df = df.sample(500)

In [7]:
# Extract ICD-10 codes and their associated probabilities as a new column.
# Each cell of `output_probs` holds the list of dictionaries returned by
# `extract_icd_probabilities`; a row from which no code can be parsed raises ValueError.
df['output_probs'] = df['logprobs'].apply(extract_icd_probabilities)

In [8]:
# # Showing `output_msg` that exceeds ICD length
# abnormal_output_df = df[df['output_msg'].apply(lambda x:len(x) > 8)][['output_msg']]
# print(f"{abnormal_output_df.shape[0]} rowids with output_msg exceeding normal ICD length")
# print("Example:")
# print(abnormal_output_df.head(5))
# # df[df['output_msg'].apply(lambda x:len(x) > 8)][['output_msg','icds','best_icd']]

In [9]:
# F(x): Given a list of parsed ICD dictionaries, flatten the codes and their probabilities into a one-dimensional Series

def output_icds_to_cols(value, pairs=PAIRS):
    """
    Flattens the parsed ICD-10 entries of one row into a fixed-length pandas Series.

    The entries are ordered by descending `icd_linprob_mean`, the `logprobs` detail
    is discarded, and the remaining (code, probability) pairs are laid out one after
    another: position 0 is the best code, position 1 its probability, position 2 the
    second-best code, and so on. The result always has `pairs * 2` positions; missing
    positions are filled with NaN and surplus entries are cut off.

    Args:
        value (list): A list of dictionaries with the keys `icd`, `icd_linprob_mean` and `logprobs`.
        pairs (int, optional): The number of (code, probability) pairs to keep. Defaults to PAIRS.

    Returns:
        pandas.Series: A one-dimensional Series indexed 0 .. pairs*2 - 1 that alternates
        ICD-10 codes and their probabilities.
    """
    flattened = (
        pd.DataFrame(value)                                      # one row per parsed code
        .sort_values(by="icd_linprob_mean", ascending=False)     # most probable code first
        .drop(columns=['logprobs'])                              # keep only code + probability
        .stack()                                                 # interleave code, prob, code, prob, ...
        .reset_index(drop=True)                                  # positions 0, 1, 2, ...
    )
    # Pad (or truncate) to exactly pairs*2 positions
    return flattened.reindex(range(pairs * 2))

# Test
# output_icds_to_cols(test_output)

In [10]:
# Map each flattened position to its export column name: even positions hold a
# code (cause{n}_icd10), the following odd position holds that code's probability
# (cause{n}_icd10_prob), with n counted from 1.
icd_column_names_mapping = {
    2 * pair + offset: f"cause{pair + 1}_icd10{suffix}"
    for pair in range(PAIRS)
    for offset, suffix in enumerate(("", "_prob"))
}

# Flatten every row's `output_probs` into PAIRS*2 columns, give those columns
# their cause{n} names, and attach them to the original rows by index.
parsed_df = df.merge(
    df['output_probs'].apply(output_icds_to_cols).rename(columns=icd_column_names_mapping),
    left_index=True,
    right_index=True,
)

In [11]:
# Takes usage and extracts the first 2 values into separate columns.
# For each row, `usage` is converted to a DataFrame and the second column of its
# first two rows is kept; those two values become columns 0 and 1, which are
# renamed below and merged back onto `parsed_df` by index.
# NOTE(review): this assumes the first two `usage` entries are the completion and
# prompt token counts, in that order — confirm against the stored data.
# NOTE(review): `parsed_df` is reassigned from itself, so re-running this cell
# without re-running the previous one merges the token columns a second time.
parsed_df = parsed_df.merge(
    parsed_df['usage'].apply(lambda x: pd.DataFrame(x).iloc[:2,1])
    .rename(columns={
        0: "output_usage_completion_tokens",
        1: "output_usage_prompt_tokens"
        }), left_index=True, right_index=True)

In [12]:
# Map the raw column names to the `output_*` names used in the exported file.
# Each source column is listed exactly once (a repeated dictionary key would be
# silently overwritten by its last occurrence).
column_mapping = {
    'model': 'output_model',
    'system_prompt': 'output_system_prompt',
    'user_prompt': 'output_user_prompt',
    'timestamp': 'output_created',
}

# Rename the columns using the mapping
parsed_df = parsed_df.rename(columns=column_mapping)


# Keep only the relevant columns, in export order: the row identifier, the
# cause{n} code/probability columns, then the output metadata.
export_parsed_df = parsed_df[
    ['rowid'] +
    list(icd_column_names_mapping.values()) +
    [
        'output_created',
        'output_model',
        'output_system_prompt',
        'output_user_prompt',
        'output_usage_completion_tokens',
        'output_usage_prompt_tokens',
        'output_msg',
        'output_probs'
    ]
]

In [17]:
# Save the parsed data to a CSV file (the JSON export below is currently disabled)

# export_parsed_df.to_json(JSON_EXPORT_FILE, orient='records')
# index=True also writes the DataFrame index as the first CSV column.
export_parsed_df.to_csv(CSV_EXPORT_FILE, index=True)

In [None]:
# F(x): Calculate the mean linear probability given a list of token and log probabilities.
def calculate_mean_exp(data):
    """
    Returns the mean linear probability of a token sequence.

    Args:
        data (list): A list of [token, logprob] pairs.

    Returns:
        float: The mean of the exponentiated log probabilities (second element of each pair).
    """
    logprob_column = pd.DataFrame(data).iloc[:, 1]
    linear_probs = np.exp(logprob_column)
    return np.mean(linear_probs)

# # Test
# quick_test_data = [
#     ["V", -0.80707335],
#     ["89", -0.5674744],
#     [".", -0.07485282],
#     ["2", -0.049951375],
# ]
# print(calculate_mean_exp(quick_test_data))