In [1]:
"""
    This script splits the JSON into multiple files according to ROWID_SPLITS, used to separate round 1 and round 2 data.
"""

'\n    This script splits the JSON into multiple files according to ROWID_SPLITS, used to separate round 1 and round 2 data.\n'

In [2]:
import os
import pandas as pd
import numpy as np
import json
import re
import logging

logging.basicConfig(level=logging.INFO)

# Working directory that holds the parsed input CSVs; the export directory is nested inside it.
WORKING_DIR = "_working_data_240315"
EXPORT_DIR = f"./{WORKING_DIR}/healsl_rd1to2_rapid_gpt_v2b_2024_03_15"

# Create the export directory if it does not exist.
# exist_ok=True keeps this safe if the directory appears between the check and the call.
if not os.path.exists(EXPORT_DIR):
    print(f"Creating directory: {EXPORT_DIR}")
    os.makedirs(EXPORT_DIR, exist_ok=True)

# Show working files: every "03_*" file in the working directory, excluding sampled subsets.
# The directory is derived from WORKING_DIR so the listing and the later reads stay in sync.
files = [file for file in os.listdir(f"./{WORKING_DIR}") if file.startswith('03_') and 'sampled' not in file]
display(files)

# NOTE(review): the exploratory cell that calls pd.read_csv(IMPORT_CSV_FILE) needs this
# constant; it is currently disabled, so that cell fails on a fresh kernel until a path is set.
# IMPORT_CSV_FILE = "./_working_data_240315/all_data_gpt4_0313_parsed_first_ICD.csv"

VERSION = "v2b"

# Export filename template. VERSION is filled in here; the ROUND, MODELNAME and {OPTIONS}
# placeholders are substituted per input file when exporting.
export_filename_template = f"healsl_ROUND_rapid_MODELNAME_{VERSION}{{OPTIONS}}.csv"



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


['03_(all)_gpt4_0313_parsed_sorted_ICD.csv',
 '03_(all)_gpt3_0309_parsed_first_ICD.csv',
 '03_(all)_gpt3_0309_parsed_sorted_ICD.csv',
 '03_(all)_gpt4_0313_parsed_first_ICD.csv']

In [4]:
# For every working file: derive the export name from the source filename, then write
# the round 1 ("rd1") and round 2 ("rd2") rows to separate CSV files in EXPORT_DIR.
for filename in files:
    # The model name (e.g. "gpt4") is embedded in the source filename.
    model_match = re.search(r'gpt\d', filename)
    if model_match is None:
        # Without a model name no export filename can be built; skip instead of crashing
        # with an AttributeError on `None.group`.
        logging.warning(f"Skipping file without a model name (gpt<digit>) in its name: {filename}")
        continue
    modelname = model_match.group(0)

    # Which ICD parsing variant the file holds, if any (None when the marker is absent).
    first_icd_option = "first_ICD" if "first_ICD" in filename else None
    sorted_icd_option = "sorted_ICD" if "sorted_ICD" in filename else None

    # first_ICD takes precedence over sorted_ICD. When neither marker is present the suffix
    # is empty: str.replace() raises TypeError if given None as the replacement.
    icd_option = first_icd_option or sorted_icd_option
    final_option = f"_{icd_option}" if icd_option else ""

    export_filename = export_filename_template.replace("MODELNAME", modelname).replace("{OPTIONS}", final_option)
    export_filename = f"{EXPORT_DIR}/{export_filename}"

    import_csv_file = f"./{WORKING_DIR}/{filename}"
    logging.info(f"Reading file: {import_csv_file}")
    logging.info(f"model name: {modelname}")
    logging.info(f"first icd option: {first_icd_option}")
    logging.info(f"sorted icd option: {sorted_icd_option}")
    df = pd.read_csv(import_csv_file)

    # Split the data into round 1 and round 2
    rd1_df = df[df['round'] == "rd1"]
    rd2_df = df[df['round'] == "rd2"]

    # Rows whose `round` is neither "rd1" nor "rd2" end up in no export; make that visible.
    unassigned_rows = len(df) - len(rd1_df) - len(rd2_df)
    if unassigned_rows:
        logging.warning(f"{unassigned_rows} row(s) in {import_csv_file} have a round other than rd1/rd2 and are not exported")

    # Export the data
    round1_filename = export_filename.replace("ROUND", "rd1")
    round2_filename = export_filename.replace("ROUND", "rd2")

    logging.info(f"Exporting round 1 data to: {round1_filename}")
    rd1_df.to_csv(round1_filename, index=False)

    logging.info(f"Exporting round 2 data to: {round2_filename}")
    rd2_df.to_csv(round2_filename, index=False)



INFO:root:Reading file: ./_working_data_240315/03_(all)_gpt4_0313_parsed_sorted_ICD.csv
INFO:root:model name: gpt4
INFO:root:first icd option: None
INFO:root:sorted icd option: sorted_ICD
INFO:root:Exporting round 1 data to: ./_working_data_240315/healsl_rd1to2_rapid_gpt_v2b_2024_03_15/healsl_rd1_rapid_gpt4_v2b_sorted_ICD.csv
INFO:root:Exporting round 2 data to: ./_working_data_240315/healsl_rd1to2_rapid_gpt_v2b_2024_03_15/healsl_rd2_rapid_gpt4_v2b_sorted_ICD.csv
INFO:root:Reading file: ./_working_data_240315/03_(all)_gpt3_0309_parsed_first_ICD.csv
INFO:root:model name: gpt3
INFO:root:first icd option: first_ICD
INFO:root:sorted icd option: None
INFO:root:Exporting round 1 data to: ./_working_data_240315/healsl_rd1to2_rapid_gpt_v2b_2024_03_15/healsl_rd1_rapid_gpt3_v2b_first_ICD.csv
INFO:root:Exporting round 2 data to: ./_working_data_240315/healsl_rd1to2_rapid_gpt_v2b_2024_03_15/healsl_rd2_rapid_gpt3_v2b_first_ICD.csv
INFO:root:Reading file: ./_working_data_240315/03_(all)_gpt3_0309_pa

In [None]:
# Load the CSV at IMPORT_CSV_FILE into `df` for the ad-hoc inspection cells that follow.
# NOTE(review): IMPORT_CSV_FILE is not assigned in any visible cell (its only definition is
# commented out in the configuration cell), so this raises NameError on a fresh kernel —
# confirm the intended input path before running.
df = pd.read_csv(IMPORT_CSV_FILE)

In [None]:
# Write the round 1 and round 2 rows of `df` to their own CSV files in EXPORT_DIR.
is_round1 = df['round'] == "rd1"
is_round2 = df['round'] == "rd2"
df.loc[is_round1].to_csv(f"{EXPORT_DIR}/healsl_rd1_rapid_gpt3_v2b.csv", index=False)
df.loc[is_round2].to_csv(f"{EXPORT_DIR}/healsl_rd2_rapid_gpt3_v2b.csv", index=False)



In [None]:
# Preview the first ten records whose icd10_count exceeds 2000.
df.loc[df["icd10_count"] > 2000].head(10)

In [None]:
# List the rowid of every record whose icd10_count exceeds 2000.
df.loc[df["icd10_count"] > 2000, "rowid"].tolist()


In [None]:
# Tools: Re-process df, eval from str -> list
import ast

# Parse only values that are still strings, so re-running this cell is a no-op instead of
# failing when ast.literal_eval is handed an already-parsed object. Non-string values
# (already parsed, or missing) are left untouched.
df["output_probs"] = df["output_probs"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# Tool: Count the number of elements in the list

# Number of entries in each record's output_probs list.
icd_counts = df.output_probs.apply(len)

# Fixed bin edges, extended with the largest observed count when it exceeds the last edge.
# The upper edge must come from the counts themselves (not the number of rows): edges that
# do not increase monotonically make the binning raise ValueError, and counts above the
# last edge would otherwise fall outside every bin.
bin_edges = [0,1,2,3,4,5,6,50,100,500,1000,3000]
largest_count = icd_counts.max()  # NaN for an empty frame; the comparison below is then False
if largest_count > bin_edges[-1]:
    bin_edges.append(int(largest_count))

print("Number of ICDs returned per record, binned:")
icd_counts.value_counts(bins=bin_edges, sort=False)