In [None]:
import json
import os
import pickle as pkl
import random
from collections import defaultdict
from copy import deepcopy

import numpy as np
import pandas as pd

# Seed the stdlib RNG before the project helpers are imported, as the
# original cell did.
random.seed(72343)

from utils import *

In [None]:
# Generate HCPCS and CPT code dictionaries and embeddings.
# NOTE(review): the variable is named "hcpcs_level_ii" but the path points at
# cpt_codes.csv -- confirm which code set this file actually holds.
hcpcs_level_ii_codes_filepath = "../data/cpt_codes.csv"
all_hcpcs_filepath = "../data/Data_HCPCS.txt"

hcpcs_level_ii_df = pd.read_csv(hcpcs_level_ii_codes_filepath)
all_hcpcs_df = pd.read_csv(all_hcpcs_filepath)
# Align the second file's column names with the first one's.
all_hcpcs_df = all_hcpcs_df.rename(
    columns={"HCPCS": "Code", "DESCRIPTION": "Description"}
)

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it
# both inputs contribute their own 0-based labels, so cpt_df would carry
# duplicate index labels into generate_embeddings, which mutates it in place.
# This also matches the final cpt/icd concat further down.
cpt_df = pd.concat(
    [hcpcs_level_ii_df, all_hcpcs_df[["Code", "Description"]]],
    ignore_index=True,
)
# Text that gets embedded: "<code>: <description>".
cpt_df["combined"] = cpt_df["Code"] + ": " + cpt_df["Description"]

# create a code to description dictionary
# (when a code appears more than once, the last row wins)
cpt_code_to_desc = dict(zip(cpt_df["Code"], cpt_df["combined"]))

# generate embeddings
# modifies the dataframe in place
generate_embeddings(cpt_df, "combined")


# Build the ICD code -> description lookup and the ICD embeddings.
icd_filepath = "../data/icd10_data.csv"
icd_raw = pd.read_csv(icd_filepath)

# Add the "<code>: <description>" text column first, then rename "ICD Code"
# to "Code" so the frame shares its column name with the CPT frame.
icd_df = icd_raw.assign(
    combined=icd_raw["ICD Code"] + ": " + icd_raw["Description"]
).rename(columns={"ICD Code": "Code"})

# Map each code to its combined text (the last row wins on duplicates).
icd_code_to_desc = dict(zip(icd_df["Code"], icd_df["combined"]))

# Embed the combined text; the helper mutates icd_df in place.
generate_embeddings(icd_df, "combined")

# Combine the icd and cpt dataframes and drop every row with a missing value.
df = pd.concat([cpt_df, icd_df], ignore_index=True)
df = df.dropna()

# Create the output directory up front so that neither write below fails on a
# fresh checkout after the embeddings have already been computed.
os.makedirs("../processed_data", exist_ok=True)

df.to_parquet(
    "../processed_data/combined_codes_with_embeddings.parquet",
    index=False,
)

# create a combined code to description dictionary
# ICD entries take precedence over CPT entries when a code appears in both.
# The per-source dictionaries were built before the dropna() above, so they
# can still hold NaN codes or NaN descriptions; skip those so the JSON file
# never contains the non-standard `NaN` token.
code_to_desc = {
    code: desc
    for code, desc in {**cpt_code_to_desc, **icd_code_to_desc}.items()
    if pd.notna(code) and pd.notna(desc)
}

# save the code to description dictionary
with open("../processed_data/code_to_desc.json", "w") as f:
    json.dump(code_to_desc, f)

In [None]:
# Read the CPC exam medical coding questions from disk.
# The file is already preprocessed and holds three different question subsets.
questions_filepath = "../data/questions.json"

with open(questions_filepath, "r") as questions_fh:
    random_subsets = json.loads(questions_fh.read())

In [None]:
# User prompt template for questions without an accompanying note.
# Contains a single placeholder, {question_string}; it is passed below as
# `user_prompt_template`. How the placeholder is filled is decided inside the
# helper -- presumably str.format; confirm in utils.
prompt_001 = """
Following is a question related to Medical Coding from the CPC (Certified Professional Coder) certification exam. Please help me determine the correct answer from the given options based on AAPC guidelines and coding best practices. Provide an explanation under the heading EXPLANATION and the correct answer under the heading ANSWER.

Question: ```{question_string}```"""

# User prompt template for questions that come with a note. Placeholders:
# {note} and {question_string}; it is passed below as
# `user_prompt_template_with_note`.
# NOTE(review): the wording "related medical coding" is missing "to". It is
# left untouched because editing the prompt text would change model outputs.
prompt_for_notes_001 = """
Following is a question related medical coding from the CPC (Certified Professional Coder) certification exam. Based on provided note about a patient's visit, progress report or operative procedure, please help me determine the correct answer from the given options based on AAPC guidelines and coding best practices in a step by step manner. Provide an explanation under the heading EXPLANATION and the correct answer under the heading ANSWER.

Note: ```{note}```

Question: ```{question_string}```"""


runs = 5
for setnum, subset in enumerate(random_subsets):
    for run in range(runs):
        results_file = f"random_set_{setnum}_{run}.csv"
        results_dir = "../result_rag"

        await async_run_questions_from_json(
            test_questions=subset,
            results_file=results_file,
            results_dir=results_dir,
            user_prompt_template=prompt_001,
            user_prompt_template_with_note=prompt_for_notes_001,
            system_prompt=None,
            temperature=0,
        )

In [None]:
# Score each saved result file, then report the mean accuracy and the
# standard deviation (rounded to two decimals) across runs for every subset.
runs = 5
results_dir = "../result_rag"

for setnum, subset in enumerate(random_subsets):
    accuracies = []
    for run in range(runs):
        results_file = f"random_set_{setnum}_{run}.csv"
        results_path = os.path.join(results_dir, results_file)

        acc, results_df = eval_accuracy(
            results_path,
            eval_by_category=False,
            print_overall_accuracy=False,
        )
        accuracies.append(acc)

    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.round(np.std(accuracies), 2)
    print(f"Set {setnum}: {mean_accuracy} (+/-{std_accuracy})")