# Acronym Expander

Author: Damian Curran

This is the main notebook for the paper "Optimization and deployment challenges of closed-source LLMs for clinical note abbreviation expansion". See readme.MD for more details.


#### Imports

In [None]:
from utils.tools import clear_predictions, get_accuracy, copy_corrections, get_full_set, get_main_set, get_dev_set, get_mapper, add_baseline, hard_list
from utils.inferences import run_inferences

#### Load data & set API

In [None]:
# Read the key with getpass rather than input(): input() echoes what is typed,
# so the secret can end up visible in the notebook's saved cell output.
from getpass import getpass

# strip() drops stray whitespace/newlines that often come along when pasting.
api_key = getpass("Enter OpenAI API key:").strip()

In [None]:
# Load the pre-processed dataset and derive the evaluation subsets from it.
full_set = get_full_set(path='./data/preprocessed_dataset_window_10.csv')
main_set = get_main_set(full_set)
small_set = get_dev_set(main_set, divider=10)
mapper = get_mapper('./data/labeled_sf_lf_map_DC.csv')

# hard_set keeps only the rows of main_set whose index is listed in hard_list.
is_hard = main_set.index.isin(hard_list)
hard_set = main_set[is_hard]

# Select the evaluation set: leave exactly ONE of the assignments below
# uncommented (and the others commented out), matching the experiment to run.
# ------------------------------
# data = hard_set # Experiment 1
# data = small_set # Experiment 2
data = main_set # Experiment 3
# ------------------------------

# NOTE(review): presumably adds the baseline predictions to `data` in place,
# since the return value is discarded -- confirm in utils.tools.
add_baseline(data)

# `limit` is passed to run_inferences in the cells below; by default it spans
# the whole selected set.
limit = len(data)
print("Data length:", len(data))
print("Limit:     :", limit)

#### Run Inferences (Experiments 1, 2 and 3):

In [None]:
# Select the model: leave exactly ONE of the assignments below uncommented,
# matching the model being used in the experiment.
# ------------------------------
# model = 'text-davinci-002'
# model = 'text-davinci-003'
model = 'gpt-3.5-turbo'

# Prompt amendments are made manually in inferences.py before each new round
# of inferences.

clear_predictions(data)

# Indices of instances where a parsing error occurs in post-processing; the
# list is handed to run_inferences and inspected afterwards.
error_indices = []

# First pass over the selected data set.
data = run_inferences(
    data,
    mapper,
    api_key,
    error_indices,
    model=model,
    batch_size=5,
    limit=limit,
    verbose=True,
    save_freq=2000,
)

if error_indices:
    # Second pass: re-run only the flagged rows with batch_size=1, then copy
    # the corrected rows back into `data`.
    failed_rows = data.loc[error_indices]
    data_errors = run_inferences(
        failed_rows,
        mapper,
        api_key,
        error_indices,
        model=model,
        batch_size=1,
        limit=limit,
        verbose=False,
        save_freq=1000,
    )
    copy_corrections(data_errors, data)

# Kept as the final bare expression so the notebook displays its result.
get_accuracy(data)

#### Generate results from dataframe and save as .csv:

In [None]:
# Build the results object from the predictions held in `data` and the mapper.
# NOTE(review): nothing in this cell visibly writes a .csv, although the
# heading above says the results are saved -- confirm whether build_results
# does the saving itself.
from utils.tools import build_results
results = build_results(data,mapper)
# Final bare expression, so the notebook renders it as the cell output.
# Assumes `results` is a pandas DataFrame (first 43 rows shown) -- TODO confirm.
results.head(43)

#### Run Inferences on rare_set (Experiment 4):

In [None]:
# Load the rare set (./data/rare_data.csv) used for Experiment 4.
from utils.tools import get_full_set, get_rare_mapper
rare_set = get_full_set(path='./data/rare_data.csv',long=False)
# NOTE(review): get_rare_mapper is bound here WITHOUT being called, whereas the
# main `mapper` is the return value of get_mapper(...). Confirm whether this
# should be get_rare_mapper() (possibly with arguments) before relying on it.
rare_mapper = get_rare_mapper
# Rebinds the `limit` used by the earlier experiments so it spans the rare set.
limit = len(rare_set)

In [None]:
# Experiment 4: run the inferences over the rare set.
model = 'gpt-3.5-turbo'

clear_predictions(rare_set)

# Indices of instances where a parsing error occurs in post-processing.
error_indices = []

# Bind the returned frame back to `rare_set`, mirroring how the Experiment 1-3
# cell rebinds `data`. The retry, copy_corrections and get_accuracy below all
# read `rare_set`, so they must see the frame that actually holds the
# predictions even if run_inferences returns a new object.
rare_set = run_inferences(
    rare_set,
    rare_mapper,
    api_key,
    error_indices,
    model=model,
    batch_size=5,
    limit=limit,
    verbose=True,
    save_freq=2000,
)

# This cell used to bind the result to `data`; keep that name pointing at the
# same frame so any cell that still reads `data` afterwards keeps working.
data = rare_set

if error_indices:
    # Re-run only the flagged rows with batch_size=1, then copy the corrected
    # rows back into `rare_set`.
    data_errors = run_inferences(
        rare_set.loc[error_indices],
        rare_mapper,
        api_key,
        error_indices,
        model=model,
        batch_size=1,
        limit=limit,
        verbose=True,
        save_freq=1000,
    )
    copy_corrections(data_errors, rare_set)

# Kept as the final bare expression so the notebook displays its result.
get_accuracy(rare_set)