In [2]:
# Import the necessary libraries
import os
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
import json
# Read key/value pairs from a local .env file (python-dotenv) before the key lookup
load_dotenv()

# Shared OpenAI client used by every extraction cell in this notebook;
# the API key is taken from the OPENAI_API_KEY environment variable.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [3]:
# Load the data
df = pd.read_csv("../data/synthetic_transcriptions_chaotic_v2.csv")

# Preview the free-text column that the extraction cells send to the model.
# Only a cell's final expression is rendered, so this is the single preview kept
# here (a bare `df.head()` placed before it would be evaluated and discarded).
df['transcription'].head()

0    SUBJECTIVE: sleep apnea.\n\nOBJECTIVE: AGE:27 ...
1    PREOPERATIVE DIAGNOSIS , urticaria.  POSTOPERA...
2    HPI: COPD flare. PMH: 89yo female here for eva...
3    PREOPERATIVE DIAGNOSIS: gastric  bypass follow...
4    PREOPERATIVE DIAGNOSIS , hypothyroidism. POSTO...
Name: transcription, dtype: object

In [4]:
# Writing out the three extraction functions. Age, medical specialty, and the recommended treatment
#
# Each entry is a tool (function-calling) schema passed to the chat completions
# API via `tools=`. Every schema marks its single argument as required and
# forbids extra properties, so a tool call is expected to carry the key that the
# extraction loops read back with `args.get(...)`.

function_definition = [
    # Function 1 = Age
    {
        "type": "function",
        "function": {
            "name": "get_age",
            "description": "This function extracts the age of the patient from the transcription column.",
            "parameters": {
                "type": "object",
                "properties": {
                    "age": {"type": "integer", "description": "Age"}
                },
                "required": ["age"],
                "additionalProperties": False,
            },
        },
    },
    # Function 2 = Medical Specialty
    {
        "type": "function",
        "function": {
            "name": "get_specialty",
            "description": "This function extracts the medical specialty from the medical_specialty column",
            "parameters": {
                "type": "object",
                "properties": {
                    "medical specialty": {"type": "string", "description": "medical specialty"}
                },
                "required": ["medical specialty"],
                "additionalProperties": False,
            },
        },
    },
    # Function 3 = Recommended Treatment
    {
        "type": "function",
        "function": {
            "name": "get_treatment",
            "description": "This function extracts the recommended treatment from the transcription column.",
            "parameters": {
                "type": "object",
                "properties": {
                    "recommended treatment": {"type": "string", "description": "recommended treatment"}
                },
                "required": ["recommended treatment"],
                "additionalProperties": False,
            },
        },
    },
]

In [5]:
# Start from an empty frame (no rows, no columns); the extraction cells
# attach their result columns to df_structured one at a time.
df_structured = pd.DataFrame(data=None)

In [6]:
# Testing the first Function
# All tool schemas are passed in `tools`, so the call is pinned to `get_age`.
# With tool_choice="auto" the model may answer in plain text (tool_calls is None)
# or call a different tool first, and the age cannot be read from tool_calls[0].
ages = []
for text in df["transcription"]:
    row_messages = [
        {"role": "system", "content": "Your role here is to extract specific data from a given JSON text using the defined functions, like 'get_age', 'get_specialty', and 'get_treatment', to create a new JSON text with the desired extracted data"},
        {"role": "user", "content": text},
    ]
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=row_messages,
        tools=function_definition,
        tool_choice={"type": "function", "function": {"name": "get_age"}},
        temperature=0,  # keep the extraction as repeatable as possible
    )
    tool_calls = response.choices[0].message.tool_calls
    # Record a missing value for this row instead of crashing when no tool call came back.
    args = json.loads(tool_calls[0].function.arguments) if tool_calls else {}
    ages.append(args.get("age"))

# One extracted age (or None) per transcription, in the original row order.
df_structured["age_extracted"] = ages
print(df_structured.head())

   age_extracted
0             27
1             52
2             89
3             84
4             85


In [7]:
# Testing all Three Functions, Together
def extract_field(text, system_prompt, tool_name, arg_key):
    """Run one forced tool call and return a single extracted argument.

    Parameters
    ----------
    text : str
        Cell value sent to the model as the user message.
    system_prompt : str
        Instruction sent as the system message.
    tool_name : str
        Name of the tool in ``function_definition`` that the model must call.
    arg_key : str
        Key to read from the tool call's JSON arguments.

    Returns
    -------
    The value stored under ``arg_key``, or ``None`` when ``text`` is not a
    non-empty string, when the response has no tool call, or when the key is
    absent from the arguments.
    """
    # Empty CSV cells are read by pandas as NaN (a float), which is not valid
    # message content, so skip the API call for those rows.
    if not isinstance(text, str) or not text.strip():
        return None

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        tools=function_definition,
        tool_choice={"type": "function", "function": {"name": tool_name}},
        temperature=0,
    )
    tool_calls = response.choices[0].message.tool_calls
    if not tool_calls:
        return None
    return json.loads(tool_calls[0].function.arguments).get(arg_key)


ages = []
specialties = []
treatments = []
for specialty_text, transcription_text in zip(df["medical_specialty"], df["transcription"]):
    # Extract Age
    ages.append(
        extract_field(
            transcription_text,
            "Extract the patient's age (integer) in years.",
            "get_age",
            "age",
        )
    )

    # Extract Specialty
    specialties.append(
        extract_field(
            specialty_text,
            "Extract the medical specialty.",
            "get_specialty",
            "medical specialty",
        )
    )

    # Extract Treatment
    treatments.append(
        extract_field(
            transcription_text,
            "Extract the medical treatment suggested.",
            "get_treatment",
            "recommended treatment",
        )
    )

# One value (or None) per source row, in the original row order.
df_structured["age_extracted"] = ages
df_structured["specialty_extracted"] = specialties
df_structured["treatments_extracted"] = treatments
print(df_structured.head())
print(df_structured["specialty_extracted"].head(10))
print(df_structured["treatments_extracted"].head(10))

   age_extracted         specialty_extracted  \
0             27  Cardiovascular / Pulmonary   
1             52        Allergy / Immunology   
2             89  Cardiovascular / Pulmonary   
3             84                  Bariatrics   
4             85               Endocrinology   

                                treatments_extracted  
0  OTC meds, order sleep study, discuss CPAP, wei...  
1                              fexofenadine 180mg qd  
2  duonebs, steroids, consider antibiotics if spu...  
3  advance diet as tolerated; vitamin supplementa...  
4  levothyroxine start/adjust; recheck labs in 6-...  
0    Cardiovascular / Pulmonary
1          Allergy / Immunology
2    Cardiovascular / Pulmonary
3                    Bariatrics
4                 Endocrinology
5                   Dermatology
6                       Urology
7                     Neurology
8                     Neurology
9              Gastroenterology
Name: specialty_extracted, dtype: object
0    OTC meds, order

In [8]:
# Extracting the ICD Code
# Function Buildout
icd_tool_definition = {
    "type": "function",
    "function": {
        "name": "get_icd",
        "description": "This function searches up the respective ICD (Classification of Diseases) Code for the recommended treatment",
        "parameters": {
            "type": "object",
            "properties": {
                "icd code": {"type": "string", "description": "icd code"}
            },
            # The ICD loop reads "icd code" from the tool arguments, so require it.
            "required": ["icd code"],
            "additionalProperties": False,
        },
    },
}

# Drop any `get_icd` entry left by a previous run of this cell before adding the
# current one, so re-running does not pile up duplicate tools in the shared list.
function_definition[:] = [
    tool for tool in function_definition
    if tool["function"]["name"] != "get_icd"
]
function_definition.append(icd_tool_definition)


# for loop to extract ICD Code
# The system prompt is identical for every row, so build it once outside the loop.
icd_system_prompt = (
    "You are a careful medical coding assistant. "
    "Input is a TREATMENT PLAN (meds/procedures). "
    "Infer the most likely diagnosis(es) that justify the plan and return ICD-10-CM code(s) only. "
    "Do NOT return CPT, RxNorm, ICD-10-PCS, or medication names. "
    "Prefer specific ICD-10-CM codes if the plan clearly implies an etiology; "
    "otherwise use a valid unspecified code. Respond via tool call only."
)

icd = []

for treatment in df_structured["treatments_extracted"]:
    # `treatments_extracted` is filled from `args.get(...)`, so an entry can be
    # None; that is not valid message content, so record a missing code and
    # skip the API call for that row.
    if not isinstance(treatment, str) or not treatment.strip():
        icd.append(None)
        continue

    messages_icd = [
        {"role": "system", "content": icd_system_prompt},
        {"role": "user", "content": treatment},
    ]
    response_icd = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages_icd,
        tools=function_definition,
        tool_choice={"type": "function", "function": {"name": "get_icd"}},
        temperature=0,
    )
    tool_calls = response_icd.choices[0].message.tool_calls
    # Record a missing value for this row instead of crashing when no tool call came back.
    args = json.loads(tool_calls[0].function.arguments) if tool_calls else {}
    icd.append(args.get("icd code"))

# One ICD code (or None) per row of df_structured, in the same order.
df_structured["icd_code"] = icd

print(df_structured.head())

   age_extracted         specialty_extracted  \
0             27  Cardiovascular / Pulmonary   
1             52        Allergy / Immunology   
2             89  Cardiovascular / Pulmonary   
3             84                  Bariatrics   
4             85               Endocrinology   

                                treatments_extracted icd_code  
0  OTC meds, order sleep study, discuss CPAP, wei...   G47.30  
1                              fexofenadine 180mg qd    J30.9  
2  duonebs, steroids, consider antibiotics if spu...    J44.9  
3  advance diet as tolerated; vitamin supplementa...    E66.9  
4  levothyroxine start/adjust; recheck labs in 6-...    E03.9  
