## Project on Extracting Data Using Tool Calling (OpenAI)
Note: the dataset `data/transcriptions.csv` is not included here because it comes from an online course, so the notebook cannot be run without supplying that file.

In [None]:
# Import the necessary libraries
import json
import os

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
# Load variables from a local .env file (if one exists) into the process
# environment so the API key does not have to be hard-coded in the notebook.
load_dotenv()

# Create client
# The key is passed explicitly so the value read here is the one actually used.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [None]:
# Load the data
df = pd.read_csv("data/transcriptions.csv")

# Print both previews explicitly: a bare expression is only rendered when it
# is the last statement of a notebook cell, so the first preview used to be
# silently discarded.
print(df.head())
print(df["transcription"].head())

In [None]:
# Tool schemas for the three extraction functions: age, medical specialty,
# and the recommended treatment.
#
# Each tool exposes exactly one argument. The extraction loops read that
# argument back by its property name ("age", "medical specialty",
# "recommended treatment"), so these keys must not be renamed without also
# updating those lookups.
#
# Every schema lists its single property under "required" and sets
# "additionalProperties" to False, so all three tools declare their argument
# as mandatory in the same way.

# Function 1 = Age
function_definition = [{
    "type": "function",
    "function": {
        "name": "get_age",
        "description": "This function extracts the age of the patient from the transcription column.",
        "parameters": {
            "type": "object",
            "properties": {
                "age": {"type": "integer", "description": "Age"},
            },
            "required": ["age"],
            "additionalProperties": False,
        },
    },
}]

# Function 2 = Medical Specialty
function_definition.append({
    "type": "function",
    "function": {
        "name": "get_specialty",
        "description": "This function extracts the medical specialty from the medical_specialty column",
        "parameters": {
            "type": "object",
            "properties": {
                "medical specialty": {"type": "string", "description": "medical specialty"},
            },
            "required": ["medical specialty"],
            "additionalProperties": False,
        },
    },
})

# Function 3 = Recommended Treatment
function_definition.append({
    "type": "function",
    "function": {
        "name": "get_treatment",
        "description": "This function extracts the recommended treatment from the transcription column.",
        "parameters": {
            "type": "object",
            "properties": {
                "recommended treatment": {"type": "string", "description": "recommended treatment"},
            },
            "required": ["recommended treatment"],
            "additionalProperties": False,
        },
    },
})

In [None]:
# Start with an empty DataFrame named df_structured; the extraction cells
# below add one column per extracted field.
df_structured = pd.DataFrame()

In [None]:
# Testing the first Function: extract the patient's age from every transcription
ages = []
for text in df["transcription"]:
    # The messages are rebuilt on every iteration because the user message
    # carries the transcription of the current row.
    row_messages = [
        {"role": "system", "content": "Your role here is to extract specific data from a given JSON text using the defined functions, like 'get_age', 'get_specialty', and 'get_treatment', to create a new JSON text with the desired extracted data"},
        {"role": "user", "content": text},
    ]
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=row_messages,
        tools=function_definition,
        # Force get_age: with "auto" the model may reply in plain text or pick
        # one of the other tools, leaving no "age" argument to read.
        tool_choice={"type": "function", "function": {"name": "get_age"}},
    )
    tool_calls = response.choices[0].message.tool_calls
    if not tool_calls:
        # No tool call came back; record a gap so the column stays aligned
        # with df instead of crashing the whole loop.
        ages.append(None)
        continue
    args = json.loads(tool_calls[0].function.arguments)
    ages.append(args.get("age"))
df_structured["age_extracted"] = ages
print(df_structured.head())

In [None]:
# Testing all Three Functions, Together


def _extract_argument(system_prompt, user_text, tool_name, argument_key):
    """Force one tool call and return a single argument from its payload.

    Args:
        system_prompt: Instruction sent as the system message.
        user_text: Text sent as the user message.
        tool_name: Name of the tool in ``function_definition`` to force.
        argument_key: Key to read from the tool call's JSON arguments.

    Returns:
        The value stored under ``argument_key``, or ``None`` when the
        response carries no tool call, the arguments are not valid JSON,
        or the key is absent.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_text},
        ],
        tools=function_definition,
        tool_choice={"type": "function", "function": {"name": tool_name}},
        temperature=0,
    )
    tool_calls = response.choices[0].message.tool_calls
    if not tool_calls:
        return None
    try:
        arguments = json.loads(tool_calls[0].function.arguments)
    except json.JSONDecodeError:
        # A malformed payload should cost one row, not the whole run.
        return None
    return arguments.get(argument_key)


ages = []
specialties = []
treatments = []
for specialty_text, transcription in zip(df["medical_specialty"], df["transcription"]):
    # Age and treatment are read from the transcription; the specialty is
    # read from the medical_specialty column.
    ages.append(_extract_argument(
        "Extract the patient's age (integer) in years.",
        transcription,
        "get_age",
        "age",
    ))

    # Extract Specialty
    specialties.append(_extract_argument(
        "Extract the medical specialty.",
        specialty_text,
        "get_specialty",
        "medical specialty",
    ))

    # Extract Treatment
    treatments.append(_extract_argument(
        "Extract the medical treatment suggested.",
        transcription,
        "get_treatment",
        "recommended treatment",
    ))

df_structured["age_extracted"] = ages
df_structured["specialty_extracted"] = specialties
df_structured["treatments_extracted"] = treatments
print(df_structured.head())
print(df_structured["specialty_extracted"].head(10))
print(df_structured["treatments_extracted"].head(10))

In [None]:
# Extracting the ICD Code
# Function Buildout
# Register get_icd only if it is not already present, so re-running this cell
# does not append a second tool with the same name.
if not any(tool["function"]["name"] == "get_icd" for tool in function_definition):
    function_definition.append({
        "type": "function",
        "function": {
            "name": "get_icd",
            "description": "This function searches up the respective ICD (Classification of Diseases) Code for the recommended treatment",
            "parameters": {
                "type": "object",
                "properties": {
                    "icd code": {"type": "string", "description": "icd code"},
                },
                "required": ["icd code"],
            },
        },
    })

# The system prompt is the same for every row, so build it once.
icd_system_prompt = (
    "You are a careful medical coding assistant. "
    "Input is a TREATMENT PLAN (meds/procedures). "
    "Infer the most likely diagnosis(es) that justify the plan and return ICD-10-CM code(s) only. "
    "Do NOT return CPT, RxNorm, ICD-10-PCS, or medication names. "
    "Prefer specific ICD-10-CM codes if the plan clearly implies an etiology; "
    "otherwise use a valid unspecified code. Respond via tool call only."
)

# for loop to extract ICD Code
icd = []

for treatment in df_structured["treatments_extracted"]:
    # The treatment column can hold None when the earlier extraction returned
    # no "recommended treatment" argument; there is nothing to code for such
    # a row, so keep the gap rather than sending a non-text message.
    if not isinstance(treatment, str) or not treatment.strip():
        icd.append(None)
        continue

    messages_icd = [
        {"role": "system", "content": icd_system_prompt},
        {"role": "user", "content": treatment},
    ]
    response_icd = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages_icd,
        tools=function_definition,
        tool_choice={"type": "function", "function": {"name": "get_icd"}},
        temperature=0,
    )
    tool_calls = response_icd.choices[0].message.tool_calls
    if not tool_calls:
        # Keep the list aligned with df_structured even without a tool call.
        icd.append(None)
        continue
    args = json.loads(tool_calls[0].function.arguments)
    icd.append(args.get("icd code"))

df_structured["icd_code"] = icd

print(df_structured.head())

Mistakes:

Mistake 1: I wrote out few-shot example messages (shown below) as response instructions. This is unnecessary: few-shot prompting is an alternative to tool calling, and this project already uses tool calling. See the explanation below.

messages = [{"role": "system", "content": "Your role here is to extract specific data from a given JSON text using the defined functions, like 'get_age', 'get_specialty', and 'get_treatment', to create a new JSON text with the desired extracted data"},
           {"role": "user", "content": """Please analyze the following data and extract the relevant information: {"index": "0",	"medical_specialty"	: "Allergy / Immunology",	"transcription": "SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,Lungs:  Clear.,ASSESSMENT:,  Allergic rhinitis.,PLAN:,1.  She will try Zyrtec instead of Allegra again.  Another option will be to use loratadine.  She does not think she has prescription coverage so that might be cheaper.,2.  Samples of Nasonex two sprays in each nostril given for three weeks.  A prescription was written as well."}"""},
           {"role": "assistant", "content": """{"age":"23","medical specialty": "Allergy / Immunology", "recommended treatment": "Try Zyrtec instead of Allegra again. Or use loratadine."}"""}]

Explained Further:
You’re mixing up few-shot prompting (where you give examples in the messages list) with tool-calling (where you already give the model a schema to fill).

1. With tool calling

When you define a tool like:

function_definition = [{
  "type": "function",
  "function": {
    "name": "get_age",
    "parameters": {
      "type": "object",
      "properties": {"age": {"type": "integer"}},
      "required": ["age"]
    }
  }
}]


…the model already knows it must give you an age. You don’t have to show it lots of prior Q&A examples; the schema is the example.

So you can keep your messages very simple:

row_messages = [
  {"role": "system", "content": "Return the patient's age in whole years. If unknown, use -1."},
  {"role": "user",   "content": text}  # <-- the transcription from THIS row
]

2. If you still want to give “demo” examples (few-shot)

Yes, you can append earlier examples into messages. For instance:

base_messages = [
  {"role": "system", "content": "Extract patient age."},
  {"role": "user", "content": "Pt is a 23-year-old female."},
  {"role": "assistant", "content": '{"age": 23}'}
]


Then, for each new row, you extend:

for text in df["transcription"]:
    row_messages = base_messages + [
        {"role": "user", "content": text}
    ]
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=row_messages,
        tools=function_definition,
        tool_choice={"type": "function", "function": {"name": "get_age"}},
        temperature=0
    )
    # parse args...


Notice: you don’t add new "assistant" examples during the loop (that would just teach it to parrot your answers). You just replay the base examples plus the current row’s text.

More Mistakes:

 Test Function, and Start Building Out Response Model

 This `row_messages` is the alternative to the few-shot prompting that I pursued earlier. Here `text` is a variable, so the same message template can be reused for every row inside the for loop I create later.

 row_messages = [{"role": "system", "content": "Your role here is to extract specific data from a given JSON text using the defined functions, like 'get_age', 'get_specialty', and 'get_treatment', to create a new JSON text with the desired extracted data"},
                 {"role": "user", "content": text}]

 I have to put this `row_messages` definition and the response call directly inside the for loop for it to work, because `text` only holds the current row's transcription inside the loop.

 response = client.chat.completions.create(
    model = "gpt-4o-mini",
    messages = row_messages,
    tools = function_definition,
    tool_choice = "auto"
)