In [4]:
# Import the necessary libraries
import json
import os

import openai
import pandas as pd
from openai import OpenAI
from tenacity import retry,stop_after_attempt, wait_random_exponential
from tenacity import retry_if_exception_type

In [5]:
# Read the medical transcription samples and preview the first five rows
df = pd.read_csv("mtsamples.csv")
print(df.head(n=5))

   Unnamed: 0                                        description  \
0           0   A 23-year-old white female presents with comp...   
1           1           Consult for laparoscopic gastric bypass.   
2           2           Consult for laparoscopic gastric bypass.   
3           3                             2-D M-Mode. Doppler.     
4           4                                 2-D Echocardiogram   

             medical_specialty                                sample_name  \
0         Allergy / Immunology                         Allergic Rhinitis    
1                   Bariatrics   Laparoscopic Gastric Bypass Consult - 2    
2                   Bariatrics   Laparoscopic Gastric Bypass Consult - 1    
3   Cardiovascular / Pulmonary                    2-D Echocardiogram - 1    
4   Cardiovascular / Pulmonary                    2-D Echocardiogram - 2    

                                       transcription  \
0  SUBJECTIVE:,  This 23-year-old white female pr...   
1  PAST MEDICAL 

In [6]:
# OpenAI client and model configuration
model = "gpt-4o-mini"
# The API key is taken from the OPENAI_KEY environment variable
# (os.environ.get yields None when the variable is not set).
client = OpenAI(api_key=os.environ.get("OPENAI_KEY"))

In [7]:
# System guidelines
# Sent as the "system" message of every request. Adjacent string literals are
# concatenated by Python, so this is still a single-line prompt at runtime.
system_prompt = (
    "You are a highly knowledgeable and efficient assistant designed to extract "
    "specific and relevant information from medical transcripts. "
    "Your primary goal is to identify key medical information accurately and concisely. "
    "Ensure the information is free of any irrelevant details, maintains "
    "confidentiality and respects patient privacy. "
    "Don't make assumptions about what values to plug into functions. "
    "Don't make up values to fill the response with."
)

In [8]:
# Defining function calling for the completions endpoint
# Two tools are exposed to the model; their names and parameter keys are read
# back by the extraction code, so they must stay in sync with it.
function_definition = [
    {
        "type": "function",
        "function": {
            "name": "extract_medical_info",
            "description": "Extract the medical information from the input medical transcript.",
            "parameters": {
                "type": "object",
                "properties": {
                    "medical_speciality": {
                        "type": "string",
                        "description": "The medical speciality the transcript belongs to.",
                    },
                    "age": {
                        "type": "string",
                        "description": "The age of the patient.",
                    },
                    "recommended_treatment": {
                        "type": "string",
                        "description": "The recommended treatment of the patient.",
                    },
                },
                "required": ["medical_speciality", "age", "recommended_treatment"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "find_icd_code",
            "description": "Return the International Classification of Diseases (ICD) code corresponding to the disease in the input medical transcript. Return NA if no ICD code can be determined.",
            "parameters": {
                "type": "object",
                "properties": {
                    "disease": {
                        "type": "string",
                        "description": "The patient's disease.",
                    },
                    "icd_code": {
                        "type": "string",
                        "description": "The ICD code of the medical disease.",
                    },
                },
                "required": ["disease", "icd_code"],
            },
        },
    },
]

In [9]:
# The chat completions function to extract required info

# Transient API failures that are worth retrying with exponential backoff.
_RETRYABLE_ERRORS = (
    openai.RateLimitError,
    openai.APIConnectionError,
    openai.InternalServerError,
)


@retry(
    retry=retry_if_exception_type(_RETRYABLE_ERRORS),
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    reraise=True,
)
def _request_extraction(input_text):
    """Send one transcript to the chat completions endpoint and return the response.

    Only this single request is retried (up to 6 attempts, random exponential
    backoff between 1 and 60 seconds), and only for the errors listed in
    ``_RETRYABLE_ERRORS``. With ``reraise=True`` the last underlying exception
    is raised once the attempts are exhausted.
    """
    return client.chat.completions.create(
        model=model,
        max_tokens=200,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": input_text},
        ],
        tools=function_definition,
        temperature=0,
    )


def _tool_arguments_by_name(message):
    """Map each tool call's function name to its decoded JSON arguments."""
    return {
        call.function.name: json.loads(call.function.arguments)
        for call in (message.tool_calls or [])
    }


def extract_info(dataframe):
    """Extract structured medical information from every row of ``dataframe``.

    Each row must provide the ``medical_specialty``, ``description`` and
    ``transcription`` columns. One chat-completion request is made per row.

    Args:
        dataframe: pandas DataFrame of medical transcripts.

    Returns:
        A list of dicts with the keys ``medical_specialty``, ``patient_age``,
        ``disease``, ``icd_code`` and ``recommended_treatment``. Rows for which
        the model did not call both tools, or for which the request or parsing
        failed, are reported on stdout and left out of the list.
    """
    extracted_data = []

    for index, row in dataframe.iterrows():
        input_text = f"Medical Specialty: {row['medical_specialty']}\nDescription: {row['description']}\nTranscription: {row['transcription']}"

        try:
            response = _request_extraction(input_text)
            choice = response.choices[0]

            if choice.finish_reason != 'tool_calls':
                print(f"Row {index}: skipped, finish_reason was '{choice.finish_reason}'.")
                continue

            # Look tool calls up by name: the model does not guarantee the
            # order (or number) of the tool calls it returns.
            arguments = _tool_arguments_by_name(choice.message)
            extracted_info = arguments.get("extract_medical_info")
            icd_info = arguments.get("find_icd_code")
            if extracted_info is None or icd_info is None:
                print(f"Row {index}: skipped, model called only {sorted(arguments)}.")
                continue

            extracted_data.append({
                "medical_specialty": extracted_info['medical_speciality'],
                "patient_age": extracted_info['age'],
                "disease": icd_info['disease'],
                "icd_code": icd_info['icd_code'],
                "recommended_treatment": extracted_info['recommended_treatment']
            })

        except openai.AuthenticationError as e:
            # Bad credentials fail identically for every row, so stop here
            # instead of issuing one doomed request per remaining row.
            print(f"OpenAI API failed to authenticate: {e}")
            break
        except openai.RateLimitError as e:
            # Reached only after the per-request retries were exhausted.
            print(f"OpenAI API request exceeded rate limit: {e}")
        except Exception as e:
            # Best effort: report the failing row and carry on with the next.
            print(f"Unable to generate a response. Exception: {e}")

    return extracted_data

In [None]:
# Run the extraction over the full dataset; extract_info calls the chat
# completions endpoint for each row of df, so this cell can take a while.
extracted_data = extract_info(df)

In [None]:
# Collect the extracted records (a list of dicts returned by extract_info)
# into a pandas DataFrame, one row per record
df_structured = pd.DataFrame(extracted_data)

In [None]:
# Export to a csv file; index=False leaves the DataFrame's row index out of the file
df_structured.to_csv("structured_transcriptions.csv", index=False)