# Century Health Take Home

In [None]:
# imports

import json
import os
import pandas as pd

from tqdm import tqdm
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

### Extraction

In [2]:
# Load API key

load_dotenv()

API_KEY = os.getenv("API_KEY")
if not API_KEY:
    raise ValueError("API_KEY environment variable not set")

In [3]:
# Load in clinical notes

file_path = os.getcwd() 
raw_data = pd.read_csv(os.path.join(file_path, 'clinical_data.csv'))

In [4]:
# Extraction prompt

prompt_template = """
Extract the following clinical variables from the clinical note below:
- patient_name (string): The full name of the patient.
- age (integer): The age of the patient.
- gender (string): The gender of the patient.
- medical_record_number (integer): The medical record number of the patient.
- symptoms (list of strings): A list of symptoms reported by the patient. Make sure not to confuse with diagnoses and do not include irrelevant information. 
- diagnoses (list of strings): A list of diagnoses for the patient. Make sure not to confuse with symptoms and do not include irrelevant information. 
- medications (list of dictionaries): A list of medications, where each medication is a dictionary with keys "name" (string), "dosage" (string), and "frequency" (string).
- lab_results (dictionary): A dictionary of last a1c results, where the key is a1c (string) and value is the percentage a1c (string).
- next_steps (list of strings): A list of recommended next steps.

Clinical Note:
{clinical_note}

Return the extracted information in JSON format.
""" 


In [5]:
# Extract from LLM

model_responses = []

# Initialize the LLM
llm = ChatOpenAI(model="gpt-4o-mini", api_key=API_KEY)

# Create the prompt template
prompt = PromptTemplate(
    input_variables=["clinical_note"],
    template=prompt_template
)

for clinical_note in tqdm(raw_data.note, total=len(raw_data.note)):
    formatted_prompt = prompt.format(clinical_note=clinical_note)

    # Send the prompt to the LLM
    response = llm.invoke(formatted_prompt) 
    model_responses.append(response)

  llm = ChatOpenAI(model="gpt-4o-mini", api_key=API_KEY)
100%|██████████| 20/20 [01:47<00:00,  5.36s/it]


In [6]:
# Json and schema verification functions

def is_valid_json(json_string):
    try:
        json.loads(json_string)
        return True
    except json.JSONDecodeError:
        # If parsing fails, it's not valid JSON
        return False
    

def validate_extraction(extracted_json):
    """
    Validate the extracted JSON to ensure all fields are populated as expected.
    """
    if type(extracted_json) == str:
        extracted_json = json.loads(extracted_json)
    # Define expected fields and their types
    expected_schema = {
        "patient_name": str,
        "age": int,
        "gender": str,
        "medical_record_number": int,
        "symptoms": list,
        "diagnoses": list,
        "medications": list,
        "lab_results": dict,
        "next_steps": list
    }

    validated_json = {}

    for field, expected_type in expected_schema.items():
        if field in extracted_json:
            # Check if the value matches the expected type
            if isinstance(extracted_json[field], expected_type):
                validated_json[field] = extracted_json[field]
            else:
                # Handle type mismatch
                print(f"Warning: Field '{field}' has incorrect type. Expected {expected_type}, got {type(extracted_json[field])}.")
                validated_json[field] = "unknown"
        else:
            # Handle missing field
            print(f"Warning: Field '{field}' is missing.")
            validated_json[field] = "unknown"

    return validated_json



In [7]:
# Verify extractions 

valid_extractions = []
for response in model_responses:
    cleaned_output = response.content.strip("`").strip("json")
    if is_valid_json(cleaned_output):
        valid_json = validate_extraction(cleaned_output)      
    else:
        valid_json = validate_extraction({}) # Load fields with unknown if extraction returns anything but json
    
    valid_extractions.append(valid_json) 

In [8]:
# Store results to disk

def save_json_objects(file_path, json_objects):
    with open(file_path, 'w') as file:
        for obj in json_objects:
            json.dump(obj, file)
            file.write('\n')

output_file_path = "extractions.json"
save_json_objects(output_file_path, valid_extractions)

In [9]:
# Visualize extractions

df_extractions = pd.DataFrame(data=valid_extractions)
df_extractions.head(10)

Unnamed: 0,patient_name,age,gender,medical_record_number,symptoms,diagnoses,medications,lab_results,next_steps
0,John Smith,58,male,1234,"[ongoing fatigue, occasional blurred vision, m...","[Type 2 Diabetes, hypertension, early peripher...","[{'name': 'Metformin', 'dosage': '500 mg', 'fr...",{'a1c': '8.7%'},"[schedule a fasting glucose test in two weeks,..."
1,Linda Green,45,female,5678,"[persistent headaches, tingling in left foot, ...",[Type 2 Diabetes],"[{'name': 'Metformin', 'dosage': '500 mg', 'fr...",{'a1c': '7.9%'},"[Continue current medication regimen, Order ro..."
2,Michael Brown,62,male,9102,[],"[hypertension, Type 2 Diabetes]","[{'name': 'Metformin', 'dosage': '1000 mg', 'f...",{'a1c': '8.1%'},[Increase Lisinopril dose if BP remains elevat...
3,Sarah Johnson,50,female,3344,"[Frequent urination, increased thirst, anxiety...","[Type 2 Diabetes, hyperlipidemia]","[{'name': 'Glipizide', 'dosage': '5 mg', 'freq...",{'a1c': '7.6%'},"[Reinforce medication adherence, Consider swit..."
4,Carlos Ramirez,55,male,2211,"[ongoing fatigue, left knee pain, disrupted sl...","[Type 2 Diabetes, hypertension]","[{'name': 'Metformin', 'dosage': '500 mg', 'fr...",{'a1c': '9.2%'},[Increase Jardiance dosage if next labs still ...
5,Rebecca Lee,29,female,7788,"[rapid weight gain, frustration about constant...",[Type 2 Diabetes],"[{'name': 'Metformin', 'dosage': '500 mg', 'fr...",{'a1c': '8.0%'},"[Encourage diet and exercise modifications, Sc..."
6,Thomas Wilson,67,male,9890,"[mild fatigue, anxiety about buffet options, c...","[chronic kidney disease (CKD) stage 3, Type 2 ...","[{'name': 'Metformin', 'dosage': '500 mg', 'fr...",{'a1c': '8.5%'},"[Monitor renal function closely, Possibly cons..."
7,Emily Dawson,36,female,5566,"[increased thirst, frequent urination, mild an...",[Type 2 Diabetes],"[{'name': 'Metformin', 'dosage': '500 mg', 'fr...",{'a1c': '9.0%'},[Stress the importance of medication adherence...
8,Robert Kim,48,male,3030,[occasional blurry vision at night],"[Type 2 Diabetes, hyperlipidemia]","[{'name': 'Metformin', 'dosage': '1000 mg', 'f...",{'a1c': 'not available'},[Order full blood panel including fasting gluc...
9,Diane Carter,60,female,1212,"[increasingly tired, annoying humming sound fr...","[Type 2 Diabetes, mild depression]","[{'name': 'Sertraline', 'dosage': '50 mg', 'fr...",{'a1c': '8.4%'},[Consider adjusting insulin dose if postprandi...


### Evaluation

In [10]:
# Evaluation prompt

eval_prompt_template = """
    You are a clinical expert, and your task is to assess the accuracy of entity extraction as a score in a given text. You will be given a clinical note, field name and the extracted value.
    Please provide a numeric score on a scale from 0 to 1, where 1 being the best score and 0 being the worst score. Strictly use numeric values for scoring. 

    Clinical Note: {clinical_note}
    Field Name: {field_name}
    Extracted Value: {extracted_value}

    Provide the score as a float in the format Score: score, a list of hallucinations (if any) in the format Hallucinations: [hallucination], and a brief explanation.
    """

In [11]:
# Evaluation functions

eval_llm = ChatOpenAI(model="gpt-4o-mini", api_key=API_KEY)

def evaluate_with_llm_field(clinical_note, field_name, extracted_value):
    eval_prompt = PromptTemplate(
    input_variables=["clinical_note", "field_name" "extracted_value"],
    template=eval_prompt_template
    )
    formatted_prompt = eval_prompt.format(field_name=field_name, clinical_note=clinical_note, extracted_value=extracted_value)
    # Send the prompt to the LLM
    response = eval_llm.invoke(formatted_prompt)

    return response

def parse_evaluations(model_response):
    response_text = model_response.content
    score = float(response_text.split()[1].strip(','))  # Extract the first number as the score
    hallucinations = []
    explanation = ""

    # Parse hallucinations and explanation
    if "Hallucinations:" in response_text:
        hallucinations = response_text.split("Hallucinations:")[1].split("Explanation:")[0].strip().split(", ")
    
    if "Explanation:" in response_text:
        explanation = response_text.split("Explanation:")[1].strip()

    return score, hallucinations, explanation

In [12]:
# Evaluation extractions

output_columns = ["medical_record_#", "field", "extracted_value", "score", "hallucinations", "explanation"]
df_results = pd.DataFrame(columns=output_columns)
for clinical_note, extracted_json in tqdm(zip(raw_data.note, valid_extractions), total=len(raw_data.note)):
    results = {}
    concerned_fields = []
    for field_name, extracted_value in extracted_json.items():
        model_response = evaluate_with_llm_field(clinical_note, field_name, extracted_value)
        score, hallucinations, explanation = parse_evaluations(model_response)
        new_row = pd.DataFrame([{
            "medical_record_#": extracted_json["medical_record_number"],
            "field": field_name,
            "extracted_value": extracted_value,
            "score": score,
            "hallucinations": hallucinations,
            "explanation": explanation
        }])

        df_results = pd.concat([df_results, new_row], ignore_index=True)
    


  df_results = pd.concat([df_results, new_row], ignore_index=True)
100%|██████████| 20/20 [05:37<00:00, 16.90s/it]


In [13]:
# Visualize evaluations

df_results.to_csv('evaluation_results.csv')
df_results.head(20)

Unnamed: 0,medical_record_#,field,extracted_value,score,hallucinations,explanation
0,1234,patient_name,John Smith,1.0,"[[],]","The extracted value ""John Smith"" accurately ma..."
1,1234,age,58,1.0,[[]],"The extracted value of ""58"" accurately reflect..."
2,1234,gender,male,1.0,[[]],"The extracted value ""male"" accurately reflects..."
3,1234,medical_record_number,1234,1.0,[[]],The extracted value for the medical record num...
4,1234,symptoms,"[ongoing fatigue, occasional blurred vision, m...",1.0,"[[],]",The extracted value accurately captures the sy...
5,1234,diagnoses,"[Type 2 Diabetes, hypertension, early peripher...",1.0,[[]],"The extracted diagnoses of 'Type 2 Diabetes', ..."
6,1234,medications,"[{'name': 'Metformin', 'dosage': '500 mg', 'fr...",1.0,[[]],The extracted medications accurately reflect t...
7,1234,lab_results,{'a1c': '8.7%'},1.0,[[]],The extracted value accurately reflects the la...
8,1234,next_steps,"[schedule a fasting glucose test in two weeks,...",1.0,[[]],The extracted value accurately reflects the ne...
9,5678,patient_name,Linda Green,1.0,[[]],"The extracted value ""Linda Green"" matches the ..."


In [14]:
# Hallucination check

df_results[df_results['score'] != 1.0]

Unnamed: 0,medical_record_#,field,extracted_value,score,hallucinations,explanation
22,9102,symptoms,[],0.0,[[]],"The extracted value for ""symptoms"" is an empty..."
24,9102,medications,"[{'name': 'Metformin', 'dosage': '1000 mg', 'f...",0.9,[[]],The extracted value accurately captures the me...
58,9890,symptoms,"[mild fatigue, anxiety about buffet options, c...",0.7,"[['anxiety about buffet options', 'complained ...","The extracted value correctly includes ""mild f..."
67,5566,symptoms,"[increased thirst, frequent urination, mild an...",0.9,[[]],The extracted symptoms 'increased thirst' and ...
79,3030,lab_results,{'a1c': 'not available'},0.5,[[]],The extracted value states that the A1C result...
85,1212,symptoms,"[increasingly tired, annoying humming sound fr...",0.8,[[]],"The extracted symptoms include ""increasingly t..."
121,8833,symptoms,"[intermittent chest tightness, discomfort from...",0.8,[[]],The extracted value accurately includes 'inter...
139,3339,symptoms,"[minimal exercise since winter started, forgot...",0.5,[[]],"The extracted symptoms include ""minimal exerci..."


In [15]:
# Function to remove hallucinations

def remove_hallucinations(evaluation_df, extractions):
    """
    Remove hallucinations from extractions based on evaluation results.
    """
    cleaned_extractions = []
    for extraction in extractions:
        medical_record_number = extraction["medical_record_number"]

        # Filter evaluation results for this medical record
        record_evaluations = evaluation_df[evaluation_df["medical_record_#"] == medical_record_number]

        # Iterate through each field in the evaluation results
        for _, row in record_evaluations.iterrows():
            field = row["field"]
            score = row["score"]

            # If hallucinations exist for this field, replace the value with "unknown"
            if score != 1.0:
                extraction[field] = "unknown"
        
        cleaned_extractions.append(extraction)

    return extractions

In [16]:
# Hallucination replacement and flagging

cleaned_extractions = remove_hallucinations(df_results, valid_extractions)
output_file_path = "cleaned_extractions.json"
save_json_objects(output_file_path, cleaned_extractions)

### Summary

This notebook takes the given clinical notes, extracts medical info into fields that I deem as useful and then evaluates the extractions. Below is a description of the files you'll find in this project and the process I used to create and verify these files.

- **Running**
  - If you'd like to run the notebook you can create a .env file and set API_KEY=XXXX to your openAPI key. For this exercise I utilized my own personal key.

- **Output Data**
  - clinical_data.csv: Downloaded raw data of clinical notes
  - error.csv: Contains fields that the evaluation gave a score of less than 1.0
  - extractions.json: Raw extractions from LLM without evaluation changes
  - cleaned_extraction.json: Cleaned extraction based off of evaluation method. Replaces unsure fields with unknown
  
- **LLM-Based Approach**: Chose an LLM for extraction and evaluation due to its superior accuracy and flexibility compared to NER or regex methods. I utilized langchain with prompt templates for efficient prompt creation.

- **Extraction Method**: Used a mass prompt and prompt template for extraction, including the clinical note, field specifications, and return types to ensure consistency. I utilized a quantitative and qualitative approach in evaluation. The eval will return a score of 0-1 with 1 being a perfect extraction as well as an explanation of why the score was given.
- **JSON Verification**: Before storing the extractions, I verify that first a json is returned from the LLM and then verify the fields within the json and replace any missing fields with unknown
- **Selected Fields**: These fields were chosen based on their clinical relevance and utility for doctors. Extracted key fields:  
  - `patient_name`  
  - `age`  
  - `gender`  
  - `medical_record_number`  
  - `symptoms`  
  - `diagnoses`  
  - `medications`  
  - `lab_results`  
  - `next_steps`  
- **Evaluation Method**: Employed a per-field evaluation prompt, providing the field name, extracted value, and clinical note for the LLM to verify accuracy.  
- **Hallucination Handling**: Removed explicit hallucinated fields with during evaluation and replaced them with unknown. An example of that is Thomas Wilson in the dataset. He has anxiety about the buffet options that was extracted as part of his symptoms but it isn't relevant. I decided to replace symptoms as unknown. Depending of the business need this maybe sufficient or with more time we'd want to analyze these extractions further.
- **Limitations**:  
  - Scalability: Per-field evaluation can become computationally expensive with large datasets.  
  - Token Limits: Large notes may exceed API token limits, requiring truncation or chunking. 
  - Cost: Doing a LLM approach for both extraction and evaluation can be costly at scale

- **Potential Next Steps**: With only 2 hours to conduct this pipeline e2e, there are some of ideas that come to mind I would like to try if I had more time. 
  - Other LLMs for evaluation(gpt4o-large and deepseek)
  - Establish a manual ground truth for analysis to improve evaluation
  - Combo approaches with NER to verify simple extracted entities.
  - Remove well extracted fields from eval prompt if it's doing really well to improve evaluation efficiency.

- **Total time taken**: 2 Hours 11 minutes
