# 0. Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import evaluate
from openai import OpenAI

from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from IPython.display import display, Markdown

# 1. Dataset Selection

In [21]:
# Load the document
document_dir = "./data"
filename = "allergies-doc.pdf"
file_path = os.path.join(document_dir, filename)

# 2. Exploratory Data Analysis (EDA)

In [22]:
# Load the document as pages
pages = PyPDFLoader(file_path).load_and_split()

print(f"Loaded {len(pages)} pages from the document.")

# Display a sample to understand content and structure
print(pages[0].page_content, '\n\n')

Loaded 9 pages from the document.
Overview 
Millions of Americans have an allergy of some kind. You probably know one of those 
people or are one yourself. Almost 6% of U.S. adults and children have a food allergy. 
Food allergy symptoms are most common in babies and children, but they can appear at 
any age. You can even develop an allergy to foods you have eaten for years with no 
problems. 
 
Signs of Allergies 
The body’s immune system keeps you healthy by fighting off infections and other dangers 
to good health. A food allergy reaction occurs when your immune system overreacts to a 
food or a substance in a food, identifying it as a danger and triggering a protective 
response. 
While allergies tend to run in families, it is impossible to predict whether a child will inherit 
a parent’s food allergy or whether siblings will have a similar condition. Some research 
does suggest that the younger siblings of a child with a peanut allergy will also be allergic 
to peanuts. 
Symptoms 

# 3. Embedding and Storing Chunks

### 3.1 Embedding

In [None]:
API_KEY = ""

# Create the embeddings function
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", api_key = API_KEY)

# Create a text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=50)

### 3.2 Connection to Vector DB

In [24]:
# Split the document into chunks
chunks = text_splitter.split_documents(pages)

In [25]:
# Load it into Chroma
db = Chroma.from_documents(chunks, embeddings, persist_directory="./chroma_db")

print(chunks[0].page_content, '\n\n')

Overview 
Millions of Americans have an allergy of some kind. You probably know one of those 
people or are one yourself. Almost 6% of U.S. adults and children have a food allergy. 
Food allergy symptoms are most common in babies and children, but they can appear at 
any age. You can even develop an allergy to foods you have eaten for years with no 
problems. 
 
Signs of Allergies 
The body’s immune system keeps you healthy by fighting off infections and other dangers 
to good health. A food allergy reaction occurs when your immune system overreacts to a 
food or a substance in a food, identifying it as a danger and triggering a protective 
response. 




### 3.3 AI Frameworks

In [26]:
# Retrieve relevant documents based on the user's question.
user_question = input("Ask a question about allergies: ")
docs = db.similarity_search(user_question, k=3)

# Display Retrieved Results
for doc in docs[0:3]:
    print(doc.page_content, '\n')
    print()

Overview 
Millions of Americans have an allergy of some kind. You probably know one of those 
people or are one yourself. Almost 6% of U.S. adults and children have a food allergy. 
Food allergy symptoms are most common in babies and children, but they can appear at 
any age. You can even develop an allergy to foods you have eaten for years with no 
problems. 
 
Signs of Allergies 
The body’s immune system keeps you healthy by fighting off infections and other dangers 
to good health. A food allergy reaction occurs when your immune system overreacts to a 
food or a substance in a food, identifying it as a danger and triggering a protective 
response. 


Overview 
Millions of Americans have an allergy of some kind. You probably know one of those 
people or are one yourself. Almost 6% of U.S. adults and children have a food allergy. 
Food allergy symptoms are most common in babies and children, but they can appear at 
any age. You can even develop an allergy to foods you have eaten for y

# 4. Connection to LLM

In [27]:
# Build a function to create the context for the Prompt
def _get_document_context(docs):
    context = '\n'
    for doc in docs:
        context += '\nContext:\n'
        context += doc.page_content + '\n\n'
    return context

In [28]:
# Construct the prompt for the LLM including the context based on the results from the query
def generate_prompt(user_question, docs):
    prompt = f"""
    INTRODUCTION
    You are a knowledgeable assistant trained to answer questions about allergies, symptoms, and management strategies. Your responses should be clear, concise, and focused on accurate information.

    The user asked: "{user_question}"

    CONTEXT
    Technical documentation for allergies, symptoms, and management of allergen ingestion:
    '''
    {_get_document_context(docs)}
    '''

    RESTRICTIONS
    Always refer to products or allergens by their specific names as mentioned in the documentation.
    Stick to facts and provide clear, evidence-based responses; avoid opinions or interpretations.
    Only respond if the answer can be found within the context. If not, let the user know that the information is not available.
    Do not engage in topics outside allergies, symptoms, and related health matters. Avoid humor, sensitive topics, and speculative discussions.
    If the user’s question lacks sufficient details, request clarification rather than guessing the answer.

    TASK
    Provide a direct answer based on the user’s question, if possible.
    Guide the user to relevant sections of the documentation if additional context is needed.
    Format the response in Markdown format.

    RESPONSE STRUCTURE:
    '''
    # [Answer Title]
    [answer text]
    '''

    CONVERSATION:
    User: {user_question}
    Agent:
    """
    return prompt


In [29]:
prompt = generate_prompt(user_question, docs)

In [None]:
# Initialize an OpenAI Assistant
client = OpenAI(api_key = API_KEY)

messages = [{'role':'user', 'content':prompt}]
model_params = {'model': 'gpt-4o-mini', 'temperature': 0.4, 'max_tokens': 200}
completion = client.chat.completions.create(messages=messages, **model_params, timeout=120)


answer = completion.choices[0].message.content

print(f'User asked: {user_question}:\n')
display(Markdown(answer))

User asked: Tell me all about allergies:

From Model: gpt-4o-mini-2024-07-18:



```
# Overview of Allergies
Millions of Americans have an allergy of some kind, with nearly 6% of U.S. adults and children experiencing food allergies. Symptoms of food allergies are most common in babies and children but can develop at any age, even to foods that have been consumed without issues for years.

# Signs of Allergies
Allergic reactions occur when the immune system overreacts to a food or a substance in a food, identifying it as a danger and triggering a protective response. 

# Diagnosis
To diagnose food allergies, healthcare providers will consider:
- What and how much you ate
- How long it took for symptoms to develop
- What symptoms you experienced and how long they lasted

After taking your history, an allergist may order skin tests and/or blood tests to detect food-specific immunoglobulin E (IgE) antibodies. Skin-prick tests provide results in about 20 minutes, where a small amount of the food allergen is placed on the

# 5. Evaluation

In [31]:
# Load the evaluation data into a data frame
references = pd.read_csv("./data/References for Evaluation.csv")
references.head()  # Display the first few rows to verify loading

Unnamed: 0,Question,Answer
0,What are the most common food allergens?,"The most common food allergens include milk, e..."
1,Can you outgrow food allergies?,"Yes, children may outgrow allergies to milk, e..."
2,How is a food allergy diagnosed?,"Diagnosis involves a medical history review, s..."
3,What is anaphylaxis?,"Anaphylaxis is a severe, life-threatening alle..."
4,How can I prevent food allergies?,Prevention strategies include delaying the int...


In [32]:
# Divide both columns
references_questions = references["Question"]
references_answers = references["Answer"]

In [33]:
# Initialize an empty list to store results
generated_answers = []

for question in references_questions:
    # Retrieve relevant documents
    docs = db.similarity_search(question, k=3)
    # Construct the prompt for the LLM by calling the function
    prompt = generate_prompt(question, docs)
    # Generate response using the LLM
    messages = [{'role': 'user', 'content': prompt}]
    completion = client.chat.completions.create(messages=messages, **model_params, timeout=120)
    answer = completion.choices[0].message.content
    # Store the generated response
    generated_answers.append(answer)
    
print(f"Generated Response: {generated_answers}\n")




In [34]:
# Combine questions, expected answers, and generated answers into a DataFrame for easy analysis
evaluation_results = pd.DataFrame({
    "Question": references_questions,
    "Expected Answer": references_answers,
    "Generated Answer": generated_answers
})

# Display the evaluation results to review and compare
display(evaluation_results.head())

Unnamed: 0,Question,Expected Answer,Generated Answer
0,What are the most common food allergens?,"The most common food allergens include milk, e...",'''\n# Most Common Food Allergens\nThe most co...
1,Can you outgrow food allergies?,"Yes, children may outgrow allergies to milk, e...","'''\n# Can You Outgrow Food Allergies?\nYes, i..."
2,How is a food allergy diagnosed?,"Diagnosis involves a medical history review, s...",```\n# Food Allergy Diagnosis\nA food allergy ...
3,What is anaphylaxis?,"Anaphylaxis is a severe, life-threatening alle...",'''\n# What is Anaphylaxis?\nAnaphylaxis is a ...
4,How can I prevent food allergies?,Prevention strategies include delaying the int...,```\n# Preventing Food Allergies\nThe primary ...


In [38]:
# Load the ROUGE metric
rouge = evaluate.load('rouge')

# Calculate ROUGE scores for the generated answers against the expected answers
rouge_results = rouge.compute(
    predictions=generated_answers,
    references=references_answers,
    use_aggregator=True,
    use_stemmer=True
)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [39]:
# Display ROUGE results
print("ROUGE Evaluation Results:")
print(rouge_results)

ROUGE Evaluation Results:
{'rouge1': 0.2684883661078251, 'rouge2': 0.11999412021369513, 'rougeL': 0.21889558199231385, 'rougeLsum': 0.2244312674668529}
