# An Agentic System That Combines Structured and Unstructured Data Using the OpenAI Agents SDK

## Set up

In [None]:
%pip install PyPDF2 tqdm openai openai-agents -q
%pip install "pandasai>=3.0.0b2" -q
%pip install pandasai-openai -q
%pip install pandas -q

In [1]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import concurrent
import PyPDF2
import os
import pandas as pd
import base64
import asyncio
import uuid
from pydantic import BaseModel
from typing import Any
import json
import pandasai as pai
from pandasai_openai import OpenAI as pai_llm

In [None]:
import getpass

# Never paste real credentials into the notebook: values already present in the
# environment are kept, and missing ones are requested interactively so that no
# secret is stored in the saved file.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')
if not os.environ.get('OPENAI_ORG_ID'):
    os.environ['OPENAI_ORG_ID'] = input('OpenAI organization ID: ')

In [3]:
from openai import OpenAI
from openai.types.responses import ResponseContentPartDoneEvent, ResponseTextDeltaEvent
from agents import Agent, FileSearchTool, Runner, trace, WebSearchTool, AgentHooks, RunContextWrapper, RawResponsesStreamEvent, Tool, TResponseInputItem, function_tool

# Shared API client used by the file-upload and vector-store helpers below.
client = OpenAI(
    api_key=os.environ.get('OPENAI_API_KEY'),
    organization=os.environ['OPENAI_ORG_ID'],
)

## Upload PDFs to Vector Store for File Search

In [48]:
# Local folder that holds the PDF documents to be indexed.
dir_pdfs = 'data'

# Path of every entry in that folder whose name ends with '.pdf'.
pdf_files = [
    os.path.join(dir_pdfs, entry)
    for entry in os.listdir(dir_pdfs)
    if entry.endswith('.pdf')
]

def upload_single_pdf(file_path: str, vector_store_id: str):
    """Upload one local file and attach it to a vector store.

    Args:
        file_path: Path of the local file to upload.
        vector_store_id: ID of the vector store the uploaded file is attached to.

    Returns:
        ``{"file": <base name>, "status": "success"}`` when both API calls
        succeed, otherwise ``{"file": <base name>, "status": "failed",
        "error": <message>}``. Exceptions are caught, printed and reported in
        the returned dict rather than raised.
    """
    file_name = os.path.basename(file_path)
    try:
        # The context manager closes the handle even if the upload raises;
        # previously every call leaked an open file descriptor.
        with open(file_path, 'rb') as pdf_handle:
            file_response = client.files.create(file=pdf_handle, purpose="assistants")
        client.vector_stores.files.create(
            vector_store_id=vector_store_id,
            file_id=file_response.id
        )
        return {"file": file_name, "status": "success"}
    except Exception as e:
        print(f"Error with {file_name}: {str(e)}")
        return {"file": file_name, "status": "failed", "error": str(e)}

def upload_pdf_files_to_vector_store(vector_store_id: str):
    """Upload every '.pdf' file found in ``dir_pdfs`` to a vector store in parallel.

    Args:
        vector_store_id: ID of the vector store each file is attached to.

    Returns:
        A dict with the keys ``total_files``, ``successful_uploads``,
        ``failed_uploads`` and ``errors`` (the result dicts of the uploads
        whose status was not "success").
    """
    pdf_paths = []
    for entry in os.listdir(dir_pdfs):
        if entry.endswith('.pdf'):
            pdf_paths.append(os.path.join(dir_pdfs, entry))

    total = len(pdf_paths)
    succeeded = 0
    failed = 0
    errors = []

    print(f"{total} PDF files to process. Uploading in parallel...")

    # Up to ten uploads run concurrently; results are tallied as they finish.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        pending = [
            pool.submit(upload_single_pdf, path, vector_store_id)
            for path in pdf_paths
        ]
        for finished in tqdm(concurrent.futures.as_completed(pending), total=total):
            outcome = finished.result()
            if outcome["status"] == "success":
                succeeded += 1
            else:
                failed += 1
                errors.append(outcome)

    return {
        "total_files": total,
        "successful_uploads": succeeded,
        "failed_uploads": failed,
        "errors": errors,
    }

def create_vector_store(store_name: str) -> dict:
    """Create a vector store and return a short summary of it.

    Args:
        store_name: Name given to the new vector store.

    Returns:
        A dict with the keys ``id``, ``name``, ``created_at`` and
        ``file_count``; an empty dict if anything raised (the error is
        printed, not re-raised).
    """
    try:
        created = client.vector_stores.create(name=store_name)
        summary = dict(
            id=created.id,
            name=created.name,
            created_at=created.created_at,
            file_count=created.file_counts.completed,
        )
        print("Vector store created:", summary)
    except Exception as exc:
        print(f"Error creating vector store: {exc}")
        return {}
    return summary

In [49]:
# Create the vector store that will hold the benefits PDFs.
store_name = "ix_employees_benefits_store"
vector_store_details = create_vector_store(store_name=store_name)

Vector store created: {'id': 'vs_67d217c76f448191b05e722388198191', 'name': 'ix_employees_benefits_store', 'created_at': 1741821895, 'file_count': 0}


In [None]:
# Upload the local PDFs into the store created above; the returned stats dict is displayed as the cell output.
upload_pdf_files_to_vector_store(vector_store_id=vector_store_details["id"])

1 PDF files to process. Uploading in parallel...


100%|██████████| 1/1 [00:02<00:00,  2.22s/it]


{'total_files': 1, 'successful_uploads': 1, 'failed_uploads': 0, 'errors': []}

In [None]:
# Print the ID and name of each vector store returned by the API.
for store in client.vector_stores.list():
    print(f"{store.id} {store.name}")


vs_67d1e4fa314881919bb03637e3de09d9 ix_employees_benefits_store


In [4]:
# Use the store created earlier in this notebook so a fresh Restart & Run All works
# for every reader. To reuse an existing store instead, replace this with one of the
# IDs printed by the listing cell.
vector_store_id = vector_store_details["id"]

## Define the tool to search the employee dataset (CSV file)

In [5]:
@function_tool
def search_employee(criteria: str) -> str:
    """Search the employee dataset for the records most relevant to the given criteria.

    Args:
        criteria: Free-text description of the employee(s) to look for, for example a name.

    Returns:
        A JSON string. Tabular results are serialized with ``to_json()``; when
        nothing is found the payload is ``{"status": "success", "message":
        "No data found."}``; any other kind of answer is returned as
        ``{"status": "success", "result": <text>}``.
    """
    llm = pai_llm(api_token=os.getenv('OPENAI_API_KEY'))
    pai.config.set({"llm": llm})

    df_smart = pai.read_csv("data/employees.csv")
    llm_result = df_smart.chat(
        "Return all data in the dataset that have most relevant information of " + criteria + "."
    )

    # Read the payload defensively: the previous code dereferenced
    # `llm_result.value` before checking for None, so the check could never help.
    value = getattr(llm_result, "value", None)

    # `empty` is only present on pandas containers; anything else counts as non-empty.
    if value is None or getattr(value, "empty", False) is True:
        return json.dumps({"status": "success", "message": "No data found."})

    if hasattr(value, "to_json"):
        return value.to_json()

    # The answer may be plain text or a number rather than a table.
    return json.dumps({"status": "success", "result": str(value)})

# Pydantic model describing one employee record as five string fields.
# NOTE(review): not referenced anywhere else in the visible part of the notebook;
# presumably intended as a structured output type for an agent — confirm or remove.
class FinalResult(BaseModel):
    First_Name: str
    Last_Name: str
    Job_Title: str
    Department: str
    Employee_Status: str

## Create the agent that can leverage the appropriate tools based on the user's request

### Define the agent

In [6]:
# A single agent that is handed all three tools: PDF file search, the employee
# CSV lookup, and web search.
all_tools_agent = Agent(
    name="All Tools Agent",
    instructions="You are a helpful assistant that can leverage the appropriate tools based on the user's request.",
    tools=[
        FileSearchTool(
            vector_store_ids=[vector_store_id],
            max_num_results=3,
            include_search_results=True,
        ),
        search_employee,
        WebSearchTool(
            user_location={"type": "approximate", "city": "New York"},
        ),
    ],
)

### Run the agent

In [39]:
result = await Runner.run(
        all_tools_agent,
        input=f"Find the employee named as Doe John and tell me if he is eligible for the insurance benefit",
    )
print(result.final_output)

John Doe is a full-time employee in the Engineering department, working as a Software Engineer. To determine his insurance benefit eligibility, I need to check the specific conditions for coverage.

According to the information available, employees are generally eligible for coverage if they meet the following criteria:

1. Actively working the minimum required hours.
2. Performing the major duties of their regular occupation.
3. Having all required premiums paid.

Since John Doe is a full-time employee, it is likely that he meets these eligibility requirements. However, you might want to verify specifics related to any conditions or waivers that might apply .

If you have any additional conditions or specific plans, let me know, and I can assist further!


In [40]:
result = await Runner.run(
        all_tools_agent,
        input=f"Search the web for 'Guardian Group insurance dental plan' and tell me any recommendations to update the existing insurance benefit",
    )
print(result.final_output)

Here are a few recommendations based on the existing Guardian Group dental plan information, along with potential updates to enhance the benefits:

1. **Include Specialty Services**: Consider adding coverage for services related to Temporomandibular Joint (TMJ) dysfunctions. This can include diagnostic, non-surgical, and surgical treatments for TMJ and craniomandibular joint disorders.

2. **Expand Preventive Care**: Enhance preventive dental services by possibly increasing the frequency of covered services like cleanings and exams, which are currently limited.

3. **Flexible Payment Structures**: Review the payment rates for different service groups. Offering higher coverage rates for Group II and III services, or reducing deductibles could make the plan more attractive.

4. **Incorporate Additional Wellness Services**: Consider arranging selected services and discounts such as Vision Services or Comprehensive Employee Assistance Programs (EAP) for broader employee wellness.

Implemen

## Create multiple agents to orchestrate the response

### Define the agents

In [7]:
# Specialist agent restricted to searching the uploaded benefits documents.
file_search_agent = Agent(
    name="single file search",
    model="gpt-4o-mini",
    instructions="You are a helpful assistant that can answer questions about the insurance benefits.",
    tools=[
        FileSearchTool(
            vector_store_ids=[vector_store_id],
            max_num_results=3,
            include_search_results=True,
        ),
    ],
)

In [8]:
# Specialist agent whose only tool is web search.
web_search_agent = Agent(
    name="Web search agent",
    tools=[
        WebSearchTool(
            user_location={"type": "approximate", "city": "New York"},
        ),
    ],
    instructions="You are a helpful agent that can search the web for information.",
)

In [9]:
# Orchestrator that delegates to the file-search and web-search agents via handoffs.
orchestrator_agent = Agent(
    name="orchestrator_agent",
    instructions=(
        # Adjacent string literals are joined with no separator, so each sentence
        # ends with an explicit space; without it the prompt read "for.If asked".
        "You are a helpful agent. You use the agents given to you to find the information that the user is looking for. "
        "If asked for multiple requests, you call the relevant agents in order. "
        "You never generate information on your own, you always use the provided tools."
    ),
    handoffs=[file_search_agent, web_search_agent]
)

In [10]:
result = await Runner.run(
        orchestrator_agent,
        input=f"Search the web for 'Guardian Group insurance dental plan' and tell me any recommendations to update the existing insurance benefit",
    )
print(result.final_output)

Guardian Life Insurance Company offers a range of group dental insurance plans designed to meet diverse employer and employee needs. To enhance your existing dental benefits, consider the following recommendations:

**1. Explore Guardian's Group Dental PPO Insurance:**
Guardian's Group Dental PPO plans are highly customizable, allowing you to tailor coverage based on your budget and employees' preferences. These plans are available to companies of any size, starting with just two employees, and can be structured as employer-paid or voluntary. By configuring a plan that aligns with your financial parameters and offers employees flexibility in choosing their dentists, you can provide a valuable benefit that promotes oral health. ([guardiananytime.com](https://www.guardiananytime.com/gafd/wps/wcm/connect/GA%2BContent/GA/Home/Employers/Products%2Band%2BCoverage/Dental/group-dental-ppo?utm_source=openai))

**2. Leverage Guardian's Extensive Provider Network:**
Guardian boasts one of the nat