#### Set Up

In [1]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install -q PyPDF2 pandas tqdm openai

In [1]:
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import concurrent
import PyPDF2
import os
import pandas as pd
import base64

# The API key is read from the environment. Set it in the shell before starting Jupyter:
##   export OPENAI_API_KEY='XXXX'
##   echo $OPENAI_API_KEY
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
dir_pdfs = 'data' # have those PDFs stored locally here
# os.listdir returns every entry in the directory (hidden files, sub-directories,
# non-PDF files) in arbitrary order. Keep only regular *.pdf files and sort them
# so that pdf_files[0] refers to the same document on every run.
pdf_files = sorted(
    os.path.join(dir_pdfs, f)
    for f in os.listdir(dir_pdfs)
    if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(dir_pdfs, f))
)

In [2]:
def upload_single_pdf(file_path: str, vector_store_id: str):
    """Upload one local file and attach it to a vector store.

    Args:
        file_path: Path of the file to upload.
        vector_store_id: ID of the vector store the uploaded file is attached to.

    Returns:
        ``{"file": <basename>, "status": "success"}`` when both API calls succeed,
        otherwise ``{"file": <basename>, "status": "failed", "error": <message>}``.
        Exceptions are never propagated; they are printed and reported in the
        returned dict so that a batch upload can keep going.
    """
    file_name = os.path.basename(file_path)
    try:
        # The context manager guarantees the handle is closed even when the
        # upload raises; this function runs in many worker threads, so leaked
        # handles would otherwise accumulate.
        with open(file_path, 'rb') as pdf_handle:
            file_response = client.files.create(file=pdf_handle, purpose="assistants")
        client.vector_stores.files.create(
            vector_store_id=vector_store_id,
            file_id=file_response.id
        )
        return {"file": file_name, "status": "success"}
    except Exception as e:
        print(f"Error with {file_name}: {str(e)}")
        return {"file": file_name, "status": "failed", "error": str(e)}

def upload_pdf_files_to_vector_store(vector_store_id: str):
    """Upload every PDF found in ``dir_pdfs`` to a vector store, in parallel.

    Args:
        vector_store_id: ID of the vector store that receives the files.

    Returns:
        A stats dict with the keys ``total_files``, ``successful_uploads``,
        ``failed_uploads`` and ``errors`` (the result dicts of the failed uploads).
    """
    # Only *.pdf entries are uploaded: os.listdir also returns hidden files and
    # sub-directories, which would otherwise be counted as (failing) "PDF files".
    pdf_files = sorted(
        os.path.join(dir_pdfs, f)
        for f in os.listdir(dir_pdfs)
        if f.lower().endswith(".pdf")
    )
    stats = {"total_files": len(pdf_files), "successful_uploads": 0, "failed_uploads": 0, "errors": []}

    print(f"{len(pdf_files)} PDF files to process. Uploading in parallel...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(upload_single_pdf, file_path, vector_store_id): file_path for file_path in pdf_files}
        # as_completed yields futures as they finish, so the progress bar
        # advances per completed upload rather than in submission order.
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(pdf_files)):
            result = future.result()
            if result["status"] == "success":
                stats["successful_uploads"] += 1
            else:
                stats["failed_uploads"] += 1
                stats["errors"].append(result)

    return stats

def create_vector_store(store_name: str) -> dict:
    """Create a vector store and return a small summary of it.

    Args:
        store_name: Name given to the new vector store.

    Returns:
        A dict with the keys ``id``, ``name``, ``created_at`` and ``file_count``.
        If anything fails, the error is printed and an empty dict is returned,
        so callers must check for a missing ``"id"`` key.
    """
    try:
        store = client.vector_stores.create(name=store_name)
        summary = dict(
            id=store.id,
            name=store.name,
            created_at=store.created_at,
            file_count=store.file_counts.completed,
        )
        print("Vector store created:", summary)
    except Exception as err:
        print(f"Error creating vector store: {err}")
        return {}
    return summary

In [3]:
# Create the vector store, then upload the local files into it.
# NOTE: create_vector_store returns {} on failure, in which case the ["id"]
# lookup below raises KeyError.
store_name = "openai_blog_store"
vector_store_details = create_vector_store(store_name)
# Last expression of the cell: the returned upload stats are shown as its output.
upload_pdf_files_to_vector_store(vector_store_details["id"])

Vector store created: {'id': 'vs_67f8b58901c48191984624dbc7ee9dd0', 'name': 'openai_blog_store', 'created_at': 1744352649, 'file_count': 0}
1 PDF files to process. Uploading in parallel...


100%|██████████| 1/1 [00:43<00:00, 43.62s/it]


{'total_files': 1, 'successful_uploads': 1, 'failed_uploads': 0, 'errors': []}

In [4]:
# Search the vector store created above for content matching the query.
query = "What's is the three test algorithm?"
search_results = client.vector_stores.search(
    vector_store_id=vector_store_details['id'],
    query=query
)

In [5]:
# One summary line per search hit: length of its first content part, source file and score.
for result in search_results.data:
    chunk_chars = len(result.content[0].text)
    print(f"{chunk_chars} of character of content from {result.filename} with a relevant score of {result.score}")

3243 of character of content from Kenya HIV Prevention and Treatment Guidelines, 2022.pdf with a relevant score of 0.7112310534384182
2734 of character of content from Kenya HIV Prevention and Treatment Guidelines, 2022.pdf with a relevant score of 0.5440879896364054
3135 of character of content from Kenya HIV Prevention and Treatment Guidelines, 2022.pdf with a relevant score of 0.46890893488627894
3237 of character of content from Kenya HIV Prevention and Treatment Guidelines, 2022.pdf with a relevant score of 0.4614953523608751
3418 of character of content from Kenya HIV Prevention and Treatment Guidelines, 2022.pdf with a relevant score of 0.4483077232768376
3027 of character of content from Kenya HIV Prevention and Treatment Guidelines, 2022.pdf with a relevant score of 0.41732623324363355
3026 of character of content from Kenya HIV Prevention and Treatment Guidelines, 2022.pdf with a relevant score of 0.36586905328085234
3266 of character of content from Kenya HIV Prevention and 

In [6]:
query = "My client has just been diagnosed with HIV. She is pregnant, has elevated liver markers and has TB. How should I manage her?"
response = client.responses.create(
    model="gpt-4o-mini",
    input=query,
    tools=[{
        "type": "file_search",
        "vector_store_ids": [vector_store_details['id']],
    }],
)

# output[0] is the file-search call; the answer is the first content part of output[1].
message_part = response.output[1].content[0]

# Extract annotations from the response
annotations = message_part.annotations

# Distinct file names referenced by the annotations
retrieved_files = {annotation.filename for annotation in annotations}

print(f'Files used: {retrieved_files}')
print('Response:')
print(message_part.text)

Files used: {'Kenya HIV Prevention and Treatment Guidelines, 2022.pdf'}
Response:
Managing a pregnant woman who has been diagnosed with HIV, elevated liver markers, and tuberculosis (TB) involves a multidisciplinary approach that ensures the health of both the mother and the baby. Here are the recommended management steps based on current guidelines:

### 1. **Initiate Antiretroviral Therapy (ART)**
- **Start ART**: Initiate lifelong ART for all pregnant women living with HIV, irrespective of their CD4 count or any clinical stage. The preferred first-line regimen is **TDF + 3TC + DTG** (Tenofovir + Lamivudine + Dolutegravir).
- **Timing**: Ideally, initiate ART on the same day as the HIV diagnosis.

### 2. **Monitor Liver Function**
- Given the elevated liver markers, closer monitoring of liver function tests (e.g., ALT) is critical.
- Liver function should be assessed regularly as the co-infection with TB and potential hepatotoxicity from ART necessitates vigilance.

### 3. **Manage T

#### Evaluating performance

In [7]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of a PDF.

    Pages whose extraction yields nothing are skipped. If reading fails, the
    error is printed and whatever text was collected up to that point
    (possibly the empty string) is returned.
    """
    page_chunks = []
    try:
        with open(pdf_path, "rb") as handle:
            for page in PyPDF2.PdfReader(handle).pages:
                extracted = page.extract_text()
                if extracted:
                    page_chunks.append(extracted)
    except Exception as err:
        print(f"Error reading {pdf_path}: {err}")
    return "".join(page_chunks)



def generate_multiple_questions(pdf_path, num_questions=3, tokens_per_chunk=6000):
    """Generate multiple questions from different parts of a PDF.

    The document text is divided into ``num_questions`` equally sized sections
    and the model is asked for one question per section.

    Args:
        pdf_path: Path of the PDF to read.
        num_questions: Number of sections, and therefore questions, to produce.
            Values <= 0 yield an empty list.
        tokens_per_chunk: Upper bound on the section size sent to the model;
            it is multiplied by 4 to obtain a character limit (presumably a
            ~4 characters-per-token estimate).

    Returns:
        A list of question strings. It is empty when the PDF cannot be read or
        contains no extractable text, and shorter than ``num_questions`` when
        individual model calls fail (those errors are printed, not raised).
    """
    questions = []

    # Guard: dividing the text into zero sections would raise ZeroDivisionError.
    if num_questions <= 0:
        return questions

    # Extract the text of every page; any read error aborts with no questions.
    page_texts = []
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return questions
    full_text = "".join(page_texts)

    # Guard: without text (e.g. an image-only PDF) every prompt would be empty,
    # so skip the API calls altogether.
    if not full_text.strip():
        print(f"No extractable text in {pdf_path}; skipping question generation.")
        return questions

    # Divide document into sections
    text_length = len(full_text)
    chars_per_section = text_length // num_questions
    max_section_chars = tokens_per_chunk * 4

    # Generate a question from each section
    for i in range(num_questions):
        start = i * chars_per_section
        end = min(start + max_section_chars, text_length)
        section_text = full_text[start:end]

        prompt = (
            f"This is section {i+1} of {num_questions} from a document. "
            "Can you generate a question that can only be answered from this section?:\n"
            f"{section_text}\n\n"
        )

        try:
            response = client.responses.create(
                input=prompt,
                model="gpt-4o",
            )
            question = response.output[0].content[0].text
            questions.append(question)
        except Exception as e:
            print(f"Error generating question for section {i+1}: {e}")

    return questions

In [8]:
# Try question generation on the first PDF; the returned list is shown as the cell output.
generate_multiple_questions(pdf_files[0])

["What is the main theme of the 2022 edition of the Kenya HIV Prevention and Treatment Guidelines as stated in the document's foreword?",
 'What is the recommended starting dose of atorvastatin for patients not on a PI/r, and how long should you wait before repeating fasting lipids to titrate the dose?',
 'What diagnostic steps should be followed for a patient who tests positive on smear microscopy but has a negative GeneXpert MTB result?']

In [9]:
# Generate questions for each PDF and store in a dictionary
# Generate questions for each PDF and store them in a dictionary keyed by the
# PDF's base file name; each value is the list returned by generate_multiple_questions.
questions_dict = {}
for pdf_path in pdf_files:
    questions = generate_multiple_questions(pdf_path)
    questions_dict[os.path.basename(pdf_path)] = questions

In [10]:
# Display the generated questions per file.
questions_dict

{'Kenya HIV Prevention and Treatment Guidelines, 2022.pdf': ['What is the theme of the Kenya HIV Prevention and Treatment Guidelines, 2022, according to the foreword?',
  'What is the advised approach if a patient with diabetes does not meet treatment targets with metformin for 3-6 months at the maximum tolerated dose?',
  'What are the considerations and actions involved in managing a presumptive TB case when both the GeneXpert and smear microscopy are unavailable on site?']}

In [21]:
# One evaluation row per document: "query" holds that document's list of
# generated questions, "_id" is the file name with ".pdf" removed.
rows = [
    {"query": file_questions, "_id": pdf_name.replace(".pdf", "")}
    for pdf_name, file_questions in questions_dict.items()
]

# Metrics evaluation parameters
k = 5
total_queries = len(rows)
correct_retrievals_at_k, reciprocal_ranks, average_precisions = 0, [], []

def _find_annotations(response):
    """Return the first annotation list found among the response's output items, or None."""
    for item in response.output:
        # Message-like items carry annotations on their content parts.
        for part in getattr(item, 'content', None) or []:
            part_annotations = getattr(part, 'annotations', None)
            if part_annotations is not None:
                return part_annotations
        # Fallback: annotations stored directly on the output item.
        item_annotations = getattr(item, 'annotations', None)
        if item_annotations is not None:
            return item_annotations
    return None


def _score_retrieval(retrieved_files, expected_filename):
    """Return (correct, reciprocal_rank, average_precision) for one ranked list of file names."""
    if expected_filename in retrieved_files:
        rank = retrieved_files.index(expected_filename) + 1
        rr = 1 / rank
        correct = True
    else:
        rr = 0
        correct = False

    # Average precision: mean of precision@i over the positions i that hold the expected file.
    precisions = []
    num_relevant = 0
    for position, fname in enumerate(retrieved_files, start=1):
        if fname == expected_filename:
            num_relevant += 1
            precisions.append(num_relevant / position)
    avg_precision = sum(precisions) / len(precisions) if precisions else 0

    return correct, rr, avg_precision


def process_query(row):
    """Run one evaluation query through file search and score the result.

    Reads the module-level ``client``, ``vector_store_details`` and ``k``.

    Args:
        row: Dict with ``"query"`` (a question string, or a list of questions of
            which only the first is used) and ``"_id"`` (expected file name
            without the ``.pdf`` extension).

    Returns:
        ``(correct, reciprocal_rank, average_precision)``, computed over the file
        names of the first ``k`` annotations of the response. ``(False, 0, 0)``
        is returned when the response carries no annotations.
    """
    query_list = row['query']  # This is a list of strings
    if isinstance(query_list, list):
        query = query_list[0]  # Let's use the first question for testing
    else:
        query = query_list  # In case it's already a string

    expected_filename = row['_id'] + '.pdf'

    # Call file_search via Responses API
    response = client.responses.create(
        input=[{
            "role": "user",
            "content": [{"type": "input_text", "text": query}]  # Now query is a string
        }],
        model="gpt-4o-mini",
        tools=[{
            "type": "file_search",
            "vector_store_ids": [vector_store_details['id']],
            "max_num_results": k,
        }],
        tool_choice="required"
    )

    # Locate the annotations by scanning the output items instead of assuming
    # the message sits at index 1; a shorter output would otherwise raise
    # IndexError and abort the whole parallel evaluation.
    annotations = _find_annotations(response)

    if annotations is None:
        print(f"No annotations for query: {query}")
        return False, 0, 0

    # Get top-k retrieved filenames
    retrieved_files = [result.filename for result in annotations[:k]]
    correct, rr, avg_precision = _score_retrieval(retrieved_files, expected_filename)

    if not correct:
        print("Expected file NOT found in the retrieved files!")

    # Diagnostics whenever the expected file is not ranked first.
    if retrieved_files and retrieved_files[0] != expected_filename:
        print(f"Query: {query}")
        print(f"Expected file: {expected_filename}")
        print(f"First retrieved file: {retrieved_files[0]}")
        print(f"Retrieved files: {retrieved_files}")
        print("-" * 50)

    return correct, rr, avg_precision

In [22]:
# Sanity-check a single row; the result is (correct, reciprocal_rank, average_precision).
process_query(rows[0])

(True, 1.0, 1.0)

In [23]:
# Score every row concurrently; executor.map returns results in the order of `rows`.
with ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(process_query, rows), total=total_queries))

# Split the (correct, reciprocal_rank, average_precision) tuples column-wise.
correct_retrievals_at_k = sum(1 for hit, _, _ in results if hit)
reciprocal_ranks = [reciprocal_rank for _, reciprocal_rank, _ in results]
average_precisions = [avg_prec for _, _, avg_prec in results]

recall_at_k = correct_retrievals_at_k / total_queries
precision_at_k = recall_at_k  # In this context, same as recall
mrr = sum(reciprocal_ranks) / total_queries
map_score = sum(average_precisions) / total_queries

100%|██████████| 1/1 [00:06<00:00,  6.59s/it]


In [24]:
# Print the metrics with k
print(f"Metrics at k={k}:")
print(f"Recall@{k}: {recall_at_k:.4f}")
print(f"Precision@{k}: {precision_at_k:.4f}")
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
print(f"Mean Average Precision (MAP): {map_score:.4f}")

Metrics at k=5:
Recall@5: 1.0000
Precision@5: 1.0000
Mean Reciprocal Rank (MRR): 1.0000
Mean Average Precision (MAP): 1.0000
