In [1]:
%pip install PyPDF2 pandas tqdm openai -q

Note: you may need to restart the kernel to use updated packages.


In [20]:
%pip install --upgrade openai

Note: you may need to restart the kernel to use updated packages.


In [12]:
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import concurrent
import PyPDF2
import os
import pandas as pd
from dotenv import load_dotenv
import gradio as gr
from utils.api_key_checker import APIKeyChecker

In [13]:
# Create an instance of APIKeyChecker
checker = APIKeyChecker()

# Call the check_keys method to print the API key status
checker.check_keys()



OpenAI API Key exists and begins sk-proj-
XAI API Key exists and begins xai-fDAl


In [14]:
openai_api_key = checker.get_openai_api_key()

In [15]:
client = OpenAI(api_key=openai_api_key)

dir_pdfs = 'mediterranean_noir_pdfs' # have those PDFs stored locally here
pdf_files = [os.path.join(dir_pdfs, f) for f in os.listdir(dir_pdfs)]

print(f"{len(pdf_files)} PDF files to process")

13 PDF files to process


In [16]:
def upload_single_pdf(file_path: str, vector_store_id: str):
    file_name = os.path.basename(file_path)
    try:
        file_response = client.files.create(file=open(file_path, 'rb'), purpose="assistants")
        attach_response = client.vector_stores.files.create(
            vector_store_id=vector_store_id,
            file_id=file_response.id
        )
        return {"file": file_name, "status": "success"}
    except Exception as e:
        print(f"Error with {file_name}: {str(e)}")
        return {"file": file_name, "status": "failed", "error": str(e)}

def upload_pdf_files_to_vector_store(vector_store_id: str):
    pdf_files = [os.path.join(dir_pdfs, f) for f in os.listdir(dir_pdfs)]
    stats = {"total_files": len(pdf_files), "successful_uploads": 0, "failed_uploads": 0, "errors": []}
    
    print(f"{len(pdf_files)} PDF files to process. Uploading in parallel...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(upload_single_pdf, file_path, vector_store_id): file_path for file_path in pdf_files}
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(pdf_files)):
            result = future.result()
            if result["status"] == "success":
                stats["successful_uploads"] += 1
            else:
                stats["failed_uploads"] += 1
                stats["errors"].append(result)

    return stats

def create_vector_store(store_name: str) -> dict:
    try:
        vector_store = client.vector_stores.create(name=store_name)
        details = {
            "id": vector_store.id,
            "name": vector_store.name,
            "created_at": vector_store.created_at,
            "file_count": vector_store.file_counts.completed
        }
        print("Vector store created:", details)
        return details
    except Exception as e:
        print(f"Error creating vector store: {e}")
        return {}

In [17]:
store_name = "mediterranean_noir_store"
vector_store_details = create_vector_store(store_name)
upload_pdf_files_to_vector_store(vector_store_details["id"])

Vector store created: {'id': 'vs_67db26311d4c8191a5f1caccd8e983b9', 'name': 'mediterranean_noir_store', 'created_at': 1742415409, 'file_count': 0}
13 PDF files to process. Uploading in parallel...


100%|██████████| 13/13 [00:03<00:00,  3.26it/s]


{'total_files': 13,
 'successful_uploads': 13,
 'failed_uploads': 0,
 'errors': []}

In [22]:
def chat(message):
    query = message
    response = client.responses.create(
        input= query,
        instructions="Answer the question with as much detail as you can.",
        model="gpt-4o-mini",
        tools=[{
            "type": "file_search",
            "vector_store_ids": [vector_store_details['id']],
        }]
    )
    print(response.output)

    if len(response.output) == 1:
        print(response)
        return response.output[0].content[0].text
    

    # Extract annotations from the response
    annotations = response.output[1].content[0].annotations
    
    # # Get top-k retrieved filenames
    retrieved_files = set([result.filename for result in annotations])

    answer = f'Files used: {retrieved_files}' + '\n' + 'Response:' + response.output[1].content[0].text

    return answer





In [None]:
gr.Interface(fn=chat, inputs="textbox", outputs="textbox").launch()

* Running on local URL:  http://127.0.0.1:7869

To create a public link, set `share=True` in `launch()`.




[ResponseFileSearchToolCall(id='fs_67db2f3d68c48190aa58d582fa0a8078024f8bb83c4b7568', queries=['Alligator series'], status='completed', type='file_search_call', results=None), ResponseOutputMessage(id='msg_67db2f3ebbec81908f63e7cf6953cdf6024f8bb83c4b7568', content=[ResponseOutputText(annotations=[AnnotationFileCitation(file_id='file-3Z5i51FdQn3usTSnA7b6H5', index=831, type='file_citation', filename='Chapter 5: Characters in Mediterranean Noir.pdf'), AnnotationFileCitation(file_id='file-3Z5i51FdQn3usTSnA7b6H5', index=831, type='file_citation', filename='Chapter 5: Characters in Mediterranean Noir.pdf'), AnnotationFileCitation(file_id='file-XJQ9rRdFJ5LdaHap1zT6JA', index=1100, type='file_citation', filename='Chapter 2: Common Themes and Motifs.pdf'), AnnotationFileCitation(file_id='file-3Z5i51FdQn3usTSnA7b6H5', index=1100, type='file_citation', filename='Chapter 5: Characters in Mediterranean Noir.pdf'), AnnotationFileCitation(file_id='file-3Z5i51FdQn3usTSnA7b6H5', index=1272, type='file