In [2]:
import json
import os
from io import BytesIO
from pathlib import Path
from urllib.parse import urlparse

import gradio as gr
import requests
from dotenv import load_dotenv
from openai import OpenAI

In [3]:
# Load variables from the .env file, letting them override anything already
# present in the process environment.
load_dotenv(override=True)

openai_api_key = os.getenv('OPENAI_API_KEY')

# Confirm the key is available without printing more than its first 8 characters.
print(
    f"OpenAI API Key exists and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API Key not set"
)


OpenAI API Key exists and begins sk-proj-


In [4]:
# Pushover endpoint and credentials used by the push-notification helper.
pushover_url = "https://api.pushover.net/1/messages.json"
pushover_user = os.getenv("PUSHOVER_USER")
pushover_token = os.getenv("PUSHOVER_TOKEN")

# Report whether each credential is present, revealing only its first character.
print(
    f"Pushover user found and starts with {pushover_user[0]}"
    if pushover_user
    else "Pushover user not found"
)
print(
    f"Pushover token found and starts with {pushover_token[0]}"
    if pushover_token
    else "Pushover token not found"
)

Pushover user found and starts with u
Pushover token found and starts with a


In [5]:
# Shared client instance used by the storage, vector-store and chat cells below.
# NOTE(review): constructed with no arguments — presumably it picks up
# OPENAI_API_KEY from the environment loaded earlier; confirm.
openAI_client = OpenAI()

In [6]:
def push(message):
    """Send ``message`` as a Pushover notification (best effort).

    The message is echoed to stdout first. Delivery problems (network errors,
    timeouts, non-2xx responses) are reported on stdout instead of being
    raised, so a failed notification never aborts the chat turn that
    triggered it.

    Args:
        message: Text of the notification.
    """
    print(f"Push: {message}")
    payload = {"user": pushover_user, "token": pushover_token, "message": message}
    try:
        # A timeout keeps a stalled connection from hanging the caller forever.
        response = requests.post(pushover_url, data=payload, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Push notification failed: {exc}")

def record_user_details(email, name="Name not provided", notes="not provided"):
    """Notify via push() that a visitor left contact details.

    Args:
        email: The visitor's email address.
        name: The visitor's name.
        notes: Optional free-form notes.

    Returns:
        ``{"recorded": "ok"}`` as an acknowledgement.
    """
    notification = f"Recording interest from {name} with email {email} and notes {notes}"
    push(notification)
    return dict(recorded="ok")

def record_unknown_question(question):
    """Notify via push() about a question that could not be answered.

    Args:
        question: The question text to report.

    Returns:
        ``{"recorded": "ok"}`` as an acknowledgement.
    """
    notification = f"Recording {question} asked that I couldn't answer"
    push(notification)
    return dict(recorded="ok")
    

In [7]:
def upload_file_to_storage(client: "OpenAI", file_path, replace=True):
    """Upload a local file or a URL's content to OpenAI file storage.

    Files are matched against existing storage entries by exact file name
    (not by content).

    Args:
        client: OpenAI client providing ``files.list`` / ``files.delete`` /
            ``files.create``.
        file_path: Local path, or an ``http://`` / ``https://`` URL.
        replace: When True, every stored file with the same name is deleted
            before the new upload. When False and a same-named file already
            exists, nothing is uploaded and the existing id is returned.

    Returns:
        The id of the uploaded file, or of the existing file when the upload
        was skipped.

    Raises:
        requests.RequestException: if the URL cannot be downloaded.
        OSError: if the local file cannot be read.
    """
    file_path = str(file_path)

    # 1. Read the raw bytes and derive the name the file is stored under.
    if file_path.startswith(("http://", "https://")):
        response = requests.get(file_path, timeout=30)
        response.raise_for_status()  # do not upload an HTML error page as content
        content = response.content
        # Use only the URL path so query strings/fragments don't end up in the name.
        file_name = Path(urlparse(file_path).path).name or "download"
    else:
        local_path = Path(file_path)
        content = local_path.read_bytes()
        file_name = local_path.name

    # 2. Find stored files with exactly this name. Iterating the list result
    #    itself (rather than ``.data``) lets the SDK walk every page.
    #    An exact comparison avoids deleting e.g. "MyProfile.pdf" when
    #    uploading "Profile.pdf".
    matching_ids = [
        stored.id
        for stored in client.files.list(purpose="assistants")
        if stored.filename == file_name
    ]

    # 3. Replace or skip.
    if matching_ids:
        if not replace:
            print(f"File with name {file_name} already exists. Skipping upload.")
            return matching_ids[0]
        print(f"File with name {file_name} found. Deleting old version from the file storage")
        for stale_id in matching_ids:
            client.files.delete(stale_id)

    # 4. Upload; the API expects a (name, file-like) tuple.
    result = client.files.create(
        file=(file_name, BytesIO(content)),
        purpose="assistants"
    )

    print(f"Uploaded new file: {result.id}")
    return result.id

In [8]:
def creat_vector_store(name: str):
    """Return the vector store called ``name``, creating it if none exists.

    Uses the module-level ``openAI_client``.

    Args:
        name: Exact name of the vector store to look up or create.

    Returns:
        The existing vector store object whose ``name`` matches, otherwise the
        newly created one.
    """
    # Iterate the list result itself instead of ``.data`` so the SDK fetches
    # every page; reading only the first page could miss an existing store
    # and create a duplicate with the same name.
    existing_vs = next(
        (vs for vs in openAI_client.vector_stores.list() if vs.name == name),
        None,
    )

    if existing_vs is not None:
        print(f"Found existing vector store: {existing_vs.id}. Will not create a new one.")
        return existing_vs

    vector_store = openAI_client.vector_stores.create(name=name)
    print(f"Created new vector store: {vector_store.id}")
    return vector_store



In [9]:
def vector_store_files_cleanup(client: "OpenAI", vs_id: str):
    """Detach vector-store files whose backing storage file no longer exists.

    A vector-store file counts as orphaned when its id is not among the ids
    returned by ``client.files.list(purpose="assistants")``.

    Args:
        client: OpenAI client used for the listings and deletions.
        vs_id: Id of the vector store to clean.
    """
    # Iterate the list results themselves (not ``.data``) so every page is
    # read. With a truncated storage listing, valid attachments would look
    # orphaned and be removed.
    active_vs_file_ids = {
        vs_file.id for vs_file in client.vector_stores.files.list(vector_store_id=vs_id)
    }
    storage_ids = {stored.id for stored in client.files.list(purpose="assistants")}

    orphaned_ids = sorted(active_vs_file_ids - storage_ids)
    for file_id in orphaned_ids:
        print(f"Deleting orphaned vector_store file with the id: {file_id}")
        client.vector_stores.files.delete(vector_store_id=vs_id, file_id=file_id)

    print(f"Cleanup complete. Removed {len(orphaned_ids)} orphaned files.")
    

In [19]:
def sync_vector_store_with_directory(client:OpenAI, vector_store_name:str, directory_path:Path, replace:bool=True):
    """Upload the .pdf/.txt files of a directory and attach them to a vector store.

    Steps: look up (or create) the vector store, upload each matching file to
    storage, attach files not yet linked to the store, then remove
    vector-store entries whose storage file is gone.

    Args:
        client: OpenAI client used for uploads, linking and cleanup.
        vector_store_name: Name of the vector store to sync into.
        directory_path: Directory whose direct children are scanned.
        replace: Forwarded to ``upload_file_to_storage``; when True, stored
            files with the same name are replaced by a fresh upload.
    """
    # NOTE: creat_vector_store takes no client argument and uses the
    # module-level openAI_client for the lookup/creation.
    vector_store = creat_vector_store(vector_store_name)

    # Ids already attached to the store. Iterating the list result itself
    # (not ``.data``) makes the SDK read every page.
    existing_vs_ids = {
        vs_file.id
        for vs_file in client.vector_stores.files.list(vector_store_id=vector_store.id)
    }

    # sorted() gives a stable processing order; iterdir() order is arbitrary.
    for file_path in sorted(Path(directory_path).iterdir()):
        if not (file_path.is_file() and file_path.suffix.lower() in {".pdf", ".txt"}):
            continue

        # Honour the caller's client and replace flag instead of the global
        # client and a hard-coded True.
        file_id = upload_file_to_storage(client, str(file_path), replace=replace)

        if file_id in existing_vs_ids:
            print(f"File {file_id} is already linked to this vector store. Skipping.")
            continue

        client.vector_stores.files.create(
            vector_store_id=vector_store.id,
            file_id=file_id
        )
        print(f"Linked file {file_id} to vector store.")

    vector_store_files_cleanup(client=client, vs_id=vector_store.id)
    

In [20]:
# In Jupyter notebooks, __file__ is not defined, so we use the current working directory
# If running from workspace root, use: script_dir = Path.cwd() / "2_openai"
# If running from 2_openai directory, use: script_dir = Path(".")
script_dir = Path(".")  # Assumes notebook is run from the 2_openai directory
# Directory whose .pdf/.txt files are uploaded by sync_vector_store_with_directory.
text_files_path = script_dir / "me" 
# The name is the lookup key creat_vector_store matches exactly against vs.name.
# NOTE(review): "Bakcground" looks like a typo of "Background"; changing it
# would make the lookup miss the existing store and create a new one.
vector_store_name="Ehsan Professional Bakcground"
sync_vector_store_with_directory(client=openAI_client, vector_store_name=vector_store_name, directory_path=text_files_path)


Found existing vector store: vs_69536bfb53d88191b6203de7419e4cc3. Will not create a new one.
File with name Profile.pdf found. Deleting old version from the file storage
Uploaded new file: file-S1cMFSBfm8WynZW6toxANa
Linked file file-S1cMFSBfm8WynZW6toxANa to vector store.
File with name ProfileSummary.txt found. Deleting old version from the file storage
Uploaded new file: file-EhJrVPnHgZ316CK2fuPRLS
Linked file file-EhJrVPnHgZ316CK2fuPRLS to vector store.
Deleting orphaned vector_store file with the id: file-TU4Y1NJarxjUji9EwA1PbP
Deleting orphaned vector_store file with the id: file-7E2QDoV17TfRLLDVYR8QmJ
Cleanup complete. Removed 2 orphaned files.


In [12]:
# Look up the vector store by name. The module-level `vector_store` assigned
# here is what the file_search tool definition reads further down.
vector_store = creat_vector_store(vector_store_name)

# List the files attached to the store and print the raw page object for inspection.
result = openAI_client.vector_stores.files.list(
    vector_store_id=vector_store.id
)
print(f"{result} \n ")

# Retrieve the store's metadata to report its size.
vs_details = openAI_client.vector_stores.retrieve(vector_store_id=vector_store.id)
print(f"Vector Store Size: {vs_details.usage_bytes} bytes")

Found existing vector store: vs_69536bfb53d88191b6203de7419e4cc3. Will not create a new one.
SyncCursorPage[VectorStoreFile](data=[VectorStoreFile(id='file-AGmccygAbyLqgR4qt4zMUj', created_at=1767076850, last_error=None, object='vector_store.file', status='completed', usage_bytes=2379, vector_store_id='vs_69536bfb53d88191b6203de7419e4cc3', attributes={}, chunking_strategy=StaticFileChunkingStrategyObject(static=StaticFileChunkingStrategy(chunk_overlap_tokens=400, max_chunk_size_tokens=800), type='static')), VectorStoreFile(id='file-WWsHkRiwGY43PyCqGHrRXo', created_at=1767076847, last_error=None, object='vector_store.file', status='completed', usage_bytes=32570, vector_store_id='vs_69536bfb53d88191b6203de7419e4cc3', attributes={}, chunking_strategy=StaticFileChunkingStrategyObject(static=StaticFileChunkingStrategy(chunk_overlap_tokens=400, max_chunk_size_tokens=800), type='static'))], has_more=False, object='list', first_id='file-AGmccygAbyLqgR4qt4zMUj', last_id='file-WWsHkRiwGY43PyCqGH

In [13]:
# Define tool JSON schemas for OpenAI API

# Schema for the record_user_details function tool. "email" and "name" are
# required; "notes" is optional. Defaults mirror the Python handler's defaults
# so the model never sees placeholder text such as "[Name of the user]".
record_user_details_tool = {
    "type": "function",
    "name": "record_user_details",
    "description": "records a user's name and email address",
    "parameters": {
        "type": "object",
        "properties": {
            "email": {
                "type": "string",
                "description": "The email address of the user"
            },
            "name": {
                "type": "string",
                "description": "The user's name",
                "default": "Name not provided"
            },
            "notes": {
                "type": "string",
                "description": "Any additional notes",
                "default": "not provided"
            }
        },
        "required": ["email", "name"],
        "additionalProperties": False
    }
}

# Schema for the record_unknown_question function tool: a single required
# string argument "question"; extra properties are disallowed.
record_unknown_question_tool = {
    "type": "function",
    "name": "record_unknown_question", 
    "description": "records a question to which the LLM could not find an answer to",
    "parameters": {
        "type": "object",
        "properties": {
            "question": {
                "type": "string",
                "description": "The question that couldn't be answered"
            }
        },
        "required": ["question"],
        "additionalProperties": False
    } 
}

# File-search tool bound to the vector store looked up earlier in the notebook.
vector_store_tool = {"type": "file_search", "vector_store_ids": [vector_store.id]}

# Every tool offered to the model: the two function tools plus file search.
tools = [record_user_details_tool, record_unknown_question_tool, vector_store_tool]

Reference: OpenAI function-calling guide — https://platform.openai.com/docs/guides/function-calling


In [None]:

# Person the assistant represents; interpolated into the system prompt below.
name = "Ehsan Masnavi"

# System prompt passed as `instructions` on every model call. It covers
# answer sourcing, mandatory logging of unanswerable questions, lead capture,
# and a ban on mentioning internal files or tools.
system_prompt = f"""
You are acting as {name}, representing {name} on their personal website.

Your role:
- Answer questions about {name}'s career, background, skills, experience, and professional interests.
- Communicate professionally and confidently, as if speaking with a potential client, recruiter, or employer.

Knowledge sourcing rules (STRICT):
1. For every factual question, FIRST attempt to answer using the `vector_store_tool`.
2. If the `vector_store_tool`:
   - does not return relevant information, OR
   - explicitly indicates that specific information cannot be found,
   THEN you MUST call `record_unknown_question` before providing any response.
3. If the information cannot be found via `vector_store_tool`, DO NOT guess or improvise an answer.

Tool transparency rules:
- Do NOT mention files, documents, embeddings, or tools in your responses.
- Do NOT ask the user whether you should look up information.
- The user should never be aware of internal data retrieval.
- Do not make any suggestion such as "If you have any specific questions about the uploaded files let me know".

Conversation and lead-generation behavior:
- If the user engages in discussion, shows interest, or asks open-ended questions,
  gently encourage them to get in touch.
- Ask for their name and email in a natural, professional manner.
- When provided, store these details using the `record_user_details` tool.

Fallback behavior:
- If a question cannot be answered using available knowledge,
  record it with `record_unknown_question` and respond with a brief,
  professional message indicating that the information is not currently available.
  
Unknown-question logging (MANDATORY):
- A question is considered UNKNOWN if the `vector_store_tool` does not return
  explicit, relevant information that directly answers the question.
- For EVERY UNKNOWN question, without exception you MUST:
  1. Call `record_unknown_question` with the full user question.
  2. Do NOT answer the question in any form.
  3. Do NOT speculate, summarize, or provide partial information.
- There are NO exceptions to this rule, even for:
  - trivial questions
  - casual conversation
  - quetions unrelated to the professional background
  - personal questions
  - hypothetical or exploratory questions
  - questions unrelated to {name}'s career or background
  - Questions that are repeated several time
  
   ABSOLUTE PROHIBITION (NON-NEGOTIABLE):
- You are FORBIDDEN from mentioning or implying the existence of:
  files, uploads, documents, data sources, vector stores, embeddings,
  internal context, prior inputs, or system-provided information.
- This includes generic phrases such as:
  “files you uploaded”, “information you provided”, “documents”,
  “based on what I have”, or similar wording.
- If you generate a response that contains any such reference,
  you MUST immediately rephrase it to remove the reference
  before finalizing the answer.
"""


In [15]:
def _run_function_call(name, raw_arguments):
    """Execute one function tool by name and return its JSON-serialisable result.

    Bad input is reported as an ``{"error": ...}`` dict rather than raised, so
    the model receives the failure as tool output and the chat turn continues.
    """
    try:
        arguments = json.loads(raw_arguments or "{}")
    except json.JSONDecodeError as exc:
        return {"error": f"invalid JSON arguments: {exc}"}
    if not isinstance(arguments, dict):
        return {"error": "arguments must be a JSON object"}

    try:
        if name == "record_unknown_question":
            return record_unknown_question(**arguments)
        if name == "record_user_details":
            return record_user_details(**arguments)
    except TypeError as exc:
        # Raised for missing or unexpected keyword arguments.
        return {"error": f"invalid arguments for {name}: {exc}"}

    return {"error": "function not found"}


def handle_tool_calls(tool_calls):
    """Run every function call found in a response's output items.

    Args:
        tool_calls: Iterable of response output items; only items whose
            ``type`` is ``"function_call"`` are executed, others are skipped.

    Returns:
        A list of ``function_call_output`` dicts (``type``, ``call_id``,
        JSON-encoded ``output``), one per function call, in input order.
    """
    results = []
    print(f"\n inside handle_tool_calls, number of items in response is : {len(tool_calls)} \n")
    for item in tool_calls:
        if item.type != "function_call":
            continue
        result = _run_function_call(item.name, item.arguments)
        results.append({"type": "function_call_output", "call_id": item.call_id ,"output": json.dumps(result)})

    return results
                
            

In [16]:
def _message_text(item):
    """Concatenate the text of a message item's content parts.

    Falls back to a part's ``refusal`` text when it has no ``text``, and
    returns "" for empty content instead of failing on ``content[0]``.
    """
    pieces = []
    for part in item.content or []:
        piece = getattr(part, "text", None) or getattr(part, "refusal", None)
        if piece:
            pieces.append(piece)
    return "".join(pieces)


def chat(message, history):
    """Gradio chat callback: answer ``message`` using the model and its tools.

    The model is called repeatedly. Function calls in a response are executed
    and their outputs fed back. The first response that carries a message and
    no function call provides the answer.

    Args:
        message: The user's latest message.
        history: Earlier turns as dicts with ``"role"`` and ``"content"`` keys.

    Returns:
        The assistant's reply text, or a short apology if no final answer was
        produced within the allowed number of model calls.
    """
    messages = [
        {"role": h["role"], "content": h["content"]}
        for h in history
    ]
    messages.append({"role": "user", "content": message})

    # Upper bound on model calls per user turn, so a model that keeps
    # requesting tools cannot loop (and bill) forever.
    max_rounds = 10

    for _ in range(max_rounds):
        response = openAI_client.responses.create(
            model="gpt-4o-mini",
            input = messages,
            tools = tools,
            instructions= system_prompt
        )

        # Add the model's output (tool calls and messages) to the transcript exactly once.
        messages.extend(response.output)
        print(f"\n inside chat(), number of items in response is : {len(response.output)} \n")
        print(response.output)

        has_function = any(item.type == "function_call" for item in response.output)

        if has_function:
            # Any message sent alongside a function call is only intermediate:
            # run the functions, feed back their outputs and ask again.
            for item in response.output:
                if item.type == "message":
                    print(f"Intermediate message before function: {_message_text(item)}")
            messages.extend(handle_tool_calls(response.output))
            continue

        for item in response.output:
            if item.type == "message":
                return _message_text(item)

        # Neither a message nor a function call (e.g. only a file_search
        # call): loop again so the model can produce its answer.

    return "Sorry, I couldn't complete that request. Please try again."

In [17]:
# Launch the Gradio chat UI with chat() as the callback.
# NOTE(review): type="messages" presumably makes Gradio pass history as
# role/content dicts, which is the shape chat() reads — confirm.
gr.ChatInterface(chat, type="messages").launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.





 inside chat(), number of items in response is : 1 

[ResponseOutputMessage(id='msg_010410332ab0ef5700695479bc051081919d66b51ba7f06322', content=[ResponseOutputText(annotations=[], text='Hello! My name is Ehsan Masnavi. How can I assist you today?', type='output_text', logprobs=[])], role='assistant', status='completed', type='message')]

 inside chat(), number of items in response is : 2 

[ResponseFileSearchToolCall(id='fs_0554a3b005056f0900695479df4bdc8192a94b9539d1196063', queries=["What is Ehsan Masnavi's professional background?", 'What is Ehsan Masnavi looking for as his next job?'], status='completed', type='file_search_call', results=None), ResponseOutputMessage(id='msg_0554a3b005056f0900695479e20d848192879785daf4e817a4', content=[ResponseOutputText(annotations=[AnnotationFileCitation(file_id='file-7E2QDoV17TfRLLDVYR8QmJ', filename='Profile.pdf', index=520, type='file_citation'), AnnotationFileCitation(file_id='file-TU4Y1NJarxjUji9EwA1PbP', filename='ProfileSummary.txt', inde