In [1]:
from dotenv import load_dotenv
from openai import OpenAI
import gradio as gr

In [2]:
# Load settings from a local .env file before any API client is created.
# override=True asks python-dotenv to let .env values replace variables that are
# already set in the process environment.
# NOTE(review): presumably this is where the OpenAI API key comes from - confirm.
load_dotenv(override=True)

True

In [3]:
# Client used for the persona (agent) completions in chat, rerun and
# chat_with_evaluation. No arguments are passed, so configuration is left to
# the SDK defaults.
openai = OpenAI()

In [4]:
# Read the persona summary (UTF-8 text) that is embedded into both the agent's
# and the evaluator's system prompts below. The path is relative to the
# notebook's working directory.
with open("p2_summary.txt", "r", encoding="utf-8") as file:
    summary_text = file.read()

In [5]:
# Persona the agent impersonates; interpolated into both system prompts.
name = "Bill Gates"

# System prompt for the agent: stay in character, answer only questions about
# the persona's life, reply in English, and ground answers in summary_text.
# NOTE(review): the last instruction deliberately asks the model to give some
# wrong answers so that the evaluator has something to reject. Remove that
# sentence before using this prompt outside of evaluator testing.
system_prompt = f""" You are acting as {name}. Based on the following summary of your life, answer the questions posed by the user in a conversational manner, as if you were {name}.
If the question is not relevant to your life, politely inform the user that you can only answer questions related to your life. Also always stay in character as {name}. Answer should be in English. Give some wrong answer to test the evaluator.

Summary: {summary_text}
"""

In [6]:
def chat(message, history):
    """Answer one user turn in character, without any quality check.

    Args:
        message: Text of the latest user message.
        history: Earlier turns of the conversation as a list of chat-message
            dicts; it is placed between the system prompt and the new message.

    Returns:
        The content of the first choice returned by the chat completion.
    """
    # Conversation order: persona instructions, prior turns, then the new turn.
    conversation = [{"role": "system", "content": system_prompt}] + history
    conversation.append({"role": "user", "content": message})

    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [7]:
# gr.ChatInterface(chat).launch()

In [8]:
from pydantic import BaseModel

# Structured verdict produced by the evaluator model. The class is passed as
# response_format in evaluate(), so the field names and types below define the
# shape the model is asked to return.
# (Documented with comments rather than a docstring: a Pydantic model's
# docstring is included in its generated schema.)
class Evaluation(BaseModel):
    # True when the agent's reply is judged to meet the standards.
    is_acceptable: bool
    # Evaluator's explanation; fed back to the agent by rerun() on rejection.
    feedback: str

In [9]:
# System prompt for the evaluator model. It embeds the persona name and the
# full summary_text so the evaluator can compare the reply with the source.
# NOTE(review): "these criteria" refers only to the traits named in the prompt
# itself (professional, informative, in character); no separate list is given.
evaluator_system_prompt = f""" You are an expert evaluator. Your task is to assess the quality of responses generated by an AI model in a conversational setting.
The Agent is playing the role of {name} and representing their life based on the provided summary: {summary_text}.
The Agent has been instructed to be professional, informative, and to stay in character as {name}.
You will be provided with a conversation between a User and the Agent. Your task is to evaluate the Agent's final response.
Based on these criteria, provide a boolean evaluation (is_acceptable) indicating whether the response meets the standards, along with detailed feedback explaining your assessment.
"""

In [10]:
def evaluator_user_prompt(reply, message, history):
    """Assemble the user-turn prompt that is shown to the evaluator model.

    Args:
        reply: The agent's answer that should be judged.
        message: The latest user message the agent was answering.
        history: Earlier conversation; interpolated with its default string
            representation.

    Returns:
        A single string with the conversation, the latest message, the reply
        and the evaluation request, separated by blank lines.
    """
    sections = (
        f"Here's the conversation between the User and the Agent: {history}",
        f"Here's the latest message from the User: {message}",
        f"Here's the Agent's reply to evaluate: {reply}",
        "Please evaluate the Agent's reply based on the provided criteria.",
    )
    return "\n\n".join(sections)

In [11]:
# Separate client instance used only by evaluate() for the evaluator model.
evaluator = OpenAI()

In [12]:
def evaluate(reply, message, history) -> Evaluation:
    """Ask the evaluator model to judge the agent's latest reply.

    Args:
        reply: The agent's answer to be judged.
        message: The latest user message the agent was answering.
        history: Earlier conversation, forwarded to evaluator_user_prompt.

    Returns:
        The Evaluation parsed from the model's structured output. If the model
        returns no parsed object (for example because it refused), a rejecting
        Evaluation is returned instead of None, so callers can always read
        ``is_acceptable`` and ``feedback``.
    """
    user_content = evaluator_user_prompt(reply, message, history)
    response = evaluator.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": evaluator_system_prompt},
            {"role": "user", "content": user_content},
        ],
        response_format=Evaluation,
    )
    verdict_message = response.choices[0].message

    # ``parsed`` is optional in the SDK: it stays None when the model refuses
    # or otherwise produces no structured output. Fail closed so the caller's
    # retry loop keeps working instead of crashing on ``None.is_acceptable``.
    if verdict_message.parsed is None:
        reason = getattr(verdict_message, "refusal", None) or "no structured verdict was returned"
        return Evaluation(
            is_acceptable=False,
            feedback=f"The evaluator did not return a verdict: {reason}",
        )
    return verdict_message.parsed

In [13]:
# One-off smoke test: ask the persona a single question with no prior history
# and keep the answer in `reply` for the evaluation experiments below.
user_ques = 'Tell me about your early life?'

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_ques},
]

response = openai.chat.completions.create(model="gpt-4o-mini", messages=messages)
reply = response.choices[0].message.content

In [14]:
# reply

In [15]:
# evaluate(reply, user_ques, messages)

In [16]:
def rerun(reply, message, history, feedback):
    """Request a new answer after the evaluator rejected the previous one.

    The base system prompt is extended with the rejected answer and the
    evaluator's feedback, then the same conversation is sent again.

    Args:
        reply: The answer that was rejected.
        message: The latest user message being answered.
        history: Earlier turns of the conversation as a list of message dicts.
        feedback: The evaluator's explanation of why the answer was rejected.

    Returns:
        The content of the first choice of the new completion.
    """
    # Split over several literals for readability; the concatenated text is
    # exactly the note that gets appended to the system prompt.
    retry_note = (
        "\n\n Previous answer were rejected\n"
        " you just tried to reply, but the quality control rejected your answer."
        f" Your attempted answer: {reply} \n"
        f" The feedback from the evaluator was: {feedback}\n"
        " Please provide a better answer this time."
    )
    updated_system_prompt = system_prompt + retry_note

    conversation = [{"role": "system", "content": updated_system_prompt}] + history
    conversation.append({"role": "user", "content": message})

    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
    )
    return completion.choices[0].message.content

In [17]:
# print("Initial Reply:", reply)
# evaluation = evaluate(reply, user_ques, messages)
# print("Evaluation:", evaluation)

# max_attempts = 5
# attempts = 1

# while not evaluation.is_acceptable and attempts < max_attempts:
#     reply = rerun(reply, user_ques, messages, evaluation.feedback)
#     evaluation = evaluate(reply, user_ques, messages)
#     print(f"Rerun Evaluation (attempt {attempts + 1}):", evaluation)
#     attempts += 1

# if not evaluation.is_acceptable:
#     print("Final response was not acceptable after maximum attempts.")

# print("Final Reply:", reply)

In [18]:
def chat_with_evaluation(message, history):
    """Answer a user turn, retrying until the evaluator accepts the reply.

    A first draft is generated and evaluated. While the evaluator rejects it,
    a new draft is requested with the evaluator's feedback, up to five
    attempts in total (the first draft plus at most four reruns).

    Args:
        message: Text of the latest user message.
        history: Earlier turns of the conversation as a list of message dicts.

    Returns:
        The accepted reply, or a fixed apology string when no attempt was
        accepted.
    """
    # The first draft is the plain persona answer; chat() sends the same
    # request (system prompt + history + message, same model).
    candidate = chat(message, history)

    verdict = evaluate(candidate, message, history)
    print("Evaluation:", verdict)

    attempt_limit = 5
    # Attempt 1 was the draft above, so retries are numbered from 2.
    for attempt_number in range(2, attempt_limit + 1):
        if verdict.is_acceptable:
            break
        candidate = rerun(candidate, message, history, verdict.feedback)
        verdict = evaluate(candidate, message, history)
        print(f"Rerun Evaluation (attempt {attempt_number}):", verdict)

    if verdict.is_acceptable:
        print("Response accepted by evaluator.")
        return candidate
    return "I'm sorry, I couldn't generate an acceptable response."

In [19]:
# Start the Gradio chat UI; every user turn is routed through
# chat_with_evaluation, so replies are checked by the evaluator before display.
gr.ChatInterface(chat_with_evaluation).launch()

* Running on local URL:  http://127.0.0.1:7887
* To create a public link, set `share=True` in `launch()`.




Evaluation: is_acceptable=True feedback="The Agent's response is professional and informative, successfully staying in character as Bill Gates. The reply opens with a friendly greeting, aligns with the request for a connection, and provides a brief overview of Gates' professional journey, which is appropriate given the User's ambiguous greeting. The response ends with an invitation to continue the conversation, encouraging engagement. Overall, it meets the standards for an effective reply in a conversational setting."
Response accepted by evaluator.
Evaluation: is_acceptable=True feedback="The Agent's response is acceptable as it maintains professionalism and stays in character as Bill Gates. It clearly communicates a willingness to engage with the User while setting appropriate boundaries regarding the types of questions it can answer. The Agent invites further inquiries specific to its experiences, aligning well with the context of the conversation."
Response accepted by evaluator.
E