# AI Assignment Agent

This script evaluates how accurately generative AI models answer questions based on provided data. It uses the Google Gemini and OpenAI ChatGPT APIs to generate answers to a set of questions taken from a JSON file containing training data. The script processes each question, formats it according to specific instructions, and then compares the AI-generated answers to the actual answers within a defined tolerance level to determine correctness.

Key Features:
- Loads training data from a JSON file and iterates through a specified number of questions.
- Uses the Google Gemini and OpenAI APIs to generate answers.
- Formats the input for the AI model according to detailed instructions, ensuring the model receives all necessary context.
- Compares the AI-generated answers to the actual answers, taking into account a tolerance for numerical comparison.
- Updates and displays a plot in real-time to visualize the number of correct answers versus the number of questions asked using Plotly.

Requirements:
- Google Generative AI and OpenAI libraries
- Your own API keys, set as environment variables (`GOOGLE_API_KEY` and `OPENAI_API_KEY`)
- Plotly for visualization
- JSON for data handling
- os, fpdf, and re (regular expression) modules for file and string operations

The script aims to provide a quantitative assessment of the AI model's accuracy in a controlled experimental setup, with potential applications in evaluating AI models for research assistance or data analysis tasks.
It then automatically generates a report that analyzes and compares the accuracies of the two models. The results for each model are also printed separately from the report so that you can verify the report's accuracy.

In [3]:
# Import necessary libraries
import os
import json
import re
from fpdf import FPDF
import google.generativeai as genai
from openai import OpenAI
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.graph_objs import FigureWidget


# LLM API key configuration and client initialization.
# Both keys are read from environment variables.
genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
model_gemini = genai.GenerativeModel('gemini-1.5-flash')

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
model_openai = "gpt-4o"

# Load training data.
# The file is opened directly instead of being guarded by os.path.exists():
# a missing file now raises FileNotFoundError right here, rather than leaving
# `data` undefined and failing later with an unrelated NameError.
file_path = os.path.join(os.getcwd(), 'Data/train.json')
with open(file_path, encoding='utf-8') as file:
    data = json.load(file)

# Variables to control the number of questions and the tolerance used when
# comparing LLM responses with the actual answers
Questions_to_answer = 81  # Number of pre-text sections (some sections may have multiple questions)
tolerance = 0.04  # Relative tolerance; accounts for rounding differences in LLM responses

# Load the prompt used when the LLMs answer questions from the data set
with open(os.path.join(os.getcwd(), 'answer_prompt.txt'), 'r') as file:
    prompt = file.read()

# Load the prompt used when an LLM generates the report on the question-answering results
with open(os.path.join(os.getcwd(), 'report_prompt.txt'), 'r') as file:
    report_prompt = file.read()


In [None]:
################################################## Google Gemini Response ##################################################
# Result trackers for the Gemini run
question_details = []                 # per-question context strings collected by the main loop
google_wrong_answer_questions = []    # labels of the answers scored as wrong
google_correct_answer_questions = []  # labels of the answers scored as correct
google_quetion_nunber_tracker = []    # running correct-answer totals drawn by update_plot

# Live progress chart: a single line+marker trace on a 1x1 subplot grid.
# The trace starts empty and is filled in by update_plot as answers are scored.
_gemini_trace = go.Scatter(y=[], mode='lines+markers')
_gemini_layout = {
    "title": "Number of Correct Answers Vs Questions Asked",
    "xaxis_title": "Attempt",
    "yaxis_title": "Number of Correct Answers",
}
fig = FigureWidget(make_subplots(rows=1, cols=1))
fig.add_trace(_gemini_trace, row=1, col=1)
fig.update_layout(**_gemini_layout)
display(fig)  # display() renders the widget inside a Jupyter notebook

def update_plot(correct_answers):
    """Append the latest correct-answer total and refresh the Gemini chart.

    Args:
        correct_answers: Running number of correct answers after the most
            recently scored answer.

    The value is appended to the module-level ``google_quetion_nunber_tracker``
    list; the first trace of ``fig`` is then re-pointed at that list, with
    1-based attempt numbers on the x axis.
    """
    history = google_quetion_nunber_tracker
    history.append(correct_answers)

    trace = fig.data[0]
    trace.x = list(range(1, len(history) + 1))
    trace.y = history

# Main loop: send every section's questions to Gemini and score the replies.
number_pattern = re.compile(r"-?\d+(\.\d+)?")  # first signed integer/decimal in a string
suffix_letters = "abcdefghijklmnopqrstuvwxyz"   # letters used to label repeated question numbers

# Clamp to the size of the data set so a short file cannot cause an IndexError
for question_number in range(min(Questions_to_answer, len(data))):
    section = data[question_number]

    # Context shared by every question of this section
    keys = list(section.keys())
    input_message = "pre_text = " + str(section['pre_text']) + \
                    "\npost_text = " + str(section['post_text']) + \
                    "\ntable_ori = " + str(section['table_ori']) + \
                    "\ntable = " + str(section['table'])

    # Append the question(s) and collect the expected answers. A section holds
    # either a single 'qa' entry or numbered 'qa_<i>' entries.
    question_answers = ""
    if 'qa' in keys:
        input_message += "\nqa = " + str(section['qa']['question'])
        question_answers = str(section['qa']['answer']) + ", "
        # NOTE(review): single-question sections are not added to
        # question_details, unlike the numbered case below - confirm intended.
    else:
        # Visit the numbered entries in index order; an empty selection simply
        # yields no questions instead of crashing max() on an empty list.
        qa_keys = sorted((key for key in keys if key.startswith('qa_')),
                         key=lambda key: int(key.split('_')[1]))
        for question_key in qa_keys:
            input_message += "\n" + str(question_key) + " = " + str(section[question_key]['question'])
            question_answers += str(section[question_key]['answer']) + ", "
            question_details.append("QUESTION NUMBER INFORMATION" + str(question_number) + "\n" + input_message + "\nCorrect Answer" + str(section[question_key]['answer']) + "\n")

    # Nothing to ask for this section: skip the API call
    if not question_answers:
        continue

    # Google Gemini query and collect response
    assistant_response = str(model_gemini.generate_content([prompt, input_message]).text)

    # Both sides are comma-separated lists; blank fragments are dropped
    question_answers_list = [x.strip() for x in question_answers.split(',') if x.strip()]
    assistant_response_list = [x.strip() for x in assistant_response.split(',') if x.strip()]

    # Compare each answer within the tolerance and update the results plot.
    # zip() pairs answers positionally and stops at the shorter list.
    for actual, assistant in zip(question_answers_list, assistant_response_list):
        actual_match = number_pattern.search(actual)
        assistant_match = number_pattern.search(assistant)
        actual_value = float(actual_match.group()) if actual_match else None
        assistant_value = float(assistant_match.group()) if assistant_match else None

        # Pairs without a number on both sides are left unscored
        if actual_value is None or assistant_value is None:
            continue

        if actual_value == 0:
            # A relative error is undefined for an expected value of 0;
            # fall back to an absolute comparison instead of dividing by zero.
            is_correct = abs(assistant_value) <= tolerance
        else:
            is_correct = abs(actual_value - assistant_value) / abs(actual_value) <= tolerance

        # Label the answer with its 1-based section number. If that number is
        # already present in the target list, add a letter suffix: the 2nd
        # occurrence gets 'b', the 3rd 'c', ... so labels stay unique.
        base_question_str = str(question_number + 1)
        results = google_correct_answer_questions if is_correct else google_wrong_answer_questions
        repeats = sum(1 for label in results if label.rstrip(suffix_letters) == base_question_str)
        if repeats:
            results.append(f"{base_question_str}{chr(97 + repeats)}")
        else:
            results.append(base_question_str)
        update_plot(len(google_correct_answer_questions))

# Summarise final results to check LLM report results
print("Total correct answers: " + str(len(google_correct_answer_questions)))
print("Correct answer questions: " + str(google_correct_answer_questions))
print("Total wrong answers: " + str(len(google_wrong_answer_questions)))
print("Wrong answer questions: " + str(google_wrong_answer_questions))

In [None]:
################################################## OpenAI ChatGPT Response ##################################################
# Result trackers for the OpenAI run
question_details = []                 # reset: per-question context strings collected by the main loop
openai_wrong_answer_questions = []    # labels of the answers scored as wrong
openai_correct_answer_questions = []  # labels of the answers scored as correct
openai_quetion_nunber_tracker = []    # running correct-answer totals drawn by update_plot

# Live progress chart: a single line+marker trace on a 1x1 subplot grid.
# The trace starts empty and is filled in by update_plot as answers are scored.
_openai_trace = go.Scatter(y=[], mode='lines+markers')
_openai_layout = {
    "title": "Number of Correct Answers Vs Questions Asked",
    "xaxis_title": "Attempt",
    "yaxis_title": "Number of Correct Answers",
}
fig1 = FigureWidget(make_subplots(rows=1, cols=1))
fig1.add_trace(_openai_trace, row=1, col=1)
fig1.update_layout(**_openai_layout)
display(fig1)  # display() renders the widget inside a Jupyter notebook

def update_plot(correct_answers1):
    """Append the latest correct-answer total and refresh the OpenAI chart.

    Args:
        correct_answers1: Running number of correct answers after the most
            recently scored answer.

    The value is appended to the module-level ``openai_quetion_nunber_tracker``
    list; the first trace of ``fig1`` is then re-pointed at that list, with
    1-based attempt numbers on the x axis.
    """
    history = openai_quetion_nunber_tracker
    history.append(correct_answers1)

    trace = fig1.data[0]
    trace.x = list(range(1, len(history) + 1))
    trace.y = history

# Main loop: send every section's questions to OpenAI and score the replies.
number_pattern = re.compile(r"-?\d+(\.\d+)?")  # first signed integer/decimal in a string
suffix_letters = "abcdefghijklmnopqrstuvwxyz"   # letters used to label repeated question numbers

# Clamp to the size of the data set so a short file cannot cause an IndexError
for question_number in range(min(Questions_to_answer, len(data))):
    section = data[question_number]

    # Context shared by every question of this section
    keys = list(section.keys())
    input_message = "pre_text = " + str(section['pre_text']) + \
                    "\npost_text = " + str(section['post_text']) + \
                    "\ntable_ori = " + str(section['table_ori']) + \
                    "\ntable = " + str(section['table'])

    # Append the question(s) and collect the expected answers. A section holds
    # either a single 'qa' entry or numbered 'qa_<i>' entries.
    question_answers = ""
    if 'qa' in keys:
        input_message += "\nqa = " + str(section['qa']['question'])
        question_answers = str(section['qa']['answer']) + ", "
        # NOTE(review): single-question sections are not added to
        # question_details, unlike the numbered case below - confirm intended.
    else:
        # Visit the numbered entries in index order; an empty selection simply
        # yields no questions instead of crashing max() on an empty list.
        qa_keys = sorted((key for key in keys if key.startswith('qa_')),
                         key=lambda key: int(key.split('_')[1]))
        for question_key in qa_keys:
            input_message += "\n" + str(question_key) + " = " + str(section[question_key]['question'])
            question_answers += str(section[question_key]['answer']) + ", "
            question_details.append("QUESTION NUMBER INFORMATION" + str(question_number) + "\n" + input_message + "\nCorrect Answer" + str(section[question_key]['answer']) + "\n")

    # Nothing to ask for this section: skip the API call
    if not question_answers:
        continue

    # OpenAI query and collect response
    messages = [{"role": "system", "content": prompt}, {"role": "user", "content": input_message}]
    completion = client.chat.completions.create(model=model_openai, messages=messages)
    # message.content may be None (e.g. a refusal); treat that as an empty reply
    assistant_response = completion.choices[0].message.content or ""

    # Both sides are comma-separated lists; blank fragments are dropped
    question_answers_list = [x.strip() for x in question_answers.split(',') if x.strip()]
    assistant_response_list = [x.strip() for x in assistant_response.split(',') if x.strip()]

    # Compare each answer within the tolerance and update the results plot.
    # zip() pairs answers positionally and stops at the shorter list.
    for actual, assistant in zip(question_answers_list, assistant_response_list):
        actual_match = number_pattern.search(actual)
        assistant_match = number_pattern.search(assistant)
        actual_value = float(actual_match.group()) if actual_match else None
        assistant_value = float(assistant_match.group()) if assistant_match else None

        # Pairs without a number on both sides are left unscored
        if actual_value is None or assistant_value is None:
            continue

        if actual_value == 0:
            # A relative error is undefined for an expected value of 0;
            # fall back to an absolute comparison instead of dividing by zero.
            is_correct = abs(assistant_value) <= tolerance
        else:
            is_correct = abs(actual_value - assistant_value) / abs(actual_value) <= tolerance

        # Label the answer with its 1-based section number. If that number is
        # already present in the target list, add a letter suffix: the 2nd
        # occurrence gets 'b', the 3rd 'c', ... so labels stay unique.
        base_question_str = str(question_number + 1)
        results = openai_correct_answer_questions if is_correct else openai_wrong_answer_questions
        repeats = sum(1 for label in results if label.rstrip(suffix_letters) == base_question_str)
        if repeats:
            results.append(f"{base_question_str}{chr(97 + repeats)}")
        else:
            results.append(base_question_str)
        update_plot(len(openai_correct_answer_questions))

# Summarise final results to check LLM report results
print("Total correct answers: " + str(len(openai_correct_answer_questions)))
print("Correct answer questions: " + str(openai_correct_answer_questions))
print("Total wrong answers: " + str(len(openai_wrong_answer_questions)))
print("Wrong answer questions: " + str(openai_wrong_answer_questions))


# Auto Report Generation on LLM Performance

This section of the code generates a comprehensive report based on the results the Large Language Models (LLMs) produced when answering the questions. It includes the calculation of system accuracies, a detailed analysis of the findings, and an honest discussion of the systems' shortcomings. The report aims to provide insights into the LLMs' performance, highlighting areas of success and identifying opportunities for improvement.

The report is generated using an LLM.


In [None]:
# Assemble the raw results of both runs as the input for the report-writing LLM
findings_input_message = "****Google Gemini Performance Report****" + \
               "\n Correct answers number: " + str(len(google_correct_answer_questions)) + \
               "\nCorrect answer questions: " + str(google_correct_answer_questions) + \
               "\nWrong answers number : " + str(len(google_wrong_answer_questions)) + \
               "\nWrong answer questions: " + str(google_wrong_answer_questions) + \
               "\n" + \
               "****OpenAI Chatgot Performance Report****" + \
               "\n Correct answers number: " + str(len(openai_correct_answer_questions)) + \
               "\nCorrect answer questions: " + str(openai_correct_answer_questions) + \
               "\nWrong answers number : " + str(len(openai_wrong_answer_questions)) + \
               "\nWrong answer questions: " + str(openai_wrong_answer_questions) +  \
               "\n" + \
               "\nQuetion number information: " + str(question_details)

# Ask Gemini to write the report from the report prompt plus the findings
assistant_response = str(model_gemini.generate_content([report_prompt, findings_input_message]).text)

print(assistant_response)

# Save the LLM report response as a PDF file
pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=12)
pdf.cell(200, 10, txt="LLM Response Report", ln=True, align='C')
# The built-in Arial font can only encode latin-1 text. LLM output commonly
# contains characters outside that range (bullets, curly quotes, dashes), which
# would make the PDF export fail, so such characters are replaced with '?'.
pdf_safe_report = assistant_response.encode('latin-1', 'replace').decode('latin-1')
pdf.multi_cell(0, 10, pdf_safe_report)
pdf_file_path = "AI_Assignment_Report.pdf"
pdf.output(pdf_file_path)