# LLM Collaboration (GPT + Gemini, with Claude as the Round 3 judge)

In [None]:
# Per-model report keyed by model name, then by subject; each value names a
# skill. The whole dict is interpolated into the Round 3 judge prompt as
# "each LLM's weaknesses", so skill names are spelled identically across
# models (the GPT History entry previously differed only in capitalisation).
LLM_Report = {
  "GPT": {
    "History": "Continuity and change over time",
    "US_Government": "Analyzing evidence",
    "Physics": "Analyzing evidence",
    "Human_Geography": "Contextualization",
    "Environmental_Science": "Analyzing evidence"
  },
  "Gemini": {
    "History": "Continuity and change over time",
    "US_Government": "Analyzing evidence",
    "Physics": "Causation",
    "Human_Geography": "Causation",
    "Environmental_Science": "Analyzing evidence"
  },
  "Claude": {
    "History": "Analyzing evidence",
    "US_Government": "Contextualization",
    "Physics": "Analyzing evidence",
    "Human_Geography": "Causation",
    "Environmental_Science": "Analyzing evidence"
  }
}

In [None]:
import os
import csv
import time
import re
import pandas as pd

In [None]:
from openai import OpenAI
# OpenAI client used for the GPT-4 chat completions in Rounds 1 and 2.
# The API key is empty in this file; fill it in before running.
client = OpenAI(api_key = "")

In [None]:
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
# The API key is empty in this file; fill it in before running.
genai.configure(api_key="")
# Gemini model handle used by the Round 1 and Round 2 Gemini loops.
model = genai.GenerativeModel('gemini-pro')

In [None]:
import anthropic
# Anthropic client used by ask_claude_all_questions in Round 3.
# The API key is empty in this file; fill it in before running.
Claude_client = anthropic.Anthropic(api_key="")

# Import Test (question):
#### History Test = question[1]- question[55]
#### Government Test = question[1]- question[96]
#### Physics Test = question[1]- question[75]
#### Human Geo Test = question[1]- question[105]
#### Env Sci Test = question[1]- question[157]

In [None]:
subject = "history" # Subject label embedded in the output CSV filenames and Round 3 column headers; update it to match the test being run
def load_questions_from_folder(folder_path, last_question=55,
                               file_name_template="Question {number}.txt"):
    """Read numbered question files from a folder into a dict.

    Args:
        folder_path: Directory that holds one text file per question.
        last_question: Number of the last question to load (inclusive).
            Defaults to 55, the length of the History test.
        file_name_template: ``str.format`` template for each file name; it
            receives the question number as ``number``. The default has a
            space between "Question" and the number, as used for History.

    Returns:
        Dict mapping question number (int) to the file's text. Questions
        whose file is missing are reported on stdout and left out of the dict.
    """
    question = {}
    for number in range(1, last_question + 1):
        file_path = os.path.join(folder_path, file_name_template.format(number=number))
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                question[number] = file.read()
        except UnicodeDecodeError:
            # Not valid UTF-8: fall back to the Windows-1252 code page.
            with open(file_path, 'r', encoding='cp1252') as file:
                question[number] = file.read()
        except FileNotFoundError:
            print(f"File not found: {file_path}")

    return question

# Folder holding the question text files; empty in this file, set it before running.
folder_path = ""
# Dict of question number -> question text, indexed as question[i] when the prompts are built.
question = load_questions_from_folder(folder_path)

# Import Answers:

In [None]:
# Path to the answer-key text file; empty in this file, set it before running.
Answers_path = "" # update for the subject being run
try:
    # Try UTF-8 first.
    with open(Answers_path, 'r', encoding='utf-8') as file:
        history_answers = file.read() # rename for the subject being run
except UnicodeDecodeError:
    # Not valid UTF-8: re-read the same file as Windows-1252.
    with open(Answers_path, 'r', encoding='cp1252') as file:  
        history_answers = file.read() # rename for the subject being run

# Round 1:

## GPT-4

In [None]:
# Round 1, GPT-4: ask every question independently and keep the raw reply
# text, keyed by question number.
Round_1_GPT_answer = {}

for i in range(1, 56):  # upper bound is the last question number + 1
    prompt = f"Please answer the questions with the question number and your letter of choice. Please remove parentheses from your letter of choice. Please provide a confidence level for your letter choice from 0% being the least likely correct answer choice to 100% being the most likely correct answer choice. Please also include a 1 sentence explanation for your choice justification. Please answer according to the example format: '1. A 90% because...' Now, please answer question {i}: \n{question[i]}\n"
    output = client.chat.completions.create(
        model='gpt-4',
        messages=[{"role": "user", "content": prompt}],
    )
    reply = output.choices[0].message.content
    Round_1_GPT_answer[i] = reply
    print(reply)

print("All questions processed.")

## Preview GPT round 1:

In [None]:
print(Round_1_GPT_answer[55]) # Preview GPT-4's Round 1 reply to question 55 (the last History question)

## Gemini Pro

In [None]:
#low_temperature_setting = genai.GenerationConfig(temperature= 0.1)
# Set every listed harm category to BLOCK_NONE for the Gemini requests.
no_safety_settings = {
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
}
# Round 1, Gemini: raw reply text keyed by question number.
Round_1_Gemini_answer = {}

# Cap on back-to-back non-rate-limit failures for one question. Without a cap,
# a permanent error (e.g. a question that was never loaded) retries forever.
MAX_CONSECUTIVE_ERRORS = 5

i = 1  # Start from the first question
consecutive_errors = 0

while i <= 55:  # change based on the ACTUAL last question
    try:
        response = model.generate_content(
            contents=f"Please answer the questions with the question number and your letter of choice. Please remove parentheses from your letter of choice. Please provide a confidence level for your letter choice from 0% being the least likely correct choice to 100% being the most likely correct answer. Please also include a 1 sentence explanation for your choice justification. Please answer according to the example format: '1. A 90% because...' Now, please answer question {i}: \n{question[i]}\n",
            safety_settings= no_safety_settings
        )
        Round_1_Gemini_answer[i] = response.text
        print(response.text)
        i += 1  # Move to the next question only if successful
        consecutive_errors = 0
    except Exception as e:  # Catching a general exception
        error_message = str(e)
        if "429" in error_message or "rate limit" in error_message.lower():
            # Rate limiting is transient, so keep retrying the same question.
            print(f"Rate limit error on question {i}: {error_message} - Sleeping longer and retrying...")
            time.sleep(4)  # Sleep longer if rate limit error occurs
        else:
            consecutive_errors += 1
            if consecutive_errors >= MAX_CONSECUTIVE_ERRORS:
                print(f"Giving up on question {i} after {consecutive_errors} consecutive errors.")
                raise  # surface the underlying error instead of looping forever
            print(f"Error on question {i}: {error_message} - Retrying...")
    finally:
        time.sleep(3)  # Respect the API's request limit

print("All questions processed.")

## Preview Gemini Round 1:

In [None]:
print(Round_1_Gemini_answer[55]) # Preview Gemini's Round 1 reply to question 55 (the last History question)

## Round 1 CSV:

#### Agreement: (Don't Download; still run!)

In [None]:
def evaluate_llm_agreement(GPT_answers, Gemini_answers, last_question=55):
    """Report, per question, whether GPT and Gemini picked the same letter.

    Only the letter is compared, so "1. A", "1: A" and "1. (A)" all count as
    choice A. The letter must be a standalone token, which keeps the first
    letter of an ordinary word from being read as a choice.

    Args:
        GPT_answers: Dict mapping question number to GPT's reply text.
        Gemini_answers: Dict mapping question number to Gemini's reply text.
        last_question: Number of the last question to evaluate (inclusive).

    Returns:
        List of ``(question_number, agreement)`` tuples for questions
        1..last_question. ``agreement`` is 1 when both replies contain a
        choice and the letters match, otherwise 0. A missing reply counts
        as 0.
    """
    choice_pattern = re.compile(r'(\d+)\s*[.:]?\s*(?:\(([A-E])\)|([A-E])\b)', re.IGNORECASE)

    def first_choice(answer_text):
        # Upper-cased letter of the first "<number> <letter>" occurrence, or None.
        match = choice_pattern.search(answer_text or "")
        if match is None:
            return None
        return (match.group(2) or match.group(3)).upper()

    results = []
    for question_number in range(1, last_question + 1):
        gpt_choice = first_choice(GPT_answers.get(question_number))
        gemini_choice = first_choice(Gemini_answers.get(question_number))
        agreement = int(gpt_choice is not None and gpt_choice == gemini_choice)
        results.append((question_number, agreement))

    return results

# Per-question (question_number, agreement) tuples for the Round 1 answers.
results = evaluate_llm_agreement(Round_1_GPT_answer, Round_1_Gemini_answer)

def initialize_csv_file(filename):
    """Create (or overwrite) the agreement CSV and write its header row."""
    header = ["Question Number", "Agreement (0/1)", "GPT Choice", "Gemini Choice"]
    with open(filename, mode='w', newline='') as csv_file:
        csv.writer(csv_file).writerow(header)

def update_csv(filename, results, GPT_answers, Gemini_answers):
    """Append one agreement row per question to the CSV.

    Each choice is written in the normalised form "<number>. <LETTER>"
    (e.g. "1. A") whatever punctuation or parentheses the model used, so the
    grader can always parse the column. "N/A" is written when a reply has no
    recognisable choice or is missing.

    Args:
        filename: CSV file to append to (its header is written separately).
        results: Iterable of ``(question_number, agreement)`` tuples.
        GPT_answers: Dict mapping question number to GPT's reply text.
        Gemini_answers: Dict mapping question number to Gemini's reply text.
    """
    choice_pattern = re.compile(r'(\d+)\s*[.:]?\s*(?:\(([A-E])\)|([A-E])\b)', re.IGNORECASE)

    def format_choice(answer_text):
        # "<number>. <LETTER>" for the first choice found, else "N/A".
        match = choice_pattern.search(answer_text or "")
        if match is None:
            return "N/A"
        letter = (match.group(2) or match.group(3)).upper()
        return f"{match.group(1)}. {letter}"

    with open(filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        for question_number, correctness in results:
            gpt_choice = format_choice(GPT_answers.get(question_number))
            gemini_choice = format_choice(Gemini_answers.get(question_number))
            writer.writerow([question_number, correctness, gpt_choice, gemini_choice])

# Write the Round 1 agreement CSV: the header first, then one row per question.
filename = f"All3_round_1_{subject}_Agree.csv"
initialize_csv_file(filename)
update_csv(filename, results, Round_1_GPT_answer, Round_1_Gemini_answer)
print(f'Agreement CSV: {filename}')

#### Grader: (Put correctness in Master XLSX)

In [None]:
def parse_answer_key(answers):
    """Turn the answer-key text into ``{question_number: (answer, skill)}``.

    Every line is split on whitespace: the first token is the question number
    (kept as a string), the second the correct answer, and the remaining
    tokens, re-joined with single spaces, the skill. Lines with fewer than
    three tokens are ignored.
    """
    answer_key = {}
    for fields in map(str.split, answers.splitlines()):
        if len(fields) < 3:
            continue
        number, letter, *skill_words = fields
        answer_key[number] = (letter, ' '.join(skill_words))
    return answer_key

answers = history_answers # Answer-key text for the current subject; update when switching subjects

def grade_llm_answers(csv_filename, answer_key):
    """Grade an agreement CSV against the answer key and save the result.

    A question is marked correct (1) only when GPT and Gemini both gave a
    valid choice, the choices agree, and the shared choice equals the answer
    key (compared case-insensitively). Every other case is 0.

    Args:
        csv_filename: Agreement CSV with "Question Number", "GPT Choice" and
            "Gemini Choice" columns.
        answer_key: Dict mapping question number (as a string) to a
            ``(correct_answer, skill)`` tuple.

    Side effects:
        Writes the graded table to ``csv_filename`` with 'Agree.csv' replaced
        by 'graded.csv' and prints that file name.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_filename)

    # Create the output columns up front so their order is fixed and
    # 'Correctness' stays an integer column (0/1 rather than 0.0/1.0).
    df['Truth'] = ""
    df['Skill Assessed'] = ""
    df['Correctness'] = 0

    # Grading each response
    for index, row in df.iterrows():
        question_number = str(row['Question Number'])
        if question_number not in answer_key:
            df.at[index, 'Skill Assessed'] = "Question not in answer key"
            continue

        correct_answer, skill_assessed = answer_key[question_number]
        df.at[index, 'Truth'] = correct_answer
        df.at[index, 'Skill Assessed'] = skill_assessed

        gpt_choice = extract_choice(str(row['GPT Choice']))
        gemini_choice = extract_choice(str(row['Gemini Choice']))

        # Credit is given only when both models agree on a valid choice.
        if gpt_choice and gemini_choice and gpt_choice == gemini_choice:
            df.at[index, 'Correctness'] = int(gpt_choice == correct_answer.upper())

    # Save the graded DataFrame back to a new CSV file
    graded_filename = csv_filename.replace('Agree.csv', 'graded.csv')
    df.to_csv(graded_filename, index=False)
    print(f"Graded CSV: {graded_filename}")

def extract_choice(choice_text):
    """Return the upper-cased letter from a "<number> <letter>" choice string.

    Accepts every form the agreement CSV can contain: "1. C", "1: C", "1 C",
    "1.C" and "1. (C)". Returns None when the text does not start with such a
    choice (for example "N/A" or "nan").
    """
    match = re.match(r'\s*\d+\s*[.:]?\s*(?:\(([A-E])\)|([A-E])\b)', choice_text, re.IGNORECASE)
    if match is None:
        return None
    return (match.group(1) or match.group(2)).upper()

# Build the answer key and grade the Round 1 agreement CSV written above.
answer_key = parse_answer_key(answers)
csv_filename = filename # the Round 1 agreement CSV
grade_llm_answers(csv_filename, answer_key)

# Round 2:

In [None]:
# Round 2, GPT-4: re-ask every question, showing GPT both Round 1 replies
# (Gemini's and its own) and asking it to pick one of them.
Round_2_GPT_answer = {}

for i in range(1, 56):  # upper bound is the last question number + 1
    prompt = f"Please answer the questions with the question number and your letter of choice. Please remove parentheses from your letter of choice. Please provide a confidence level for your letter choice from 0% being the least likely correct answer choice to 100% being the most likely correct answer choice. Please also include a 1 sentence explanation for your choice justification. Please answer according to the example format: '1. A 90% because...' Now, please answer question {i}: \n{question[i]}\n based on Gemini's reponse:'{Round_1_Gemini_answer[i]}' \nand your own previous response:'{Round_1_GPT_answer[i]}'. \nLimit your choice to one of the previous responses"
    output = client.chat.completions.create(
        model='gpt-4',
        messages=[{"role": "user", "content": prompt}],
    )
    reply = output.choices[0].message.content
    Round_2_GPT_answer[i] = reply
    print(reply)

print("All questions processed.")

In [None]:
print(Round_2_GPT_answer[55]) # Preview GPT-4's Round 2 reply to question 55 (the last History question)

In [None]:
# Set every listed harm category to BLOCK_NONE for the Gemini requests.
no_safety_settings = {
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
}
# Round 2, Gemini: raw reply text keyed by question number.
Round_2_Gemini_answer = {}

# Cap on back-to-back non-rate-limit failures for one question. Without a cap,
# a permanent error (e.g. a missing Round 1 answer) retries forever.
MAX_CONSECUTIVE_ERRORS = 5

i = 1  # Start from the first question
consecutive_errors = 0

while i <= 55:  # change based on last question
    try:
        response = model.generate_content(
            contents=f"Please answer the questions with the question number and your letter of choice. Please remove parentheses from your letter of choice. Please provide a confidence level for your letter choice from 0% being the least likely correct answer choice to 100% being the most likely correct answer choice. Please also include a 1 sentence explanation for your choice justification. Please answer according to the example format: '1. A 90% because...' Now, please answer question {i}: \n{question[i]}\n based on your previous reponse:'{Round_1_Gemini_answer[i]}' \nand GPT's response:'{Round_1_GPT_answer[i]}'. \nLimit your choice to one of the previous responses",
            safety_settings= no_safety_settings
        )
        Round_2_Gemini_answer[i] = response.text
        print(response.text)
        i += 1  # Move to the next question only if successful
        consecutive_errors = 0
    except Exception as e:  # Catching a general exception
        error_message = str(e)
        if "429" in error_message or "rate limit" in error_message.lower():
            # Rate limiting is transient, so keep retrying the same question.
            print(f"Rate limit error on question {i}: {error_message} - Sleeping longer and retrying...")
            time.sleep(4)  # Sleep longer if rate limit error occurs
        else:
            consecutive_errors += 1
            if consecutive_errors >= MAX_CONSECUTIVE_ERRORS:
                print(f"Giving up on question {i} after {consecutive_errors} consecutive errors.")
                raise  # surface the underlying error instead of looping forever
            print(f"Error on question {i}: {error_message} - Retrying...")
    finally:
        time.sleep(3)  # Respect the API's request limit

print("All questions processed.")

In [None]:
print(Round_2_Gemini_answer[55]) # Preview Gemini's Round 2 reply to question 55 (the last History question)

## Round 2 CSV:

In [None]:
def evaluate_llm_agreement(GPT_answers, Gemini_answers, last_question=55):
    """Report, per question, whether GPT and Gemini picked the same letter.

    Only the letter is compared, so "1. A", "1: A" and "1. (A)" all count as
    choice A. The letter must be a standalone token, which keeps the first
    letter of an ordinary word from being read as a choice.

    Args:
        GPT_answers: Dict mapping question number to GPT's reply text.
        Gemini_answers: Dict mapping question number to Gemini's reply text.
        last_question: Number of the last question to evaluate (inclusive).

    Returns:
        List of ``(question_number, agreement)`` tuples for questions
        1..last_question. ``agreement`` is 1 when both replies contain a
        choice and the letters match, otherwise 0. A missing reply counts
        as 0.
    """
    choice_pattern = re.compile(r'(\d+)\s*[.:]?\s*(?:\(([A-E])\)|([A-E])\b)', re.IGNORECASE)

    def letter_of(answer_text):
        # Upper-cased letter of the first "<number> <letter>" occurrence, or None.
        found = choice_pattern.search(answer_text or "")
        if found is None:
            return None
        return (found.group(2) or found.group(3)).upper()

    results = []
    for question_number in range(1, last_question + 1):
        gpt_letter = letter_of(GPT_answers.get(question_number))
        gemini_letter = letter_of(Gemini_answers.get(question_number))
        same = gpt_letter is not None and gpt_letter == gemini_letter
        results.append((question_number, 1 if same else 0))

    return results

# Per-question (question_number, agreement) tuples for the Round 2 answers.
results = evaluate_llm_agreement(Round_2_GPT_answer, Round_2_Gemini_answer)

def initialize_csv_file(filename):
    """Start a fresh agreement CSV that contains only the header row."""
    csv_file = open(filename, mode='w', newline='')
    with csv_file:
        header_writer = csv.writer(csv_file)
        header_writer.writerow(["Question Number", "Agreement (0/1)", "GPT Choice", "Gemini Choice"])

def update_csv(filename, results, GPT_answers, Gemini_answers):
    """Append one agreement row per question to the CSV.

    Each choice is written in the normalised form "<number>. <LETTER>"
    (e.g. "1. A") whatever punctuation or parentheses the model used, so the
    grader can always parse the column. "N/A" is written when a reply has no
    recognisable choice or is missing.

    Args:
        filename: CSV file to append to (its header is written separately).
        results: Iterable of ``(question_number, agreement)`` tuples.
        GPT_answers: Dict mapping question number to GPT's reply text.
        Gemini_answers: Dict mapping question number to Gemini's reply text.
    """
    choice_pattern = re.compile(r'(\d+)\s*[.:]?\s*(?:\(([A-E])\)|([A-E])\b)', re.IGNORECASE)

    def normalised(answer_text):
        # "<number>. <LETTER>" for the first choice found, else "N/A".
        found = choice_pattern.search(answer_text or "")
        if found is None:
            return "N/A"
        letter = (found.group(2) or found.group(3)).upper()
        return f"{found.group(1)}. {letter}"

    with open(filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        for question_number, correctness in results:
            writer.writerow([
                question_number,
                correctness,
                normalised(GPT_answers.get(question_number)),
                normalised(Gemini_answers.get(question_number)),
            ])

# Write the Round 2 agreement CSV: the header first, then one row per question.
filename = f"GPT_Gemini_round_2_{subject}_Agree.csv"
initialize_csv_file(filename)
update_csv(filename, results, Round_2_GPT_answer, Round_2_Gemini_answer)
print(f'Agreement CSV: {filename}')

In [None]:
def parse_answer_key(answers):
    """Build ``{question_number: (answer, skill)}`` from the answer-key text.

    Each line is whitespace-split into question number, answer, and skill
    (all remaining tokens joined by single spaces). Lines with fewer than
    three tokens are skipped; question numbers stay strings.
    """
    token_rows = (line.split() for line in answers.splitlines())
    return {
        tokens[0]: (tokens[1], ' '.join(tokens[2:]))
        for tokens in token_rows
        if len(tokens) >= 3
    }

answers = history_answers # Answer-key text for the current subject; update when switching subjects

def grade_llm_answers(csv_filename, answer_key):
    """Grade an agreement CSV against the answer key and save the result.

    A question is marked correct (1) only when GPT and Gemini both gave a
    valid choice, the choices agree, and the shared choice equals the answer
    key (compared case-insensitively). Every other case is 0.

    Args:
        csv_filename: Agreement CSV with "Question Number", "GPT Choice" and
            "Gemini Choice" columns.
        answer_key: Dict mapping question number (as a string) to a
            ``(correct_answer, skill)`` tuple.

    Side effects:
        Writes the graded table to ``csv_filename`` with 'Agree.csv' replaced
        by 'graded.csv' and prints that file name.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_filename)

    # Create the output columns up front so their order is fixed and
    # 'Correctness' stays an integer column (0/1 rather than 0.0/1.0).
    df['Truth'] = ""
    df['Skill Assessed'] = ""
    df['Correctness'] = 0

    # Grading each response
    for index, row in df.iterrows():
        question_number = str(row['Question Number'])
        if question_number not in answer_key:
            df.at[index, 'Skill Assessed'] = "Question not in answer key"
            continue

        correct_answer, skill_assessed = answer_key[question_number]
        df.at[index, 'Truth'] = correct_answer
        df.at[index, 'Skill Assessed'] = skill_assessed

        gpt_choice = extract_choice(str(row['GPT Choice']))
        gemini_choice = extract_choice(str(row['Gemini Choice']))

        # Credit is given only when both models agree on a valid choice.
        if gpt_choice and gemini_choice and gpt_choice == gemini_choice:
            df.at[index, 'Correctness'] = int(gpt_choice == correct_answer.upper())

    # Save the graded DataFrame back to a new CSV file
    graded_filename = csv_filename.replace('Agree.csv', 'graded.csv')
    df.to_csv(graded_filename, index=False)
    print(f"Graded CSV: {graded_filename}")

def extract_choice(choice_text):
    """Return the upper-cased letter from a "<number> <letter>" choice string.

    Accepts every form the agreement CSV can contain: "1. C", "1: C", "1 C",
    "1.C" and "1. (C)". Returns None when the text does not start with such a
    choice (for example "N/A" or "nan").
    """
    found = re.match(r'\s*\d+\s*[.:]?\s*(?:\(([A-E])\)|([A-E])\b)', choice_text, re.IGNORECASE)
    if found is None:
        return None
    letter = found.group(1) or found.group(2)
    return letter.upper()

# Build the answer key and grade the Round 2 agreement CSV written above.
answer_key = parse_answer_key(answers)
csv_filename = filename # the Round 2 agreement CSV
grade_llm_answers(csv_filename, answer_key)

# Round 3:

In [None]:
def parse_skills_assessed(answers):
    """Map each question number (string) to the skill named on its key line.

    Lines are whitespace-split; the first token is the question number, the
    second token is skipped, and the remaining tokens, joined by single
    spaces, form the skill. Lines with fewer than three tokens are ignored.
    """
    skills_by_question = {}
    for raw_line in answers.splitlines():
        tokens = raw_line.split()
        if len(tokens) < 3:
            continue
        skills_by_question[tokens[0]] = ' '.join(tokens[2:])
    return skills_by_question

# Question number (string) -> skill, looked up per question by the Round 3 judge prompt.
# Uses `answers` as assigned in an earlier cell.
skills_assessed = parse_skills_assessed(answers)


def parse_answer_key(answers):
    """Parse the answer-key text into ``{question_number: (answer, skill)}``.

    A usable line has at least three whitespace-separated tokens: question
    number (kept as a string), correct answer, then the skill words, which
    are re-joined with single spaces. Shorter lines are dropped.
    """
    split_lines = [line.split() for line in answers.splitlines()]
    usable_lines = [tokens for tokens in split_lines if len(tokens) >= 3]

    answer_key = {}
    for tokens in usable_lines:
        skill = ' '.join(tokens[2:])
        answer_key[tokens[0]] = (tokens[1], skill)
    return answer_key

def extract_letter_choice(text):
    """Return the answer letter (A-E, upper-cased) found in a reply, or None.

    Patterns are tried from most to least specific so that ordinary words do
    not shadow the real answer:

    1. the requested "<number>. <letter>" format (any case; optional
       parentheses; "." or ":" or no punctuation),
    2. a standalone upper-case letter,
    3. a standalone letter of any case (the original behaviour), so that a
       bare "b" is still accepted.

    The ordering keeps the article "a" in a sentence such as
    "As a judge, I choose B" from being read as choice A.
    """
    numbered = re.search(r'\d+\s*[.:]?\s*\(?([A-E])\)?(?![A-Za-z])', text, re.IGNORECASE)
    if numbered:
        return numbered.group(1).upper()

    standalone_upper = re.search(r'\b([A-E])\b', text)
    if standalone_upper:
        return standalone_upper.group(1)

    match = re.search(r'\b([A-E])\b', text, re.IGNORECASE)
    return match.group(1).upper() if match else None

def initialize_csv_file(filename, subject):
    """Create (or overwrite) the Round 3 CSV with subject-prefixed headers."""
    header = [
        f"{subject}_Question Number",
        f"{subject}_Correctness",
        f"{subject}_Choice",
        f"{subject}_Skill Assessed",
    ]
    with open(filename, mode='w', newline='') as csv_file:
        csv.writer(csv_file).writerow(header)

def update_csv(filename, answers, subject, answer_key):
    """Append one graded row per question (1-55) to the Round 3 CSV.

    Each row holds "<subject>_<n>", a 0/1 correctness flag, the extracted
    letter choice, and the skill from the answer key. A question missing from
    ``answers`` is graded on an empty reply; a question missing from
    ``answer_key`` gets an empty expected answer and an empty skill.
    """
    no_key_entry = ('', '')
    with open(filename, mode='a', newline='') as csv_file:
        row_writer = csv.writer(csv_file)
        for i in range(1, 56):  # upper bound is the last question number + 1
            reply_text = answers[i] if i in answers else ""
            choice = extract_letter_choice(reply_text)
            key_entry = answer_key.get(str(i), no_key_entry)
            if key_entry[0] == choice:
                correct = 1
            else:
                correct = 0
            row_writer.writerow([f"{subject}_{i}", correct, choice, key_entry[1]])

def ask_claude_all_questions(subject, filename, answer_key):
    """Run Round 3: Claude judges the Round 2 answers for questions 1-55.

    Writes the CSV header, sends one prompt per question (question text,
    both Round 2 replies, the skill assessed, and LLM_Report), prints each
    reply, then appends the graded rows to ``filename``.
    """
    initialize_csv_file(filename, subject)
    claude_answers = {}

    for i in range(1, 56):  # upper bound is the last question number + 1
        prompt = f"Please answer the questions with only the question number and your letter of choice. Please remove parentheses from your letter of choice. Please answer according to the example format: '1. A' Now, please answer question {i}: \n{question[i]}\n based on Gemini's reponse:'{Round_2_Gemini_answer[i]}' \nand GPT's response:'{Round_2_GPT_answer[i]}'. The percentage provided for their responses is their confidence level from 0% being least likely to be correct and 100% being most likelt to be the correct answer. This question assess the skill of: {skills_assessed[str(i)]} \nYou are the judge and you can decide to choose which LLM's choice you want to choose. You can also decide to choose neither of their choices. Also decide based on each LLM's weaknesses:{LLM_Report}"
        message = Claude_client.messages.create(
            max_tokens=1000,
            messages=[{"role": "user", "content": prompt}],
            model="claude-3-opus-20240229"
        )
        reply = message.content[0].text
        claude_answers[i] = reply
        print(f"Question {i}: {reply}")

    update_csv(filename, claude_answers, subject, answer_key)
    print(f"Round 3 finalized. Results added to {filename}.")

# Run Round 3 for the current subject and write Claude's graded choices to this CSV.
filename = f"Claude_Round_3_{subject}_Round_3_Results.csv"
answers = history_answers  # Answer-key text for the current subject; update when switching subjects
answer_key = parse_answer_key(answers)
ask_claude_all_questions(subject, filename, answer_key)