In [14]:
import os
import sys
import pandas as pd
from openai import OpenAI
import numpy as np

sys.path.append(os.path.abspath(".."))
from Feedback_agent.rubric_and_sample import IELTS_rubrics as rubric

In [17]:
# Read the sample-essay CSV once; the cells below work on its columns.
feedback_path = "./sample_essay.csv"
df = pd.read_csv(feedback_path, encoding='utf-8')

# Text columns: the writing question, the essay and the stored feedback.
topic, essay, feedback = df["topic"], df["essay"], df["feedback"]

# Grade columns, pulled out in the same order as they are used later.
predicted_grade, desired_grade, sample_grade = (
    df["predicted"],
    df["desired"],
    df["sample_score"],
)

In [18]:
def get_score_prompt_version(topic, essay):
    """Ask gpt-4o to grade one essay and return the model's raw reply.

    The writing question, the essay and the rubric sections
    (``BASIC_RUBRIC``, ``CRITERIA``, ``BAND_SCORE``) are interpolated into a
    single prompt, which is sent as the only message (role ``system``).

    Args:
        topic: The writing question shown to the student.
        essay: The student's essay text.

    Returns:
        The message content of the first completion choice, unparsed.
    """
    openai_client = OpenAI()
    prompt_text = f"""
    You are an IELTS writing section examiner. 
    Given the writing queston and the student essay, please grade the essay on a scale of 0 to 9 based on the IELTS Rubric and 0.5 intervals are allowed.

    Writing Question: {topic}
    Student Essay: {essay}

    Here is an IELTS rubric for your reference: 
    Rubric: 
    {rubric.BASIC_RUBRIC}
    {rubric.CRITERIA}
    {rubric.BAND_SCORE}

    Please output the score of the essay in the form of 'score of the essay'. Please output the score directly.
    """
    # The whole instruction travels as one system message; no user turn is sent.
    system_message = {"role": "system", "content": prompt_text}
    completion = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [19]:
def get_acc_with_tol(pred_list, truth_list, tol=0.5):
    """Return the fraction of predictions within ``tol`` of the reference grade.

    Grades may be numbers, numeric strings, or the literal label ``'<4'``.
    A pair involving ``'<4'`` counts as correct only when both sides are
    exactly ``'<4'``; every other pair is compared numerically.

    Args:
        pred_list: Iterable of predicted grades.
        truth_list: Sized iterable of reference grades; its length is the
            denominator.
        tol: Largest absolute difference that still counts as correct.
            Defaults to 0.5.

    Returns:
        ``correct / len(truth_list)`` as a float, or ``0.0`` when
        ``truth_list`` is empty.

    Raises:
        ValueError: If a grade other than ``'<4'`` cannot be parsed by
            ``float``.
    """
    total = len(truth_list)
    if total == 0:
        # Nothing to score: report 0.0 instead of dividing by zero.
        return 0.0

    correct = 0
    # zip stops at the shorter input; missing predictions simply never match.
    for pred, truth in zip(pred_list, truth_list):
        if pred == '<4' or truth == '<4':
            # '<4' has no numeric value, so it can only match itself.
            if pred == truth:
                correct += 1
        elif abs(float(pred) - float(truth)) <= tol:
            # A range check also accepts differences that are not an exact
            # multiple of 0.5 (e.g. 6.25 vs 6.0).
            correct += 1
    return correct / total


def get_acc_no_tol(pred_list, truth_list):
    """Return the fraction of predictions that equal the reference grade.

    Grades may be numbers, numeric strings, or the literal label ``'<4'``.
    A pair involving ``'<4'`` counts as correct only when both sides are
    exactly ``'<4'``; every other pair is compared after ``float``
    conversion, so ``'7'`` and ``7.0`` are equal.

    Args:
        pred_list: Iterable of predicted grades.
        truth_list: Sized iterable of reference grades; its length is the
            denominator.

    Returns:
        ``correct / len(truth_list)`` as a float, or ``0.0`` when
        ``truth_list`` is empty.

    Raises:
        ValueError: If a grade other than ``'<4'`` cannot be parsed by
            ``float``.
    """
    total = len(truth_list)
    if total == 0:
        # Nothing to score: report 0.0 instead of dividing by zero.
        return 0.0

    def _is_exact_match(pred, truth):
        # '<4' has no numeric value, so it can only match itself.
        if pred == '<4' or truth == '<4':
            return pred == truth
        return float(pred) == float(truth)

    # zip stops at the shorter input; missing predictions simply never match.
    correct = sum(
        1 for pred, truth in zip(pred_list, truth_list)
        if _is_exact_match(pred, truth)
    )
    return correct / total
        


In [20]:
# Reference grades for this run come from the dataset's "predicted" column.
truth_list = predicted_grade

# One grader call per (topic, essay) pair, kept in dataset order.
pred_list = [get_score_prompt_version(q, e) for q, e in zip(topic, essay)]

print(pred_list)
print(truth_list)

['5.0', '8.0', '6.0', '7.0', '7', '4.5', '6.0', '7.0', '5.5', '7', '7.5', '6.5', '5.0', '6.0']
0     5.0
1     5.5
2     5.5
3     5.0
4     7.5
5     6.5
6     4.5
7     7.0
8     6.0
9     7.0
10    7.5
11    7.0
12    6.0
13    5.5
Name: predicted, dtype: float64


In [23]:
def _print_accuracy_report(header, preds, truths):
    """Print ``header`` followed by the exact and 0.5-tolerance accuracies."""
    print(header)
    print(f"accuracy with no tolerace: {get_acc_no_tol(preds, truths)}")
    print(f"accuracy with 0.5 tolerace: {get_acc_with_tol(preds, truths)}\n")


# Model-predicted scores against the dataset's grades.
_print_accuracy_report(
    "==== For predicting essay score from the dataset ====",
    pred_list,
    truth_list,
)

# Sample-essay scores against the desired scores.
_print_accuracy_report(
    " ==== For predicting sample essay score from the desired score for essays in the dataset ====",
    sample_grade,
    desired_grade,
)


==== For predicting essay score from the dataset ====
accuracy with no tolerace: 0.2857142857142857
accuracy with 0.5 tolerace: 0.6428571428571429

 ==== For predicting sample essay score from the desired score for essays in the dataset ====
accuracy with no tolerace: 0.5714285714285714
accuracy with 0.5 tolerace: 1.0

