In [1]:
import os
import sys
import pandas as pd
from openai import OpenAI
import numpy as np
import json
from tqdm import tqdm
from datasets import DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
import faiss
import random

sys.path.append(os.path.abspath(".."))
from Feedback_agent.rubric_and_sample import IELTS_rubrics as rubric

client = OpenAI()

In [2]:
feedback_path = "./sample_essay.csv"
df = pd.read_csv(feedback_path, encoding='utf-8')

topic = df["topic"]
essay = df["essay"]
feedback = df["feedback"]
predicted_grade = df["predicted"]
desired_grade = df["desired"]
sample_grade = df["sample_score"]

In [None]:
def get_score_prompt_version(topic, essay):
    client = OpenAI()
    grader_prompt = f"""
    You are an IELTS writing section examiner. 
    Given the writing queston and the student essay, please grade the essay on a scale of 0 to 9 based on the IELTS Rubric and 0.5 intervals are allowed.

    Writing Question: {topic}
    Student Essay: {essay}

    Here is an IELTS rubric for your reference: 
    Rubric: 
    {rubric.BASIC_RUBRIC}
    {rubric.CRITERIA}
    {rubric.BAND_SCORE}

    Please output the score of the essay in the form of 'score of the essay'. Please output the score directly.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": grader_prompt}
        ]
    )
    return response.choices[0].message.content

In [4]:
def generate_embedding(text, model="text-embedding-ada-002"):
    try:
        response = client.embeddings.create(input=text, model=model)
        return response.data[0].embedding
    except Exception as e:
        print(f"Error generating embedding")
        return None

In [5]:
index = faiss.read_index("../RAG/faiss_index_train.bin")
with open("../RAG/embeddings_dataset_train.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

In [6]:
def search_cosine_similarity(query_text, top_k=3):
    
    query_embedding = generate_embedding(query_text)
    if query_embedding is None:
        return []
    
    
    query_embedding_np = np.array([query_embedding], dtype=np.float32)
    faiss.normalize_L2(query_embedding_np)
    
    
    distances, indices = index.search(query_embedding_np, top_k)
    
    
    results = []
    for i, idx in enumerate(indices[0]):
        result = metadata[idx]
        result["similarity"] = distances[0][i]  
        results.append(result)
    return results


In [7]:
label_map = {
    0: '<4',
    1: 4.0,
    2: 4.5,
    3: 5.0,
    4: 5.5,
    5: 6.0,
    6: 6.5,
    7: 7.0,
    8: 7.5,
    9: 8.0,
    10: 8.5,
    11: 9.0
}

In [None]:
def get_score_prompt_version_RAG(topic, essay):
    
    results = search_cosine_similarity(f"Prompt: {topic}\nEssay: {essay}", top_k=3)

    grader_prompt = f"""
    You are an IELTS writing section examiner. 
    Given the writing queston and the student essay, please grade the essay on a scale of 0 to 9 based on the IELTS Rubric and 0.5 intervals are allowed.

    Writing Question: {topic}
    Student Essay: {essay}
    \n
    Here is an IELTS rubric for your reference: 
    Rubric: 
    {rubric.BASIC_RUBRIC}
    {rubric.CRITERIA}
    {rubric.BAND_SCORE}
    \n
    Here are some similar essays along with their scores for your reference, you can consider these scores as correct for sure:
    Essay 1: Writing Question: {results[0]['prompt']} Essay:  {results[0]['essay']} Score: {label_map[float(results[0]['label'])]}
    \n
    Essay 2: Writing Question: {results[1]['prompt']} Essay:  {results[1]['essay']} Score: {label_map[float(results[1]['label'])]}
    \n
    Essay 3: Writing Question: {results[2]['prompt']} Essay:  {results[2]['essay']} Score: {label_map[float(results[2]['label'])]}
    \n
    If there is very low co-relation between the question and the essay, please output 0 for score.
    Do not try to simply give an average score from your experience, for example, producing 5 or 6 or 7 for many essays.
    Please output the score of the essay in the form of 'score of the essay'. Please output the score directly. 
    """
    response = client.chat.completions.create(
        model="o1-preview",
        messages=[
            {"role": "user", "content": grader_prompt}
        ]
    )
    return response.choices[0].message.content

In [10]:
def get_acc_with_tol(pred_list, truth_list):
    total = len(truth_list)
    correct = 0
    for (pred, truth) in zip(pred_list, truth_list):
        try:
            if pred == '<4' or truth == '<4':
                if pred == truth:
                    correct += 1
            elif float(pred) == float(truth) or abs(float(pred) - float(truth)) == 0.5:
                correct += 1
        except:
            total -= 1
            print(f"Conversion Error, Skipping pred: {pred}, truth: {truth}")
            continue
    return correct / total


def get_acc_no_tol(pred_list, truth_list):
    total = len(truth_list)
    correct = 0
    for (pred, truth) in zip(pred_list, truth_list):
        try:
            if pred == '<4' or truth == '<4':
                if pred == truth:
                    correct += 1
            elif float(pred) == float(truth):
                correct += 1
        except:
            total -= 1
            print(f"Conversion Error, Skipping pred: {pred}, truth: {truth}")
            continue
    return correct / total
        


In [12]:
truth_list = predicted_grade
pred_list = []

for (q,e) in zip(topic, essay):
    pred_list.append(get_score_prompt_version_RAG(q, e))

print(pred_list)
print(truth_list)

KeyboardInterrupt: 

In [38]:
truth_list = []
pred_list = []
cnt = 0
with open("../RAG/embeddings_dataset_test.json", "r", encoding="utf-8") as f:
    metadata_test = json.load(f)

for i in range(100,120):
    curr_topic = metadata_test[i]['prompt']
    curr_essay = metadata_test[i]['essay']
    truth_list.append(label_map[float(metadata_test[i]['label'])])
    pred_list.append(get_score_prompt_version_RAG(curr_topic, curr_essay))
    cnt += 1
    if cnt == 20:
        break

print(pred_list)
print(truth_list)

['6.0', '5.0', '6.5', '7.5', '5.0', '6.5', '6.0', '7.0', '6.5', '6.5', '7.0', '6.5', '7.0', '6.5', 'Score of the essay: 4.0', '6.0', '5.0', '6.0', '8.0', '6.0']
[6.0, '<4', 6.5, 7.0, 4.0, 7.0, 7.0, 7.5, '<4', 7.5, 7.0, 5.0, 7.0, 7.5, 5.0, 9.0, 5.5, 7.5, 6.5, 5.5]


In [39]:
print("==== For predicting essay score from the dataset ====")
print(f"accuracy with no tolerace: {get_acc_no_tol(pred_list, truth_list)}")
print(f"accuracy with 0.5 tolerace: {get_acc_with_tol(pred_list, truth_list)}\n")

print(" ==== For predicting sample essay score from the desired score for essays in the dataset ====")
print(f"accuracy with no tolerace: {get_acc_no_tol(sample_grade, desired_grade)}")
print(f"accuracy with 0.5 tolerace: {get_acc_with_tol(sample_grade, desired_grade)}\n")


==== For predicting essay score from the dataset ====
Conversion Error, Skipping pred: Score of the essay: 4.0, truth: 5.0
accuracy with no tolerace: 0.2
Conversion Error, Skipping pred: Score of the essay: 4.0, truth: 5.0
accuracy with 0.5 tolerace: 0.45

 ==== For predicting sample essay score from the desired score for essays in the dataset ====
accuracy with no tolerace: 0.5714285714285714
accuracy with 0.5 tolerace: 1.0



In [40]:
truth_list = []
pred_list = []
cnt = 0
with open("../RAG/embeddings_dataset_test.json", "r", encoding="utf-8") as f:
    metadata_test = json.load(f)

for i in range(100,120):
    curr_topic = metadata_test[i]['prompt']
    curr_essay = metadata_test[i]['essay']
    truth_list.append(label_map[float(metadata_test[i]['label'])])
    pred_list.append(get_score_prompt_version(curr_topic, curr_essay))
    cnt += 1
    if cnt == 20:
        break

print(pred_list)
print(truth_list)

['6.0', '5.5', '6.5', '7.5', '5.5', '6.5', '5.5', '7.5', '7.5', '6.5', '7', '6.0', '8.0', '6.5', '3.5', '6.5', '5.5', '7.0', '8.5', '5.5']
[6.0, '<4', 6.5, 7.0, 4.0, 7.0, 7.0, 7.5, '<4', 7.5, 7.0, 5.0, 7.0, 7.5, 5.0, 9.0, 5.5, 7.5, 6.5, 5.5]


In [41]:
print("==== For predicting essay score from the dataset ====")
print(f"accuracy with no tolerace: {get_acc_no_tol(pred_list, truth_list)}")
print(f"accuracy with 0.5 tolerace: {get_acc_with_tol(pred_list, truth_list)}\n")

print(" ==== For predicting sample essay score from the desired score for essays in the dataset ====")
print(f"accuracy with no tolerace: {get_acc_no_tol(sample_grade, desired_grade)}")
print(f"accuracy with 0.5 tolerace: {get_acc_with_tol(sample_grade, desired_grade)}\n")

==== For predicting essay score from the dataset ====
accuracy with no tolerace: 0.3
accuracy with 0.5 tolerace: 0.45

 ==== For predicting sample essay score from the desired score for essays in the dataset ====
accuracy with no tolerace: 0.5714285714285714
accuracy with 0.5 tolerace: 1.0



test again


In [11]:
seed = 42
random.seed(seed)

random_numbers = random.sample(range(121), 20)  

print("Randomly selected numbers:", random_numbers)

Randomly selected numbers: [81, 14, 3, 94, 35, 31, 28, 17, 13, 86, 114, 69, 11, 75, 54, 4, 27, 29, 64, 77]


In [14]:
truth_list = []
pred_list = []
cnt = 0
with open("../RAG/embeddings_dataset_test.json", "r", encoding="utf-8") as f:
    metadata_test = json.load(f)

for i in random_numbers:
    curr_topic = metadata_test[i]['prompt']
    curr_essay = metadata_test[i]['essay']
    truth_list.append(label_map[float(metadata_test[i]['label'])])
    pred_list.append(get_score_prompt_version_RAG(curr_topic, curr_essay))
    cnt += 1
    if cnt == 20:
        break

print(pred_list)
print(truth_list)

['5.5', '6.0', '6.5', '5.0', '8.0', '0', '0', '5.0', '5.5', '7.0', '4.0', '5.0', '6.0', '5.0', '6.0', '6.0', '6.5', '6.0', '4.0', '5.5']
[6.5, 5.0, 9.0, '<4', 8.0, '<4', 6.5, 5.0, 7.5, 6.5, 5.0, 7.0, 6.0, 6.0, 5.0, 6.5, 7.0, 5.0, 4.0, 7.0]


In [16]:
print("==== For predicting essay score from the dataset ====")
print(f"accuracy with no tolerace: {get_acc_no_tol(pred_list, truth_list)}")
print(f"accuracy with 0.5 tolerace: {get_acc_with_tol(pred_list, truth_list)}\n")

==== For predicting essay score from the dataset ====
accuracy with no tolerace: 0.2
accuracy with 0.5 tolerace: 0.35



In [31]:
truth_list = []
pred_list = []
cnt = 0
with open("../RAG/embeddings_dataset_test.json", "r", encoding="utf-8") as f:
    metadata_test = json.load(f)

for i in random_numbers:
    curr_topic = metadata_test[i]['prompt']
    curr_essay = metadata_test[i]['essay']
    truth_list.append(label_map[float(metadata_test[i]['label'])])
    pred_list.append(get_score_prompt_version(curr_topic, curr_essay))
    cnt += 1
    if cnt == 20:
        break

print(pred_list)
print(truth_list)

['6.0', '6.0', '7.5', '6', '7.5', '0', '0', '5.5', '5.5', '7.0', '3.5', '5.5', '6.5', '5.5', '6.0', '6.0', '6.5', '6.5', '5.0', '6.0']
[6.5, 5.0, 9.0, '<4', 8.0, '<4', 6.5, 5.0, 7.5, 6.5, 5.0, 7.0, 6.0, 6.0, 5.0, 6.5, 7.0, 5.0, 4.0, 7.0]


In [26]:
print("==== For predicting essay score from the dataset ====")
print(f"accuracy with no tolerace: {get_acc_no_tol(pred_list, truth_list)}")
print(f"accuracy with 0.5 tolerace: {get_acc_with_tol(pred_list, truth_list)}\n")

==== For predicting essay score from the dataset ====
accuracy with no tolerace: 0.1
accuracy with 0.5 tolerace: 0.45

