In [None]:
import os
import sys
import pandas as pd
from openai import OpenAI
import numpy as np
import json
from tqdm import tqdm
from datasets import DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
import faiss
import random

sys.path.append(os.path.abspath(".."))
from Feedback_agent.rubric_and_sample import IELTS_rubrics as rubric

client = OpenAI()

feedback_path = "./sample_essay.csv"
df = pd.read_csv(feedback_path, encoding='utf-8')

topic = df["topic"]
essay = df["essay"]
feedback = df["feedback"]
predicted_grade = df["predicted"]
desired_grade = df["desired"]
sample_grade = df["sample_score"]

def generate_embedding(text, model="text-embedding-ada-002"):
    try:
        response = client.embeddings.create(input=text, model=model)
        return response.data[0].embedding
    except Exception as e:
        print(f"Error generating embedding")
        return None


label_map = {
    0: '<4',
    1: 4.0,
    2: 4.5,
    3: 5.0,
    4: 5.5,
    5: 6.0,
    6: 6.5,
    7: 7.0,
    8: 7.5,
    9: 8.0,
    10: 8.5,
    11: 9.0
}


def search_combined_similarity(query_prompt, query_essay,  
                                topic_weight, essay_weight, top_k):
    
    essay_index = faiss.read_index("../RAG/faiss_index_train_essay.bin")
    prompt_index = faiss.read_index("../RAG/faiss_index_train_topics.bin")
    with open("../RAG/embeddings_dataset_train_essay.json", "r", encoding="utf-8") as f:
        essay_metadata = json.load(f)
    with open("../RAG/embeddings_dataset_train_topics.json", "r", encoding="utf-8") as f:
        prompt_metadata = json.load(f)


    
    query_prompt_embedding = generate_embedding(query_prompt)
    query_essay_embedding = generate_embedding(query_essay)
    
    if query_prompt_embedding is None or query_essay_embedding is None:
        return []
    
    
    query_prompt_embedding_np = np.array([query_prompt_embedding], dtype=np.float32)
    query_essay_embedding_np = np.array([query_essay_embedding], dtype=np.float32)
    faiss.normalize_L2(query_prompt_embedding_np)
    faiss.normalize_L2(query_essay_embedding_np)
    
    
    prompt_distances, prompt_indices = prompt_index.search(query_prompt_embedding_np, len(prompt_metadata))
    essay_distances, essay_indices = essay_index.search(query_essay_embedding_np, len(essay_metadata))

    
    # resort according to the original indices 0,1,...
    prompt_sort_indices = np.argsort(prompt_indices[0])
    prompt_indices_sorted = prompt_indices[0][prompt_sort_indices]
    prompt_distances_sorted = prompt_distances[0][prompt_sort_indices]
    

    
    essay_sort_indices = np.argsort(essay_indices[0])
    essay_indices_sorted = essay_indices[0][essay_sort_indices]
    essay_distances_sorted = essay_distances[0][essay_sort_indices]

    #(index, pormpt_distance, essay_distance)
    prompt_essay_pair = list(zip(essay_indices_sorted,prompt_distances_sorted,essay_distances_sorted))

    sorted_prompt_essay_pair = sorted(prompt_essay_pair, key=lambda x: topic_weight * x[1] + essay_weight * x[2], reverse=True)[0:top_k]

    
    

    results = []
    for i, x in enumerate(sorted_prompt_essay_pair):
        idx = x[0]
        sim = topic_weight * x[1] + essay_weight * x[2]
        result = essay_metadata[idx]
        result["similarity"] = sim 
        results.append(result)
    return results
    




def get_score_prompt_version_RAG(topic, essay,topic_weight, essay_weight, top_k):
    
    results = search_combined_similarity(topic,essay, topic_weight=topic_weight, essay_weight=essay_weight,top_k=top_k)

    grader_prompt = f"""
    You are an IELTS writing section examiner, tasked with evaluating essays strictly based on the official IELTS writing rubric.

    **Writing Question**: {topic}
    **Student Essay**: {essay}
    \n
    **Reference IELTS Rubric**:
    {rubric.BASIC_RUBRIC}
    {rubric.CRITERIA}
    {rubric.BAND_SCORE}
    \n
    **Reference Essays for Consistency**:
    - Example 1: Writing Question: {results[0]['prompt']} Essay: {results[0]['essay']} Score: {label_map[float(results[0]['label'])]}
    \n
    - Example 2: Writing Question: {results[1]['prompt']} Essay: {results[1]['essay']} Score: {label_map[float(results[1]['label'])]}
    \n
    - Example 3: Writing Question: {results[2]['prompt']} Essay: {results[2]['essay']} Score: {label_map[float(results[2]['label'])]}
    \n
    **Guidelines for Scoring**:
    - Assign scores in 0.5 intervals from 0 to 9 based on the IELTS rubric.
    - If the essay content is irrelevant or off-topic, assign a score of 0.
    - Avoid generic scores like 5, 6, or 7 unless the essay fully justifies such a rating.
    - Use the provided example essays and their scores to guide your grading and ensure consistency.

    **Enhanced Scoring Process**:
    - Generate multiple scores for the same essay by slightly varying the context or examples provided (e.g., shuffle or modify reference essays where appropriate).
    - Compute the average of these scores to improve reliability.
    - If scores vary significantly, consider revisiting the rubric alignment for the given essay.

    **Final Output**:
    - Output only the final averaged score directly, only the score number, but if the score is smaller than 4, output only the '<4'. Don't be too strict on the score, be more flexible.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": grader_prompt}
        ]
    )
    return response.choices[0].message.content


def get_acc_with_tol(pred_list, truth_list):
    total = len(truth_list)
    correct = 0
    for (pred, truth) in zip(pred_list, truth_list):
        if '<4' in pred:
            pred = "<4"
        elif float(pred) <= 4:
            pred = '<4'
        try:
            if pred == '<4' or truth == '<4':
                if pred == truth:
                    correct += 1 
            elif float(pred) == float(truth) or abs(float(pred) - float(truth)) == 0.5:
            #elif float(pred) == float(truth) or abs(float(pred) - float(truth)) <= 1.0:
                correct += 1
        except:
            total -= 1
            print(f"Conversion Error, Skipping pred: {pred}, truth: {truth}")
            continue
    return correct / total


def get_acc_no_tol(pred_list, truth_list):
    total = len(truth_list)
    correct = 0
    for (pred, truth) in zip(pred_list, truth_list):
        if '<4' in pred:
            pred = "<4"
        elif float(pred) < 4:
            pred = '<4'
        try:
            if pred == '<4' or truth == '<4':
                if pred == truth:
                    correct += 1
            elif float(pred) == float(truth):
                correct += 1
        except:
            total -= 1
            print(f"Conversion Error, Skipping pred: {pred}, truth: {truth}")
            continue
    return correct / total
        


In [2]:
topic =""""prompt": "It is more important to spend public money on promoting a healthy lifestyle in order to prevent illness than to spend it on treatment of people who are already ill. To what extent do you agree or disagree?"
"""
essay = """"essay": "In this day and age, more and more contemporary attention has been placed on the opinion that allocating public money for enhancing a robust lifestyle aiming at staving diseases off is more necessary than allocating it for the treatment of people who are already ill. From my perspective, I subscribe to the former idea. \r\n\r\nTo begin with, by virtue of having a healthy lifestyle, people may reduce medical expenses, saving a great deal of money, therefore, investing public money in promoting a robust one is appropriate and accurate. For instance, if public money is spent on raising inhabitants’ awareness of the adverse health effects of smoking, they will be inclined to avoid using cigarettes and thanks to this, they will be less susceptible to lung cancer, so they can cut down on medical expenses. \r\n\r\nIn addition, the best method to prevent any illness is to develop a healthy lifestyle. This can be explained by the reason that when inhabitants may improve their well-being or have a good physique, they might build up immunity against many diseases, leading to the possibility that the risks of suffering from many illnesses will be lessened. For instance, the Japanese government has invested public money in running health campaigns to incentivize their residents to have a healthy diet. Thus, nowadays, Japanese people seldom develop diseases that can put them in life-threatening situations. \r\n\r\nIn conclusion, with all the reasons mentioned above, I strongly believe that it is better to spend public money on promoting a robust lifestyle in order to prevent illnesses."
"""
results = search_combined_similarity(topic,essay, topic_weight=0.3, essay_weight=0.7,top_k=3)

In [3]:
for i, result in enumerate(results):
    print(f"Result {i+1}:")
    print(f"  Prompt: {result['prompt']}")
    print(f"  Essay: {result['essay']}")
    print(f"  Label: {result['label']}")
    print(f"  Similarity: {result['similarity']}")
    print()

Result 1:
  Prompt: It is more important to spend public money on promoting a healthy lifestyle in order to prevent illness than to spend it on treatment of people who are already ill. To what extent do you agree or disagree?
  Essay: In this day and age, more and more contemporary attention has been placed on the opinion that allocating public money for enhancing a robust lifestyle aiming at staving diseases off is more necessary than allocating it for the treatment of people who are already ill. From my perspective, I subscribe to the former idea. 

To begin with, by virtue of having a healthy lifestyle, people may reduce medical expenses, saving a great deal of money, therefore, investing public money in promoting a robust one is appropriate and accurate. For instance, if public money is spent on raising inhabitants’ awareness of the adverse health effects of smoking, they will be inclined to avoid using cigarettes and thanks to this, they will be less susceptible to lung cancer, so

t-0.4,e-0.6,k-3

In [2]:
truth_list = []
pred_list = []
cnt = 0
with open("../RAG/manual_embedding_dataset_test.json", "r", encoding="utf-8") as f:
    metadata_test = json.load(f)

for i in range(30):
    curr_topic = metadata_test[i]['prompt']
    curr_essay = metadata_test[i]['essay']
    truth_list.append(label_map[float(metadata_test[i]['label'])])
    pred_list.append(get_score_prompt_version_RAG(curr_topic, curr_essay))
    cnt += 1
    #if cnt == 20:
       # break

print(pred_list)
print(truth_list)

['7', '6.0', '6.5', '6.0', '6.0', '5.0', '6.5', '6.5', '6.0', '7.0', '7.0', '5.5', '<4', '<4', '6.5', '5', '6.5', '6.0', '6.0', '6.5', '6.0', '7.0', '<4>', '5.5', '7.5', '6.5', '7.0', '5.0', '8.0', '<4>']
[7.5, 7.0, 9.0, 6.5, 6.5, 5.0, 6.5, 6.0, '<4', 8.0, 5.5, 5.0, '<4', '<4', 6.5, 5.0, 7.5, 6.0, 4.5, 5.0, 5.0, 7.5, 7.5, 8.0, 6.0, 7.0, 7.5, 6.0, 7.5, '<4']


In [3]:
print("==== For predicting essay score from the dataset ====")
print(f"accuracy with no tolerace: {get_acc_no_tol(pred_list, truth_list)}")
print(f"accuracy with 0.5 tolerace: {get_acc_with_tol(pred_list, truth_list)}\n")

==== For predicting essay score from the dataset ====
accuracy with no tolerace: 0.26666666666666666
accuracy with 0.5 tolerace: 0.5666666666666667



t-0.2,e-0.8,k-5

In [4]:
truth_list = []
pred_list = []
cnt = 0
with open("../RAG/manual_embedding_dataset_test.json", "r", encoding="utf-8") as f:
    metadata_test = json.load(f)

for i in range(30):
    curr_topic = metadata_test[i]['prompt']
    curr_essay = metadata_test[i]['essay']
    truth_list.append(label_map[float(metadata_test[i]['label'])])
    pred_list.append(get_score_prompt_version_RAG(curr_topic, curr_essay))
    cnt += 1
    #if cnt == 20:
       # break

print(pred_list)
print(truth_list)

['6.0', '6.0', '6.5', '6.0', '6', '5.0', '7.0', '5.5', '6.5', '6.0', '7.0', '6.0', '<4', '<4', '6.0', '4.5', '6.5', '6', '7.0', '6.5', '5.5', '7.0', '<4>', '5.5', '7.5', '6.0', '7.0', '5.0', '8.0', '<4>']
[7.5, 7.0, 9.0, 6.5, 6.5, 5.0, 6.5, 6.0, '<4', 8.0, 5.5, 5.0, '<4', '<4', 6.5, 5.0, 7.5, 6.0, 4.5, 5.0, 5.0, 7.5, 7.5, 8.0, 6.0, 7.0, 7.5, 6.0, 7.5, '<4']


In [5]:
print("==== For predicting essay score from the dataset ====")
print(f"accuracy with no tolerace: {get_acc_no_tol(pred_list, truth_list)}")
print(f"accuracy with 0.5 tolerace: {get_acc_with_tol(pred_list, truth_list)}\n")

==== For predicting essay score from the dataset ====
accuracy with no tolerace: 0.16666666666666666
accuracy with 0.5 tolerace: 0.5



t-0.3,e-0.7,k-3

In [8]:
truth_list = []
pred_list = []
cnt = 0
with open("../RAG/manual_embedding_dataset_test.json", "r", encoding="utf-8") as f:
    metadata_test = json.load(f)

for i in range(30):
    curr_topic = metadata_test[i]['prompt']
    curr_essay = metadata_test[i]['essay']
    truth_list.append(label_map[float(metadata_test[i]['label'])])
    pred_list.append(get_score_prompt_version_RAG(curr_topic, curr_essay, topic_weight=0.3, essay_weight=0.7, top_k=3))
    cnt += 1
    #if cnt == 20:
       # break

print(pred_list)
print(truth_list)
print("==== For predicting essay score from the dataset ====")
print(f"accuracy with no tolerace: {get_acc_no_tol(pred_list, truth_list)}")
print(f"accuracy with 0.5 tolerace: {get_acc_with_tol(pred_list, truth_list)}\n")

IndexError: list index out of range