In [1]:
import os
import sys
import pandas as pd
from openai import OpenAI
import numpy as np
import json
from tqdm import tqdm
from datasets import DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
import faiss
import random

# Put the parent directory on the import path *before* the project import below;
# the Feedback_agent package is presumably located there (verify against the repo layout).
sys.path.append(os.path.abspath(".."))
from Feedback_agent.rubric_and_sample import IELTS_rubrics as rubric

# Shared OpenAI client used by generate_embedding and get_score_prompt_version_RAG.
# NOTE(review): constructed without arguments, so credentials presumably come from the
# environment (e.g. OPENAI_API_KEY) — confirm.
client = OpenAI()

In [2]:
def get_score_prompt_version(topic, essay):
    """Grade an essay with GPT-4o using only the rubric (no retrieved examples).

    Args:
        topic: The IELTS writing question the essay answers.
        essay: The student essay text to grade.

    Returns:
        The raw text of the model's first completion choice. The prompt asks for
        a bare band score, or '<4' for scores below 4, but the model's reply is
        returned unvalidated.

    Notes:
        Uses the module-level ``client`` instead of constructing a new
        ``OpenAI()`` on every call, matching the other helpers in this notebook.
        NOTE(review): the prompt tells the model to use "provided example essays",
        but this variant supplies none; the text is left as-is so the baseline
        stays comparable with earlier runs.
    """
    grader_prompt = f"""
    You are an IELTS writing section examiner, tasked with evaluating essays strictly based on the official IELTS writing rubric.

    **Writing Question**: {topic}
    **Student Essay**: {essay}
    \n
    **Reference IELTS Rubric**:
    {rubric.BASIC_RUBRIC}
    {rubric.CRITERIA}
    {rubric.BAND_SCORE}
    \n
    **Guidelines for Scoring**:
    - Assign scores in 0.5 intervals from 0 to 9 based on the IELTS rubric.
    - If the essay content is irrelevant or off-topic, assign a score of 0.
    - Avoid generic scores like 5, 6, or 7 unless the essay fully justifies such a rating.
    - Use the provided example essays and their scores to guide your grading and ensure consistency.

    **Enhanced Scoring Process**:
    - Generate multiple scores for the same essay by slightly varying the context or examples provided (e.g., shuffle or modify reference essays where appropriate).
    - Compute the average of these scores to improve reliability.
    - If scores vary significantly, consider revisiting the rubric alignment for the given essay.

    **Final Output**:
    - Output only the final averaged score directly, only the score number, if the score is smaller than 4, output only the '<4' instead of the score.
    """
    # The whole grading brief is sent as a single system message (unchanged from the original).
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": grader_prompt}
        ]
    )
    return response.choices[0].message.content

In [3]:
def generate_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding vector for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The input passed to ``client.embeddings.create``.
        model: Name of the embedding model to use.

    Returns:
        The embedding of the first returned item, or ``None`` if anything went
        wrong. Failures are reported on stdout rather than raised, so callers
        must check for ``None``.
    """
    try:
        response = client.embeddings.create(input=text, model=model)
        return response.data[0].embedding
    except Exception as e:
        # Deliberately best-effort, but include the cause: the previous message
        # dropped the exception, making auth, quota and network errors indistinguishable.
        print(f"Error generating embedding: {type(e).__name__}: {e}")
        return None

In [4]:
# Retrieval corpus: the "topics" FAISS index and the JSON records that
# search_cosine_similarity looks up by the ids FAISS returns.
# Alternative (non-"topics") files, kept for reference:
#   ../RAG/faiss_index_train.bin
#   ../RAG/embeddings_dataset_train.json
index = faiss.read_index("../RAG/faiss_index_train_topics.bin")
with open("../RAG/embeddings_dataset_train_topics.json", mode="r", encoding="utf-8") as metadata_file:
    metadata = json.load(metadata_file)

In [5]:
def search_cosine_similarity(query_text, top_k=3):
    """Retrieve the ``top_k`` records from ``metadata`` nearest to ``query_text``.

    The query is embedded with ``generate_embedding``, L2-normalised, and
    searched against the module-level FAISS ``index``.
    NOTE(review): the scores are cosine similarities only if the index holds
    L2-normalised vectors under an inner-product metric — confirm against the
    code that built the index.

    Args:
        query_text: Text to embed and search for.
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of at most ``top_k`` dicts. Each is a shallow copy of a
        ``metadata`` record with an added ``"similarity"`` entry holding the
        score FAISS reported. The list is empty if the embedding call failed.
    """
    query_embedding = generate_embedding(query_text)
    if query_embedding is None:
        return []

    # FAISS takes a 2-D float32 array (one row per query) and normalises it in place.
    query_embedding_np = np.array([query_embedding], dtype=np.float32)
    faiss.normalize_L2(query_embedding_np)

    distances, indices = index.search(query_embedding_np, top_k)

    results = []
    for score, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # FAISS pads with id -1 when it finds fewer than top_k neighbours;
            # metadata[-1] would silently return the last record instead.
            continue
        # Copy so the shared metadata records are not mutated by every search.
        result = dict(metadata[idx])
        result["similarity"] = score
        results.append(result)
    return results


In [6]:
# Translates a record's class label (0-11) into an IELTS band: class 0 is the
# catch-all '<4' bucket, classes 1-11 are bands 4.0 to 9.0 in half-band steps.
label_map = {0: '<4'}
label_map.update({class_id: 3.5 + 0.5 * class_id for class_id in range(1, 12)})

##### reference: https://ielts.idp.com/canada/prepare/article-ielts-writing-task-2-8-steps-to-band-8

In [31]:
def get_score_prompt_version_RAG(topic, essay):
    """Grade an essay with GPT-4o, anchoring the prompt with retrieved reference essays.

    Up to three records are retrieved with ``search_cosine_similarity`` and
    listed in the prompt together with their scores (class labels translated
    through ``label_map``).

    Args:
        topic: The IELTS writing question the essay answers.
        essay: The student essay text to grade.

    Returns:
        The raw text of the model's first completion choice. The prompt asks for
        a bare band score, or '<4' for scores below 4, but the reply is returned
        unvalidated.

    Notes:
        Retrieval can return fewer than three records (an empty list when the
        embedding call fails). The reference section is therefore built from
        whatever came back instead of indexing ``results[0..2]``, which raised
        ``IndexError``. With three records the prompt is identical to before.
    """
    results = search_cosine_similarity(f"Prompt: {topic}\nEssay: {essay}", top_k=3)

    # One bullet per retrieved record, each followed by the same spacer lines as before.
    reference_examples = "".join(
        f"    - Example {number}: Writing Question: {example['prompt']} "
        f"Essay: {example['essay']} Score: {label_map[float(example['label'])]}\n    \n\n"
        for number, example in enumerate(results, start=1)
    )
    if not reference_examples:
        # Degrade to rubric-only grading rather than crashing the evaluation run.
        reference_examples = "    - (No reference essays could be retrieved; grade from the rubric alone.)\n"

    grader_prompt = f"""
    You are a professional IELTS writing section examiner, tasked to grade the student essay accurately based on the official IELTS writing rubric, but there should be some space for leniency.

    **Writing Question**: {topic}
    **Student Essay**: {essay}
    \n
    **Reference IELTS Rubric**:
    {rubric.BASIC_RUBRIC}
    {rubric.CRITERIA}
    {rubric.BAND_SCORE}
    \n
    **Reference Essays for Consistency**:
{reference_examples}
    **Guidelines for Scoring**:
    - Assign scores in 0.5 intervals from 0 to 9 based on the IELTS rubric.
    - If the student essay is irrelevant or off-topic from the Writing Question, assign a score of 0.
    - Avoid generic scores like 5, 6, or 7 unless the student essay fully justifies such a rating.
    - Use the provided example essays and their scores to guide your grading and ensure consistency.

    **Special Consideration for Student Essays to Achieve High Scores**:
    - The student essay should be a formal response to the writing question. 
    - Make sure the student essay is logical and progresses clearly with a wide range of linking words and phrases. But the student essay should avoid an overuse of the same linking words. 
    - Check for repetitions of words, the student essay should cover a diverse range of vocabularies. 
    - The ideas MUST be organised into paragraphs with introductions of arguments, examples that support the student essay's viewpoint, explanations of why these examples are valid, and great transitions to the next topic or paragraph. 
    - The student essay should have sufficient amount of paragraphs to show structured response. 
    - Each topic or argument should have its own paragraph. The introduction and the conclusion should also be included as separated paragraphs. 
    - The student essay should use a wide range of vocabularies and an adequate amount of uncommon words. 
    - The student essay should use a wide range of sentence structures with accurate punctuation. The student essay should contain a variety of simple and complex sentence structures. 

    **Enhanced Scoring Process**:
    - Generate multiple scores for the same student essay by slightly varying the context or examples provided (e.g., shuffle or modify reference essays where appropriate).
    - Compute the average of these scores to improve reliability.
    - If scores vary significantly, consider revisiting the rubric alignment for the given student essay.
    - A high scoring student essay (scores ranging from 7.5 to 9) DO NOT need to fulfil all the requirements in "Special Consideration for Student Essays to Achieve High Scores" section. 

    **Final Output**:
    - Output only the final averaged score directly, only the score number, but if the score is smaller than 4, output only the '<4'.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": grader_prompt}
        ]
    )
    return response.choices[0].message.content

In [32]:
def _to_band(score):
    """Normalise a raw score to either the '<4' sentinel or a float band.

    Accepts strings (e.g. '6.5', ' <4 ', '0') and numbers. Anything containing
    '<4', or any numeric value strictly below 4, maps to '<4'.

    Raises:
        ValueError / TypeError: if the value cannot be read as a number.
    """
    if isinstance(score, str):
        if '<4' in score:
            return '<4'
        score = score.strip()
    value = float(score)
    return '<4' if value < 4 else value


def get_acc_with_tol(pred_list, truth_list, tol=0.5):
    """Fraction of predictions within ``tol`` bands of the ground truth.

    Scoring rules:
      * '<4' is a bucket with no numeric value, so it only ever matches '<4'.
      * A numeric prediction of exactly 4.0 is a real band and is not folded
        into '<4'. The previous ``<= 4`` test did that and marked correct 4.0
        predictions as wrong.
      * Numeric pairs count as correct when ``abs(pred - truth) <= tol``. This
        also accepts off-grid averaged scores such as 6.25, which the previous
        ``== 0.5`` comparison rejected.
      * Pairs where either side cannot be parsed are reported and excluded from
        the denominator instead of raising.

    Args:
        pred_list: Predicted scores (strings from the model, or numbers).
        truth_list: Ground-truth scores ('<4' or numeric bands).
        tol: Allowed absolute difference in bands. Defaults to 0.5.

    Returns:
        ``correct / comparable`` as a float, where the denominator starts at
        ``len(truth_list)`` and drops by one per skipped pair. Returns 0.0 when
        nothing is comparable (previously a ZeroDivisionError).
    """
    total = len(truth_list)
    correct = 0
    for pred, truth in zip(pred_list, truth_list):
        try:
            pred_band = _to_band(pred)
            truth_band = _to_band(truth)
        except (TypeError, ValueError):
            total -= 1
            print(f"Conversion Error, Skipping pred: {pred}, truth: {truth}")
            continue

        if pred_band == '<4' or truth_band == '<4':
            if pred_band == truth_band:
                correct += 1
        # Tiny epsilon guards against float round-off for off-grid scores.
        elif abs(pred_band - truth_band) <= tol + 1e-9:
            correct += 1

    if total <= 0:
        return 0.0
    return correct / total


def get_acc_no_tol(pred_list, truth_list):
    """Exact-match accuracy: ``get_acc_with_tol`` with a tolerance of zero.

    Args:
        pred_list: Predicted scores (strings from the model, or numbers).
        truth_list: Ground-truth scores ('<4' or numeric bands).

    Returns:
        Fraction of comparable pairs whose normalised bands are equal.
    """
    return get_acc_with_tol(pred_list, truth_list, tol=0.0)
        


In [33]:
# Number of test essays to grade. Each one costs an embedding request plus a
# chat completion, so keep this small while iterating.
N_EVAL = 40

truth_list = []
pred_RAG_list = []
pred_prompt_list = []  # not filled in this cell; kept in case other cells reference it

with open("../RAG/manual_embedding_dataset_test.json", "r", encoding="utf-8") as f:
    metadata_test = json.load(f)

# Slicing, unlike range(40), cannot raise IndexError when the file holds fewer records.
for record in tqdm(metadata_test[:N_EVAL], desc="RAG grading"):
    truth = label_map[float(record['label'])]
    prediction = get_score_prompt_version_RAG(record['prompt'], record['essay'])
    # Append both only after the call succeeds so the two lists stay aligned.
    truth_list.append(truth)
    pred_RAG_list.append(prediction)

cnt = len(pred_RAG_list)  # number of essays graded (replaces the manual counter)

print(pred_RAG_list)
print(truth_list)

['6.5', '6.0', '6.5', '5.5', '5.0', '5.0', '6.5', '5.5', '<4', '7.0', '7.0', '5.5', '<4', '0', '6.5', '5.5', '6.0', '6.0', '7.0', '6.0', '5.5', '7.0', '<4', '6.0', '6.0', '6.0', '6.5', '5.0', '8.0', '<4', '7.5', '7.5', '6.0', '6.5', '5.0', '5.0', '6.5', '6.5', '<4', '6.5']
[7.5, 7.0, 9.0, 6.5, 6.5, 5.0, 6.5, 6.0, '<4', 8.0, 5.5, 5.0, '<4', '<4', 6.5, 5.0, 7.5, 6.0, 4.5, 5.0, 5.0, 7.5, 7.5, 8.0, 6.0, 7.0, 7.5, 6.0, 7.5, '<4', 6.0, 8.0, 7.5, 6.0, 4.0, 7.5, 6.0, 7.5, '<4', 7.0]


In [34]:
# Report exact-match accuracy and accuracy within half a band for the RAG predictions.
# (Fixes the "tolerace" typo in the printed labels.)
print("==== For predicting essay score from the dataset ====")
print(f"accuracy with no tolerance: {get_acc_no_tol(pred_RAG_list, truth_list)}")
print(f"accuracy with 0.5 tolerance: {get_acc_with_tol(pred_RAG_list, truth_list)}\n")

==== For predicting essay score from the dataset ====
accuracy with no tolerace: 0.25
accuracy with 0.5 tolerace: 0.5

