In [2]:
# IELTS Essay Evaluation - First Version
# Datasets: train.csv, test.csv, preference_data.csv
# Models: Google Gemini API + Open-source HuggingFace LLM


import os
import pandas as pd
from dotenv import load_dotenv


# 1. Load Datasets

# Directory holding train.csv, test.csv and preference_data.csv. It can be
# overridden with the IELTS_DATA_DIR environment variable so the notebook runs
# on other machines; the default is the original hard-coded location.
DATA_DIR = os.environ.get(
    "IELTS_DATA_DIR",
    "/Users/kalpanapullagura/Downloads/Automated-IELTS-essay-evaluation-master/data",
)

TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
TEST_PATH = os.path.join(DATA_DIR, "test.csv")
PREF_PATH = os.path.join(DATA_DIR, "preference_data.csv")


def _strip_trailing_cr(value):
    """Drop trailing carriage returns from a string cell; pass other values through."""
    return value.rstrip("\r") if isinstance(value, str) else value


def _load_dataset(path):
    """Read one CSV with "\\n" as the line terminator and clean up "\\r" residue.

    Because only "\\n" ends a record, a file saved with CRLF line endings keeps
    a trailing "\\r" on the last header name and on every value of the last
    column. Header names are stripped, and trailing "\\r" characters are removed
    from string cells so values such as the band do not carry a carriage
    return into prompts, printed output or saved results.
    """
    df = pd.read_csv(path, lineterminator="\n")
    df.columns = df.columns.str.strip()
    for col in df.columns:
        # Numeric columns cannot hold a stray "\r"; skip them to keep dtypes.
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        df[col] = df[col].map(_strip_trailing_cr)
    return df


train_df = _load_dataset(TRAIN_PATH)
test_df = _load_dataset(TEST_PATH)
pref_df = _load_dataset(PREF_PATH)


print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Preference data shape:", pref_df.shape)

print("\nCleaned test_df columns:", test_df.columns)


# 2. Define Evaluation Prompt

# Template filled with str.format() using three positional placeholders, in
# this order: the band score, the essay prompt (task statement), and the essay
# text. The evaluation functions below call it as
# evaluation_prompt.format(band, prompt, essay).
# The section headings list the criteria the model is asked to comment on; the
# trailing "## Evaluation:" heading is where the model's answer is expected to
# continue.
evaluation_prompt = """
The score of the essay below is **{}**. Evaluate this essay that aligns with the score provided.

## Task Achievement:
- How well the candidate addresses the task.

## Coherence and Cohesion:
- Logical flow, transitions, structure.

## Lexical Resource:
- Vocabulary range, mistakes, and suggestions.

## Grammatical Range and Accuracy:
- Grammar range and errors.

## Overall Band Score:
- Suggested band score.

## Essay Prompt:
{}

## Essay:
{}

## Evaluation:
"""



# 3A. Gemini API Evaluation

def evaluate_with_gemini(prompt, essay, band):
    """Request an evaluation of one essay from Gemini and return the reply text.

    Args:
        prompt: The essay task statement.
        essay: The candidate's essay text.
        band: The band score inserted into the evaluation template.

    Returns:
        The model's response text with surrounding whitespace removed.
    """
    import google.generativeai as genai

    # Load environment variables via python-dotenv, then hand the key to the client.
    load_dotenv()
    api_key = os.getenv("GOOGLE_API_KEY")
    genai.configure(api_key=api_key)

    request_text = evaluation_prompt.format(band, prompt, essay)
    gemini = genai.GenerativeModel("models/gemini-1.5-flash-latest")
    reply = gemini.generate_content(request_text)
    return reply.text.strip()



# 3B. Open-source LLM Evaluation (HuggingFace)

# Text-generation pipelines already built in this session, keyed by model
# name, so the model is loaded once instead of once per essay.
_LOCAL_PIPELINES = {}


def _get_local_pipeline(model_name):
    """Return the cached text-generation pipeline for model_name, building it on first use."""
    if model_name not in _LOCAL_PIPELINES:
        from transformers import pipeline
        _LOCAL_PIPELINES[model_name] = pipeline("text-generation", model=model_name)
    return _LOCAL_PIPELINES[model_name]


def evaluate_with_local_llm(prompt, essay, band):
    """Generate an evaluation of one essay with a local HuggingFace model.

    Args:
        prompt: The essay task statement.
        essay: The candidate's essay text.
        band: The band score inserted into the evaluation template.

    Returns:
        The "generated_text" field of the first pipeline output.
    """
    #  First version = small model (gpt2) just to demonstrate pipeline
    # Future work = replace with Mistral/LLaMA-2 and fine-tune
    local_model = _get_local_pipeline("gpt2")

    full_prompt = evaluation_prompt.format(band, prompt, essay)
    # NOTE(review): the captured run logs warn that max_length=400 conflicts
    # with a default max_new_tokens and that truncation is implicit. The
    # arguments are left unchanged here; confirm the intended prompt and
    # output lengths before tuning them.
    output = local_model(full_prompt, max_length=400, do_sample=True)
    return output[0]["generated_text"]


# 4. Run Evaluation on Samples

# Number of test essays evaluated in this demo run.
NUM_DEMO_ESSAYS = 5

# One dict per evaluated essay: inputs, gold band and both model outputs.
results = []

for idx, row in test_df.head(NUM_DEMO_ESSAYS).iterrows():   # run only a few for demo
    essay_prompt = row["prompt"]
    essay_text = row["essay"]
    essay_band = row["band"]
    # The CSVs are read with lineterminator="\n"; presumably the files use
    # CRLF endings, which leaves a trailing "\r" on the last column ("band").
    # Strip it so the carriage return does not garble the printed header or
    # leak into the model prompt and the saved gold_band.
    if isinstance(essay_band, str):
        essay_band = essay_band.strip()

    print(f"\n=== Essay {idx} (Gold Band {essay_band}) ===")

    # Gemini evaluation: best-effort, a failure is recorded in the results
    # instead of aborting the whole run.
    try:
        gemini_eval = evaluate_with_gemini(essay_prompt, essay_text, essay_band)
        print("[Gemini Eval Done ]")
    except Exception as e:
        gemini_eval = f"Gemini error: {e}"
        print("[Gemini Eval Failed ]")

    # Local LLM evaluation: same best-effort handling as above.
    try:
        local_eval = evaluate_with_local_llm(essay_prompt, essay_text, essay_band)
        print("[Local LLM Eval Done ]")
    except Exception as e:
        local_eval = f"Local LLM error: {e}"
        print("[Local LLM Eval Failed ]")

    results.append({
        "prompt": essay_prompt,
        "essay": essay_text,
        "gold_band": essay_band,
        "gemini_evaluation": gemini_eval,
        "local_llm_evaluation": local_eval
    })



# 5. Save Results

# Write the collected evaluations to a CSV in the working directory, without
# the DataFrame index column.
out_path = "first_version_results.csv"
pd.DataFrame(data=results).to_csv(path_or_buf=out_path, index=False)
print("\n Results saved to:", out_path)



# 6. Future Scope (Preference Data)


# Print the planned follow-up work, one item per line.
print(
    "\n[Future Scope]",
    " Fine-tune open-source LLMs with train.csv",
    " Evaluate performance on test.csv",
    " Use preference_data.csv for RLHF training",
    " Build web app for deployment (Streamlit/Flask)",
    sep="\n",
)


Train shape: (9912, 4)
Test shape: (495, 4)
Preference data shape: (1762, 4)

Cleaned test_df columns: Index(['prompt', 'essay', 'evaluation', 'band'], dtype='object')

=== Essay 0 (Gold Band 7.5) ===
[Gemini Eval Done ]


Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=400) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[Local LLM Eval Done ]

=== Essay 1 (Gold Band 6) ===
[Gemini Eval Done ]


Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=400) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[Local LLM Eval Done ]

=== Essay 2 (Gold Band <4) ===
[Gemini Eval Done ]


Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=400) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[Local LLM Eval Done ]

=== Essay 3 (Gold Band 6) ===
[Gemini Eval Done ]


Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=400) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[Local LLM Eval Done ]

=== Essay 4 (Gold Band 6) ===
[Gemini Eval Done ]


Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=400) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[Local LLM Eval Done ]

 Results saved to: first_version_results.csv

[Future Scope]
 Fine-tune open-source LLMs with train.csv
 Evaluate performance on test.csv
 Use preference_data.csv for RLHF training
 Build web app for deployment (Streamlit/Flask)
