In [52]:
import os
import torch
from openai import OpenAI
import pandas as pd
from sklearn.metrics import mean_squared_error
client = OpenAI()

In [37]:
csv_file_path = os.path.join("data", "dataset", "processed", "clean_data.csv")
jsonl_file_path = os.path.join("data", "dataset", "processed", "clean_data.jsonl")
train_data_path = os.path.join("..", "data", "dataset", "processed", "train.jsonl")
val_data_path = os.path.join("..", "data", "dataset", "processed", "validation.jsonl")

In [38]:
train_file = client.files.create(
  file=open(train_data_path, "rb"),
  purpose="fine-tune"
)

test_file = client.files.create(
  file=open(val_data_path, "rb"),
  purpose="fine-tune"
)

In [39]:
#Finetuning on mini dataset
job = client.fine_tuning.jobs.create(
  training_file=train_file.id,
  validation_file=test_file.id,
  model="gpt-4o-mini-2024-07-18",
    hyperparameters={
    "n_epochs":3
  }
)

In [41]:
status = client.fine_tuning.jobs.retrieve(job.id)
print(status)

FineTuningJob(id='ftjob-ZMpfVKMUq0c51IZFqW44aA7b', created_at=1731530796, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:personal::ATFE3srB', finished_at=1731533401, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=1.8), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-ddAZbaTdTMt3bBOAwSkg0NcQ', result_files=['file-0HNOcPjevGzBpmPuynA2ai9N'], seed=400058700, status='succeeded', trained_tokens=735663, training_file='file-SIGRXh9lLqDxSishdOu361KW', validation_file='file-Q9gXoexRdQXVvvyyNDFhk60O', estimated_finish=None, integrations=[], user_provided_suffix=None)


In [42]:
fine_tune_results = client.fine_tuning.jobs.retrieve(job.id).result_files
result_file = client.files.retrieve(fine_tune_results[0])
content = client.files.content(result_file.id)
import base64

base64.b64decode(content.text.encode("utf-8"))

with open('band_score_result.csv', "wb") as f:
    f.write(base64.b64decode(content.text.encode("utf-8")))



In [None]:
completion = client.chat.completions.create(
  model=status.fine_tuned_model,
  messages=[
    {"role": "system", "content": "You are a IELTS writing part examiner who is responsible to provide a band score given an essay"}, 
    {"role": "user", "content": "Question: Some people think that all teenagers should be required to do unpaid work in their free time to help the local community. They believe this would benefit both the individual teenager and society as a whole. Do you agree or disagree?\nEssay: Many people work on a volunteer basis, and this can only be beneficial for both individuals and society as a whole. Personally, I disagree with this opinion.\nWorking as a volunteer has become increasingly popular among teenagers. I agree that this can help them to fulfill their free time. On the other hand, studying is more important than doing unpaid work and most of them find it extremely hard as a compulsory studying. If individual teenagers had free time, they would do sports instead of doing unpaid work. They have many years of work ahead of them when they finish their studies. \nOn the other hand, those who think that all teenagers should be required to do unpaid work, they pressure on them and it can cause resentment amongst teenagers and society. Furthermore, I do not agree that society would benefit from unpaid work as a whole. To force young people to work as a volunteer could be counterproductive and parents would be dissatisfied how to rise their children. Encorcement on teenagers can gain nothing for the young and society. For example, recent survey on how enforcement can be devastating effect on children states that roughly over 50 percent of teenagers leave home to escape their parents' pressure. It shows us that doing unpaid work under pressure is not optional for both.\nIn conclusion, despite the requirement that some people think about unpaid work, I do believe that studying is also as unpaid work for teenagers and to avoid pressure is more important than making this compulsory."}
  ]
)



ChatCompletionMessage(content='5.5', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)


In [50]:
# Load the dataset
file_path = 'C:/Users/EricZ/Documents/GitHub/nlp-project/data/dataset/processed/validation.csv'
data = pd.read_csv(file_path)

# Replace '<4' with an approximate numeric value (e.g., 3.5) for MSE calculation
data['band'] = data['band'].apply(lambda x: 3.5 if x == '<4' else float(x))

# Display the first few rows of the dataset
data.head()

Unnamed: 0,prompt,essay,band
0,In some countries more and more people are bec...,"In recent years, many people are concerned to ...",8.5
1,"In some countries, more and more people are be...","In the modern world, many people have increasi...",6.0
2,The best way to solve the world's environmenta...,"In recent years, there has been a debate on wh...",8.5
3,"In some countries, more and more people are be...",Houses are the place that people live which is...,4.0
4,Some people believe that eventually all jobs w...,While it is true that in many nations of peopl...,8.0


In [54]:
def predict_band_score(prompt, essay):
    # Prepare the chat completion request
    response = client.chat.completions.create(
        model="ft:gpt-4o-mini-2024-07-18:personal::ATH78GB6",
        messages=[
            {"role": "system", "content": "You are a IELTS writing part examiner who is responsible to provide a band score given an essay"},
            {"role": "user", "content": f"Question: {prompt} \nEssay: {essay}"}
        ]
    )
    
    # Extract the prediction from the response
    result = response.choices[0].message.content.strip()
    
    try:
        predicted_score = float(result)
        if predicted_score < 4.0:
            return 3.5  # Convert low scores to 3.5 for comparison
        return predicted_score
    except ValueError:
        print("Prediction error: unable to convert output to float.")
        return None

In [56]:
def evaluate_model(data):
    true_scores = []
    predicted_scores = []
    
    for _, row in data.iterrows():
        prompt, essay, true_score = row['prompt'], row['essay'], row['band']
        predicted_score = predict_band_score(prompt, essay)
        
        if predicted_score is not None:
            true_scores.append(true_score)
            predicted_scores.append(predicted_score)

    # Calculate MSE and accuracy
    mse = mean_squared_error(true_scores, predicted_scores)
    accuracy = sum([1 if ((t == p) or (t+0.5 == p) or (t-0.5 == p)) else 0 for t, p in zip(true_scores, predicted_scores)]) / len(true_scores)
    return mse, accuracy

# Run evaluation
mse, accuracy = evaluate_model(data)
print(f"Mean Squared Error: {mse}")
print(f"Accuracy (within 0.5 band): {accuracy * 100:.2f}%")

Prediction error: unable to convert output to float.
Prediction error: unable to convert output to float.
Prediction error: unable to convert output to float.
Prediction error: unable to convert output to float.
Prediction error: unable to convert output to float.
Prediction error: unable to convert output to float.
Prediction error: unable to convert output to float.
Prediction error: unable to convert output to float.
Mean Squared Error: 2.325892857142857
Accuracy (within 0.5 band): 41.96%
