In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import pipeline
from sklearn.model_selection import train_test_split
import tqdm

In [None]:
import torch
import gc
# Debugging aid: report current GPU memory usage, then release whatever can be
# released. All CUDA bookkeeping is guarded so this cell also runs on a
# CPU-only machine (the model cell further down already falls back to "cpu").
if torch.cuda.is_available():
    print(f"Memory Allocated: {torch.cuda.memory_allocated() / 1e6} MB")
    print(f"Memory Reserved: {torch.cuda.memory_reserved() / 1e6} MB")
    # Clear cache: collect garbage first so that tensors kept alive only by
    # reference cycles are freed before the caching allocator is asked to
    # hand its unused blocks back.
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
else:
    gc.collect()

# Hugging Face model identifiers. `llama3_model_path` is the one the tokenizer
# is loaded from; `model_path` is not read by the cells visible here.
model_path = "andrijdavid/Llama-3-1B-Base"
llama3_model_path = "meta-llama/Llama-3.2-1B"
# Dataset locations: the combined dataset and the small sample file that the
# evaluation cell reads.
file_path = "../data/combined_dataset.csv"
example_file_path = "../sample_data/llama3_sample_dataset.csv"
# Checkpoint directory the classification model is loaded from.
output_dir = "../results/checkpoint-2400"


def create_input_text(row):
    """Render the model input text for one dataset row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) that supports
            ``row['prompt']`` and ``row['essay']``.

    Returns:
        The instruction text followed by the essay prompt and the essay.

    The template is emitted verbatim, including the indentation inside the
    literal and the lone double quote in front of ``Prompt:``.
    NOTE(review): presumably the evaluated checkpoint was fine-tuned on this
    exact wording -- confirm before changing any character of the template.
    """
    task_statement = row['prompt']
    essay_body = row['essay']
    rendered = f"""You are a member of the IELTS essay evaluation committee.
        Your task is to evaluate the essay based on the given prompt and assign it a score
        between 4 and 9 (in 0.5 increments). There is also a '<4' class, thus a total of 12 classes.
        Provide only the score or '<4' as your response. Think step by step why this essay is good or bad.
        "Prompt: {task_statement}\nEssay: {essay_body}"""
    return rendered

# Ordered IELTS band labels; a label's position in this list is its class id.
band_classes = ['<4', '4', '4.5', '5', '5.5', '6', '6.5', '7', '7.5', '8', '8.5', '9']
band_to_class = {band: i for i, band in enumerate(band_classes)}  # Band -> Class
class_to_band = {i: band for band, i in band_to_class.items()}  # Class -> Band


def map_band_to_class(band):
    """Translate a band score into its integer class id.

    Args:
        band: The band as it appears in the data: the string ``"<4"``, a
            numeric string such as ``"6.5"``, or a number such as ``6`` or
            ``6.0``.

    Returns:
        The index of the band in ``band_classes``.

    Raises:
        KeyError: If the value does not correspond to any known band.
    """
    key = str(band).strip()
    if key not in band_to_class:
        # A numeric band column is read as floats, so a whole band arrives as
        # 6.0 and stringifies to "6.0". The "g" format drops the redundant
        # ".0" while leaving half bands such as "6.5" untouched.
        try:
            key = format(float(key), "g")
        except ValueError:
            pass  # not numeric at all; the lookup below raises KeyError
    return band_to_class[key]


def tokenize_function(example):
    """Tokenize the ``input_text`` field of a dataset example or batch.

    Uses the module-level ``tokenizer``, which must be created before this
    function is called. Every sequence is padded or truncated to exactly
    512 tokens.
    """
    return tokenizer(
        example["input_text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

Memory Allocated: 0.0 MB
Memory Reserved: 0.0 MB


### Prepare Test Dataset

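#### Load the sample data and the tokenizer, then tokenize the test set (the model itself is loaded and moved to CUDA in the "Setting up Trainer" section).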

In [18]:
# Prepare datasets for Hugging Face Trainer
from datasets import Dataset

# Take the first three rows of the sample file, build the model input text and
# the integer label for each row, then keep only those two derived columns.
data = (
    pd.read_csv(example_file_path)
    .head(3)
    .assign(
        input_text=lambda frame: frame.apply(create_input_text, axis=1),
        labels=lambda frame: frame['band'].apply(map_band_to_class),
    )
    # Drop unnecessary columns. Might need them later
    .drop(columns=["evaluation", "band", "prompt", "essay"])
)

# tokenize_function pads every sequence to a fixed length, so the tokenizer
# needs a pad token; the end-of-sequence token is reused for that purpose.
tokenizer = AutoTokenizer.from_pretrained(llama3_model_path)
tokenizer.pad_token = tokenizer.eos_token

# Only evaluation happens in this notebook, so the whole frame becomes the
# test set (the train/test split used for fine-tuning is intentionally not
# performed here).
test_dataset = Dataset.from_pandas(data).map(tokenize_function, batched=True)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [14]:
# Report the CUDA memory currently allocated and reserved, in MB.
print(
    f"Memory Allocated: {torch.cuda.memory_allocated() / 1e6} MB",
    f"Memory Reserved: {torch.cuda.memory_reserved() / 1e6} MB",
    sep="\n",
)
torch.cuda.empty_cache()
# Kept as the last expression of the cell so the notebook displays the number
# of unreachable objects the collector found.
gc.collect()

Memory Allocated: 0.0 MB
Memory Reserved: 0.0 MB


882

## Setting up Trainer

In [19]:
# One output class per band label. Derived from the label list instead of a
# hard-coded 12 so the classifier head cannot drift out of sync with the
# band <-> class mapping.
num_labels = len(band_classes)  # Total number of unique band scores
# Downloaded models are stored in the Hugging Face hub cache
# (~/.cache/huggingface/hub).
model_path = "andrijdavid/Llama-3-1B-Base"
llama3_model_path = "meta-llama/Llama-3.2-1B"

# Load the sequence-classification model from the checkpoint directory.
model = AutoModelForSequenceClassification.from_pretrained(
    output_dir,
    num_labels=num_labels,
)
# NOTE(review): `t` is not read by any cell visible here; kept in case a later
# cell relies on it.
t = model.config.eos_token_id
# Tell the model which token id is padding; it is taken from the tokenizer so
# the two stay consistent.
model.config.pad_token_id = tokenizer.pad_token_id

# Freeze the base model
for param in model.base_model.parameters():
    param.requires_grad = False

# Prefer the GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"Device: {device}")

Device: cuda


In [25]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
# Trainer configuration for an evaluation-only run (do_train=False).
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    # Mixed precision is only supported on a CUDA device; requesting it
    # unconditionally makes TrainingArguments raise on a CPU-only machine,
    # which would defeat the cuda/cpu fallback used when loading the model.
    fp16=torch.cuda.is_available(),
    # Training-only settings, disabled because this notebook only evaluates:
    # per_device_train_batch_size=1,
    # num_train_epochs=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=1,
    gradient_accumulation_steps=1,
    load_best_model_at_end=True,
    do_train=False,
    do_eval=True,
)
def compute_metrics(pred):
    """Compute exact-match accuracy and print the decoded band labels.

    Args:
        pred: Prediction object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        A dict with the single key ``"accuracy"``.
    """
    gold_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)
    accuracy = accuracy_score(gold_ids, predicted_ids)

    def to_bands(class_ids):
        # Class id -> human-readable band label.
        return [class_to_band[class_id] for class_id in class_ids]

    predicted_bands = to_bands(predicted_ids)
    gold_bands = to_bands(gold_ids)
    print(f"True Labels: {gold_bands}")
    print(f"Predicted Labels: {predicted_bands}")
    print(f"Accuracy: {accuracy}")
    return {"accuracy": accuracy}
    
# Trainer used for evaluation only: no train_dataset is passed, just the
# tokenized test set and the metric callback defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


# Fine-tuning is disabled in this notebook; the model is loaded from an
# existing checkpoint instead.
# trainer.train()

# Alternative evaluation path (disabled): returns only the metrics dict.
# test_results = trainer.evaluate()
# print(f"Test Results: {test_results}")

# Run inference on the test set. The captured output below shows that
# compute_metrics is invoked as part of this call and prints the true and
# predicted band labels.
predictions = trainer.predict(test_dataset)

  0%|          | 0/3 [00:00<?, ?it/s]

True Labels: ['6', '8.5', '8.5']
Predicted Labels: ['7', '5.5', '7']
Accuracy: 0.0
