In [2]:
import torch
import json
import numpy as np
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Cache
from transformers import Trainer
from transformers import TrainingArguments
from peft import get_peft_model
from peft import LoraConfig
from peft import TaskType
from classification import ClassificationWrapper
from dataloader import BiasDataset, custom_collate_fn

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# --- Run configuration -------------------------------------------------------
# Everything a reader is likely to tune lives here.
# NOTE(review): the two absolute /mnt/e paths are machine-specific; consider
# deriving them from a configurable base directory.
data_dir = '/mnt/e/NTU-DLWeek2025/model_scripts/datasets/clean_with_scores.json'
model_name = '/mnt/e/NTU-DLWeek2025/Llama-encoder-1.0B'
output_dir = './model_scripts/finetune_results'
num_train_epochs = 50
batch_size = 4
split_ratio = 0.9

# --- Tokenizer ---------------------------------------------------------------
# Loaded before the model so the model's padding id can be taken from the
# tokenizer that actually produces the padded batches.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.pad_token_id = tokenizer.eos_token_id

# --- Base model --------------------------------------------------------------
# num_labels=2: one output per class (bias / non-bias).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# NOTE(review): the load warning recorded below this cell reports `score.weight`
# as the newly initialised head, so this extra `classifier` layer may go unused
# unless ClassificationWrapper reads it — confirm against classification.py.
model.classifier = torch.nn.Linear(model.config.hidden_size, model.config.num_labels)
torch.nn.init.normal_(model.classifier.weight, mean=0.0, std=model.config.initializer_range)
torch.nn.init.zeros_(model.classifier.bias)

# Keep the model's notion of "padding" identical to the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id
# The KV cache only helps autoregressive decoding; keep it off for fine-tuning.
model.config.use_cache = False
# NOTE(review): `cache_class` is kept from the original cell, but storing a
# class object on the config may break JSON serialisation of the config —
# confirm it is needed.
model.config.cache_class = Cache

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print("Using device:", device)

# --- LoRA adapters (PEFT) ----------------------------------------------------
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],  # Adjust based on your model
)
model = get_peft_model(model, lora_config)

# --- Trainer configuration ---------------------------------------------------
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    # Checkpoint and evaluate once per epoch. `save_steps` is intentionally not
    # set: it is only consulted when save_strategy="steps", and the previous
    # value (`len(dataset)`) referenced a variable that is defined in a later
    # cell, which fails on a fresh kernel.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir=f'{output_dir}/logs',
    logging_steps=100,
    label_names=["labels"],  # Explicitly define label names
    # With a custom collate function, Trainer's default column pruning drops
    # every per-sample key that is not a parameter of the model's forward()
    # before the collator sees it — the likely cause of the
    # `KeyError: 'input_ids'` recorded in this notebook.
    remove_unused_columns=False,
)


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /mnt/e/NTU-DLWeek2025/Llama-encoder-1.0B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


In [None]:
# Build the dataset, then carve out a held-out evaluation split.
dataset = BiasDataset(data_dir, tokenizer, max_length=512)

n_total = len(dataset)
if n_total < 2:
    raise ValueError(f"Need at least 2 samples to split into train/eval, got {n_total}")

# round() instead of int(): (1 - 0.9) * 1000 evaluates to 99.999... in floating
# point, which int() would truncate to 99. Always keep at least one eval sample.
eval_len = max(1, round((1 - split_ratio) * n_total))

# Explicit seed so the train/eval membership is reproducible across kernels.
split_seed = 42
train_data, eval_data = random_split(
    dataset,
    [n_total - eval_len, eval_len],
    generator=torch.Generator().manual_seed(split_seed),
)
assert len(train_data) + len(eval_data) == n_total

# Plain DataLoaders over the two splits, for manual iteration/inspection.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)
eval_dataloader = DataLoader(eval_data, batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn)


In [11]:
# Wrap the PEFT model exactly once. The guard keeps this cell idempotent:
# re-running it no longer stacks a wrapper around an already-wrapped model.
# ClassificationWrapper comes from the import cell at the top of the notebook;
# for live edits to classification.py use `%autoreload 2` rather than a manual
# reload() here (a reload would also change the class identity and defeat the
# isinstance check).
if not isinstance(model, ClassificationWrapper):
    model = ClassificationWrapper(model)

# Defining trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,   # same objects the DataLoaders wrap
    eval_dataset=eval_data,
    data_collator=custom_collate_fn,
)

trainer.train()

labels! yay! tensor([0.4000, 0.6000])
labels! yay! tensor([0.6200, 0.3800])
labels! yay! tensor([0.5300, 0.4700])
labels! yay! tensor([0.6100, 0.3900])


KeyError: 'input_ids'

In [None]:
# Persist the fine-tuned model first, then the tokenizer, into the same output
# directory.
# NOTE(review): by this point `model` may be the ClassificationWrapper —
# confirm that it exposes save_pretrained().
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)