In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, PreTrainedTokenizer
from trl import SFTTrainer
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
import torch
import pandas as pd

In [2]:
class QADataset(Dataset):
    def __init__(self, tokenized_data):
        self.data = tokenized_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def tokenize_data(dataframe, tokenizer: PreTrainedTokenizer, max_seq_length: int, device: str):
    tokenized_data = []
    for _, row in dataframe.iterrows():
        question = row['Question']
        answer = row['Answer']

        try:
            # Tokenize the question
            inputs = tokenizer(
                question,
                padding='max_length',
                truncation=True,
                max_length=max_seq_length,
                return_tensors="pt"
            )

            # Tokenize the answer as labels
            labels = tokenizer(
                f"{question}: {answer}",
                padding='max_length',
                truncation=True,
                max_length=max_seq_length,
                return_tensors="pt"
            )['input_ids'].squeeze(0)  # shape: [max_seq_length]

            tokenized_data.append({
                'input_ids': inputs['input_ids'].squeeze(0).to(device),
                'attention_mask': inputs['attention_mask'].squeeze(0).to(device),
                'labels': labels.to(device)
            })
        except Exception as e:
            print(f"Error tokenizing: {e}")
            continue

    return tokenized_data

In [3]:
MODEL_NAME = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

max_seq_length = 512
device = "mps"
dataframe = pd.read_csv('dataset.csv')
tokenized_data = tokenize_data(dataframe, tokenizer, max_seq_length, device)
qa_dataset = QADataset(tokenized_data)

Error tokenizing: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).


In [4]:
training_args = TrainingArguments(
    output_dir="./output",                 # Output directory
    save_strategy="steps",               # Save checkpoints during training
    save_steps=500,                        # Save checkpoint every 500 steps
    save_total_limit=2,                    # Save up to 2 checkpoints
    per_device_train_batch_size=8,         # Batch size for training
    per_device_eval_batch_size=8,          # Batch size for evaluation
    gradient_accumulation_steps=4,         # Accumulate gradients
    num_train_epochs=3,                    # Number of epochs
    learning_rate=5e-5,                    # Learning rate
    warmup_steps=500,                      # Warm-up steps
    weight_decay=0.01,                     # Weight decay
    fp16=torch.cuda.is_available(),        # Use mixed precision training if GPU is available
    push_to_hub=False,                      # Push model to the Hugging Face Hub if set to True
    report_to="none"
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=qa_dataset,
    tokenizer=tokenizer
)


  trainer = SFTTrainer(


In [5]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/459 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

print("Model fine-tuned and saved to ./fine_tuned_model")