In [None]:
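# Uncomment to install the dependencies. `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
# %pip install peft wandb dataloader datasets huggingface_hub trl flash_attn bitsandbytes
# %pip install -U accelerate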

# Fine-Tuning Setup for Models

This notebook sets up and runs LoRA fine-tuning of a 4-bit-quantized causal language model using Hugging Face Transformers and PEFT (Parameter-Efficient Fine-Tuning).

### Key Libraries:
- **Transformers**: Model loading, tokenization, and training utilities.
- **PEFT**: Lightweight LoRA fine-tuning for large models.
- **TRL**: Provides `setup_chat_format`, which prepares the model and tokenizer for chat-style prompts used in instruction tuning.
- **Datasets**: Efficient data loading and iteration.
- **Huggingface Hub**: Authentication for model and dataset uploads.
- **Weights & Biases**: Experiment tracking.

### Main Components:
- Load pretrained model and tokenizer.
- Apply LoRA configuration for efficient fine-tuning.
- Set up chat-friendly format if needed.
- Prepare training arguments and trainer for model fine-tuning.




In [None]:
import os

import torch
import wandb
from datasets import IterableDataset, load_dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)
from trl import setup_chat_format

In [None]:
# Ask PyTorch to release cached GPU memory it is not currently using, so the
# model loaded further down starts from as much free memory as possible.
torch.cuda.empty_cache()

In [None]:
# Shell out to `nvidia-smi` to show the available GPU(s) and their current memory usage.
!nvidia-smi


In [None]:
"""Make sure you have enough GPU memory to run this notebook. If you have 24GB of GPU memory, you can run the model with 4-bit quantization. If you have 48GB of GPU memory, you can run the model with 8-bit quantization. If you have 80GB of GPU memory, you can run the model with 16-bit quantization. If you have 128GB of GPU memory, you can run the model with 32-bit quantization."""

In [None]:
class CustomTrainer(Trainer):
    """Trainer for batches shaped as ``[(input_ids, labels, attention_mask)]``.

    Both overrides unpack the first (and only) element of the batch list
    produced by ``custom_data_collator`` and pass the three tensors to the
    model explicitly.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the loss computed by the model.

        Args:
            model: Model called as ``model(input_ids, labels=..., attention_mask=...)``.
            inputs: Sequence whose first element is an
                ``(input_ids, labels, attention_mask)`` tuple.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted so the signature matches what the
                base ``Trainer`` passes; not used here.

        Returns:
            ``outputs.loss``, or ``(outputs.loss, outputs)`` when
            ``return_outputs`` is true.
        """
        input_ids, labels, attention_mask = inputs[0]
        outputs = model(input_ids, labels=labels, attention_mask=attention_mask)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

    def prediction_step(self, model, inputs, prediction_loss_only: bool, ignore_keys=None):
        """Evaluate one batch without tracking gradients.

        Args:
            model: Model to evaluate.
            inputs: Sequence whose first element is an
                ``(input_ids, labels, attention_mask)`` tuple.
            prediction_loss_only: When true, only the loss is returned and the
                logits/labels slots are ``None`` (avoids holding full-vocabulary
                logits in memory during evaluation).
            ignore_keys: Accepted for signature compatibility; not used here.

        Returns:
            ``(loss, logits, labels)``, or ``(loss, None, None)`` when
            ``prediction_loss_only`` is true.
        """
        # The base Trainer prepares (device-places) inputs inside its own
        # prediction_step; since that method is replaced here, do it explicitly.
        inputs = self._prepare_inputs(inputs)
        input_ids, labels, attention_mask = inputs[0]

        with torch.no_grad():
            outputs = model(input_ids, labels=labels, attention_mask=attention_mask)

        loss = outputs.loss
        if prediction_loss_only:
            return (loss, None, None)
        return (loss, outputs.logits, labels)

In [None]:
class TextDatasetQA(Dataset):
    """
    Map-style dataset that turns question/answer records into fixed-length
    training examples for causal-LM fine-tuning.

    Each record must provide a 'question' and an 'answers' field. The prompt
    (system + user turn) and the answer are joined with `<|im_start|>` /
    `<|im_end|>` markers, tokenized, and padded to `max_length`. Labels are
    set to -100 for every prompt position and every padding position, so only
    the answer (plus one trailing EOS token when there is room) is scored.

    Args:
        tokenizer: Callable tokenizer exposing `tokenize`, `eos_token_id`, and
            returning 'input_ids' / 'attention_mask' when called.
        max_length: Length every example is truncated or padded to.
        data_files: Path passed to `load_dataset('json', data_files=...)`.
    """
    def __init__(self, tokenizer, max_length=512, data_files='./Dataset/data_v2.json'):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Only the `load_dataset` function is imported at the top of the
        # notebook (not the `datasets` module), so it is called directly.
        self.forget_data = load_dataset('json', data_files=data_files)['train']
        # Alternative source:
        # self.forget_data = load_dataset('locuslab/TOFU', name='retain90', split='train')

    def __len__(self):
        """Number of question/answer records."""
        return len(self.forget_data)

    def __getitem__(self, idx):
        """Build the training example for record `idx`.

        Returns:
            A one-element list holding an `(input_ids, labels, attention_mask)`
            tuple of 1-D tensors, each of length `max_length`.
        """
        # Note: the global torch RNG is intentionally left alone here; nothing
        # in this method is random, and reseeding per item would interfere
        # with randomness used elsewhere during training (e.g. dropout).
        record = self.forget_data[idx]
        question = record['question']
        answer = record['answers']
        system_prompt = """
        You are a model which is optimized for NER Tagging. The input you receive should be processed word by word, returning the NER tag for each word. You have the following tags available:
        - 'O' for words that are outside of any named entity.
        - 'B-PER' for the beginning of a person's name.
        - 'I-PER' for subsequent words in a person's name.
        - 'B-ORG' for the beginning of an organization's name.
        - 'I-ORG' for subsequent words in an organization's name.
        - 'B-LOC' for the beginning of a location name.
        - 'I-LOC' for subsequent words in a location name.
        - 'B-MISC' for the beginning of a miscellaneous entity.
        - 'I-MISC' for subsequent words in a miscellaneous entity.
        Please return the NER tags for each word in the input you process.
        """

        pre_text = f"""<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"""
        post_text = f"""{answer}<|im_end|>\n"""
        full_text = pre_text + post_text

        # Number of leading tokens that belong to the prompt and must not be scored.
        non_predict = len(self.tokenizer.tokenize(pre_text, add_special_tokens=True))

        encoded = self.tokenizer(
            full_text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True
        )
        # Copy so that masking the labels can never alias the tokenizer output.
        input_ids = list(encoded['input_ids'])
        attention_mask = list(encoded['attention_mask'])

        eos_id = self.tokenizer.eos_token_id
        pad_length = self.max_length - len(input_ids)
        pad_input_ids = input_ids + [eos_id] * pad_length
        pad_attention_mask = attention_mask + [0] * pad_length
        if pad_length > 0:
            # Score exactly one EOS after the text; the rest of the padding is ignored.
            label = input_ids + [eos_id] + [-100] * (pad_length - 1)
        else:
            label = list(input_ids)

        # Mask the prompt. Clamp to the label length: if truncation cut the
        # example inside the prompt, the whole example is masked instead of
        # raising IndexError.
        masked = min(non_predict, len(label))
        label[:masked] = [-100] * masked

        return [(torch.tensor(pad_input_ids), torch.tensor(label), torch.tensor(pad_attention_mask))]

In [None]:
def custom_data_collator(samples):
    """
    Collate dataset items into a single batch for the Trainer.

    Every item in `samples` is a one-element list wrapping an
    (input_ids, labels, attention_mask) tuple. The result keeps that same
    wrapping: a one-element list whose tuple holds the three stacked tensors.
    """
    # Unwrap the single tuple carried by each sample.
    triples = [sample[0] for sample in samples]
    # Stack field 0 (input_ids), 1 (labels) and 2 (attention_mask) across the batch.
    batch = tuple(
        torch.stack([triple[field] for triple in triples])
        for field in range(3)
    )
    return [batch]

In [None]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. Set HF_TOKEN and WANDB_API_KEY before starting the kernel; when a
# variable is unset, None is passed and the library falls back to its own
# login flow.
# SECURITY: any token that was previously hard-coded in this cell must be
# treated as leaked and revoked.
login(token=os.environ.get("HF_TOKEN"))
wb_token = os.environ.get("WANDB_API_KEY")

wandb.login(key=wb_token)
run = wandb.init(
    project="Run the specificModel", job_type="training", anonymous="allow"
)

In [None]:
# Hugging Face Hub identifier of the pretrained checkpoint to fine-tune.
base_model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Name under which the fine-tuned result is written (used below as the
# training output directory and as the save path).
new_model = "deepseek_r1_1.5b_pos"

In [None]:
# GPUs with compute capability major version >= 8 get bfloat16 together with
# FlashAttention-2; anything older falls back to float16 and eager attention.
torch_dtype, attn_implementation = (
    (torch.bfloat16, "flash_attention_2")
    if torch.cuda.get_device_capability()[0] >= 8
    else (torch.float16, "eager")
)

# 4-bit NF4 quantization with double quantization; compute runs in the dtype chosen above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [None]:
"""Loading the model and tokenizer"""

# Load the tokenizer that belongs to `base_model`.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Drop any chat template shipped with the tokenizer.
# NOTE(review): presumably needed so that `setup_chat_format` (called in a later
# cell) can install its own template — confirm against the TRL documentation.
tokenizer.chat_template = None

In [None]:
# Load the base checkpoint with the 4-bit settings from `bnb_config`.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    # Load the non-quantized modules in the same dtype that was selected for
    # quantized compute and attention, instead of leaving it to the default.
    torch_dtype=torch_dtype,
    device_map="auto",
    attn_implementation=attn_implementation
)
# The key/value cache is switched off for training; it is re-enabled after training.
model.config.use_cache = False

# LoRA adapter settings: rank-8 adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down), with no bias training.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

In [None]:
"""Some Errors were fixed here"""

# `setup_chat_format` returns a (model, tokenizer) pair, so both names are rebound here.
# NOTE(review): presumably this registers the chat special tokens matching the
# `<|im_start|>` / `<|im_end|>` markers that TextDatasetQA writes by hand, and may
# change the tokenizer's pad/eos tokens — confirm against the TRL documentation.
model, tokenizer = setup_chat_format(model, tokenizer)
# Wrap the model with the LoRA adapters described by `peft_config`.
model = get_peft_model(model, peft_config)

In [None]:
max_length = 500
torch_format_dataset = TextDatasetQA(tokenizer=tokenizer, max_length=max_length)

# The trainer is fed an iterable dataset, so the training length has to be
# given as a number of optimizer steps. These values must mirror the
# TrainingArguments cell (per_device_train_batch_size=8,
# gradient_accumulation_steps=4); the previous divisor (2*4*1) did not, which
# made training run about four times longer than the intended 7 epochs.
num_epochs = 7
per_device_batch_size = 8
grad_accum_steps = 4
num_devices = 1
examples_per_step = per_device_batch_size * grad_accum_steps * num_devices
# At least one step, so a very small dataset does not produce max_steps == 0.
max_steps = max(1, (num_epochs * len(torch_format_dataset)) // examples_per_step)

In [None]:
# Hyperparameters for the fine-tuning run.
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    # A positive max_steps (set below) takes precedence over num_train_epochs;
    # the epoch count is kept for documentation of the intended schedule.
    num_train_epochs=7,
    # NOTE(review): no evaluation strategy is set in this call, so eval_steps
    # may have no effect — confirm whether periodic evaluation is wanted.
    eval_steps=0.2,
    logging_steps=5,
    warmup_steps=max_steps//10,
    max_steps=max_steps,
    logging_strategy="steps",
    learning_rate=1e-4,
    # Follow the dtype chosen from the GPU's compute capability instead of
    # hard-coding bf16, which pre-Ampere GPUs (float16 branch) do not support.
    fp16=torch_dtype == torch.float16,
    bf16=torch_dtype == torch.bfloat16,
    report_to="wandb"
)

In [None]:
def data_gen():
    """Yield every item of `torch_format_dataset`, in index order."""
    yield from (
        torch_format_dataset[position]
        for position in range(len(torch_format_dataset))
    )


# Expose the map-style dataset to the trainer as a lazily generated iterable dataset.
torch_format_dataset_it = IterableDataset.from_generator(data_gen)

# The same iterable is used for both training and evaluation.
trainer = CustomTrainer(
    model=model,
    args=training_arguments,
    data_collator=custom_data_collator,
    train_dataset=torch_format_dataset_it,
    eval_dataset=torch_format_dataset_it,
)

In [None]:
"""Train the model"""
trainer.train()
# Save the trained adapter weights under `new_model`.
trainer.model.save_pretrained(new_model)
# The tokenizer was rebound by `setup_chat_format`, so store it next to the
# weights; otherwise the saved model cannot be reloaded with the tokenizer it
# was trained with.
tokenizer.save_pretrained(new_model)
# Re-enable the key/value cache that was disabled for training.
model.config.use_cache = True
# Close the Weights & Biases run so its logs are flushed and it is marked finished.
wandb.finish()