# Dependencies

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Switch into the project folder on Drive. The path is relative, so this
# presumably relies on the kernel's working directory being /content
# (the recorded output shows /content/drive/MyDrive/LLMs/Fine-tuning/DPO).
%cd drive/MyDrive/LLMs/Fine-tuning/DPO

Mounted at /content/drive
/content/drive/MyDrive/LLMs/Fine-tuning/DPO


In [None]:
!pip install peft==0.5.0 -qqq
!pip install bitsandbytes==0.41.1 -qqq
!pip install safetensors>=0.3.1 -qqq
!pip install wandb -qqq
!pip install tokenizers>=0.13.3 -qqq
!pip install -U transformers -qqq
!pip install accelerate==0.21.0 -qqq
!pip install git+https://github.com/huggingface/trl -qqq

!python -c "import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'"
!pip install ninja packaging
!pip install flash-attn --no-build-isolation

In [None]:
"""
This cell imports various libraries and modules to train a language model using the Hugging Face Transformers library.
"""

# Importing necessary libraries and modules
import os  # Module for interacting with the operating system
import warnings  # Module for managing warnings
from collections import defaultdict  # Default dictionary implementation
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union  # Type hinting for variables and functions

import torch  # PyTorch library for deep learning
import torch.nn as nn  # Neural network module of PyTorch
import torch.nn.functional as F  # Functional interface to PyTorch's functions
import datasets  # Library for managing datasets
from datasets import Dataset, load_dataset  # Dataset loading and management
import transformers  # Hugging Face's Transformers library
from transformers import (  # Various classes and functions from Transformers library
    AutoTokenizer,  # Auto tokenizer for model-specific tokenization
    AutoModelForCausalLM,  # Pre-trained model for causal language modeling
    DataCollator,  # Data collator for processing input data
    PreTrainedModel,  # Pre-trained model from Hugging Face's models
    PreTrainedTokenizerBase,  # Base class for tokenizers
    Trainer,  # Trainer class for training models
    TrainingArguments,  # Arguments for training the model
    DataCollatorForLanguageModeling,  # Data collator for language modeling
    BitsAndBytesConfig  # Configuration for handling bits and bytes
)

from transformers.trainer_callback import TrainerCallback  # Callback for Trainer class

import gc  # Garbage collection module for managing memory usage

import os  # Module for interacting with the operating system
from google.colab import runtime  # Google Colab runtime module for managing environment
import pandas as pd  # Data manipulation and analysis library

import accelerate  # Library for optimizing deep learning training
import bitsandbytes as bnb  # Bits and Bytes library for managing bits and bytes
import wandb  # Library for tracking and visualizing machine learning experiments
from peft import (  # Library for training and fine-tuning models
    LoraConfig,  # Configuration for Lora
    get_peft_model,  # Function for getting a PEFT model
    prepare_model_for_kbit_training,  # Function for preparing model for KBIT training
    PeftModel,  # PEFT model
    PeftConfig  # Configuration for PEFT
)
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM  # TRL library components
from datetime import datetime  # Date and time manipulation
from huggingface_hub import login  # Logging into Hugging Face's model hub

from peft.tuners.lora import LoraLayer  # LoraLayer for PEFT tuning

from tqdm import tqdm  # Progress bar module for tracking progress

import trl  # TRL library
from trl import DPOTrainer  # DPOTrainer for training
from trl.models import create_reference_model  # Creating a reference model
from trl.import_utils import is_peft_available  # Checking if PEFT is available
from trl.trainer.dpo_trainer import (  # DPO trainer components
    DPODataCollatorWithPadding,  # Data collator with padding for DPO training
    disable_dropout_in_model,  # Disabling dropout in the model
    pad_to_length  # Padding input data to a specified length
)

from huggingface_hub import login  # Logging into Hugging Face's model hub
from random import sample  # Sampling from a collection

from trl.trainer.dpo_trainer import (  # DPO trainer components (repeated import)
    DPODataCollatorWithPadding,  # Data collator with padding for DPO training
    disable_dropout_in_model,  # Disabling dropout in the model
    pad_to_length  # Padding input data to a specified length
)

In [None]:
from getpass import getpass  # Prompts for input without echoing it

# Credentials are collected interactively so they are never written into the
# notebook source. Both values are later forwarded to the training runs.

# Weights & Biases (wandb) API token
wandb_token = getpass('input wandb token: ')

# Hugging Face (hf) access token
hf_token = getpass('input hf token: ')

# Form Dataset

## Definitions

In [None]:
#def chosen_rejected(example):
#    """
#    Given an example with answers and their scores, this function determines the chosen and rejected answers based on the scores.
#
#    Args:
#        example (dict): A dictionary containing 'answers.score' (a list of scores) and 'answers.text' (a list of corresponding answers).
#
#    Returns:
#        dict: A dictionary with 'chosen' and 'rejected' keys, representing the chosen and rejected answers respectively.
#    """
#    scores = example['answers.score']
#    answers = example['answers.text']
#
#    if scores[0] > scores[1]:
#        return {'chosen': answers[0], 'rejected': answers[1]}
#    else:
#        return {'chosen': answers[1], 'rejected': answers[0]}


def chosen_rejected(x):
    """
    Split a pair of scored answers into the preferred and the dispreferred one.

    The pair is read from the 'scores' / 'answers' keys when both are present,
    otherwise from 'answers.score' / 'answers.text'. Supporting both layouts
    lets the function be mapped over datasets that still carry the raw
    'answers.*' columns as well as over already-paired rows.

    Args:
        x (dict): A row holding two scores and the two corresponding answers,
            under either ('scores', 'answers') or ('answers.score', 'answers.text').

    Returns:
        dict: A dictionary with 'chosen', 'rejected', 'score_accepted', and 'score_rejected' keys.

    Raises:
        KeyError: If neither key layout is present in `x`.
    """
    if 'scores' in x and 'answers' in x:
        scores, answers = x['scores'], x['answers']
    else:
        scores, answers = x['answers.score'], x['answers.text']

    # Strict comparison: on a tie the second answer is treated as the chosen one.
    if scores[0] > scores[1]:
        accepted, rejected = 0, 1
    else:
        accepted, rejected = 1, 0

    return {'chosen': answers[accepted],
            'rejected': answers[rejected],
            'score_accepted': scores[accepted],
            'score_rejected': scores[rejected]}

def format_prompt(example):
    """
    Wrap a question in the '### Human / ### Assistant' conversation template.

    Args:
        example (dict): A row whose 'title_body' entry holds the question text.

    Returns:
        str: The question as the human turn, followed by an empty assistant turn
        that the model is expected to complete.
    """
    return f"### Human: {example['title_body']}\n ### Assistant: "


def reformat_dataset(ds, tokenizer):
    """
    Turn a two-answer dataset into preference-modeling rows.

    Each row gets the columns produced by `chosen_rejected`, loses the
    'answers.score', 'answers.text' and 'title_body' columns, and gains a
    'length' column: the token count of the longer of the two answers plus
    the token count of the prompt.

    Args:
        ds (datasets.Dataset): The dataset to be reformatted. Rows must carry a
            'prompt' column in addition to the columns removed here.
        tokenizer (Huggingface Tokenizer): Tokenizer used to count tokens.

    Returns:
        datasets.Dataset: The reformatted dataset.
    """
    def _token_count(text):
        # Number of input ids the tokenizer emits for `text`.
        return len(tokenizer(text)['input_ids'])

    def _row_length(row):
        # Worst case over the two completions, then add the shared prompt.
        longest_answer = max(_token_count(row[column]) for column in ['chosen', 'rejected'])
        return longest_answer + _token_count(row['prompt'])

    # Pick the preferred / dispreferred answer for every row.
    with_preferences = ds.map(lambda row: chosen_rejected(row))

    # Drop the raw columns that are no longer needed.
    trimmed = with_preferences.remove_columns(['answers.score', 'answers.text', 'title_body'])

    # Attach the combined token length.
    return trimmed.map(lambda row: {'length': _row_length(row)})

def choose_random_answers(example):
    """
    Draw two distinct answers, with their scores, from a row.

    Args:
        example (dict): A row with 'answers.score' (list of scores) and
            'answers.text' (list of the corresponding answers).

    Returns:
        dict: 'answers.score' and 'answers.text', each reduced to the two
        sampled entries, in sampling order.
    """
    all_scores = example['answers.score']
    all_answers = example['answers.text']

    # Sample positions (not values) so scores and answers stay aligned.
    first, second = sample(range(len(all_scores)), 2)

    return {
        'answers.score': [all_scores[first], all_scores[second]],
        'answers.text': [all_answers[first], all_answers[second]],
    }


## Pairs of answers

In [None]:
def create_paired_dataset(model_id='meta-llama/Llama-2-7b-hf', hf_token=None):
    """
    Build three paired-preference datasets from the ELI5 reward-model data and save them to disk.

    The RM dataset is pulled from the W&B artifact
    'ft-llmmm/ELI5_analysis/ELI5_RM_non_toxic:latest', reduced to the
    'answers.score', 'answers.text' and 'title_body' columns, and given a
    'prompt' column. Three pairings are then derived per question:

    * 'top_2'    - the first two answers,
    * 'contrast' - the first and the last answer,
    * 'random'   - two randomly sampled answers.

    Each pairing is saved twice: unfiltered to './data/ds_RM_{key}' and,
    restricted to rows whose 'length' is at most 1024, to
    './data/ds_RM_{key}_1024'.

    Args:
        model_id (str, optional): Hugging Face model identifier whose tokenizer is
            used to measure lengths. Defaults to 'meta-llama/Llama-2-7b-hf'.
        hf_token (str, optional): Hugging Face access token forwarded to the
            tokenizer download. Defaults to None, which leaves authentication
            to whatever credentials are already configured.

    Returns:
        None
    """
    max_length = 1024  # token budget used for the filtered copies

    with wandb.init(project='DPO_training_dm',
                    entity='ft-llmmm',
                    job_type='download_data',
                    name='download_data') as run:
        # Record the artifact as an input of this run and fetch it locally.
        artifact = run.use_artifact('ft-llmmm/ELI5_analysis/ELI5_RM_non_toxic:latest',
                                    type='dataset')
        artifact_dir = artifact.download()

    ds_RM = datasets.load_from_disk(artifact_dir)
    keep = ['answers.score', 'answers.text', 'title_body']
    features = list(ds_RM['train'].features)
    ds_RM = ds_RM.remove_columns([col for col in features if col not in keep])

    ds_RM = ds_RM.map(lambda x: {'prompt': format_prompt(x)})

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

    # First two answers of every question.
    ds_RM_top_2 = ds_RM.map(lambda x: {'answers.score': x['answers.score'][:2],
                                       'answers.text': x['answers.text'][:2]})
    ds_RM_top_2 = reformat_dataset(ds_RM_top_2, tokenizer)

    # First and last answer of every question.
    ds_RM_contrast = ds_RM.map(lambda x: {'answers.score': [x['answers.score'][i] for i in [0, -1]],
                                          'answers.text': [x['answers.text'][i] for i in [0, -1]]})
    ds_RM_contrast = reformat_dataset(ds_RM_contrast, tokenizer)

    # Two randomly chosen answers of every question.
    ds_RM_random = ds_RM.map(choose_random_answers)
    ds_RM_random = reformat_dataset(ds_RM_random, tokenizer)

    ds_RM_paired = {
        'top_2': ds_RM_top_2,
        'contrast': ds_RM_contrast,
        'random': ds_RM_random,
    }

    ds_RM_paired_filt = {}

    for key, paired in ds_RM_paired.items():
        # The filtered copies go into ds_RM_paired_filt; the previous code wrote
        # to an undefined name (ds_RM_filt) and failed with a NameError here.
        ds_RM_paired_filt[key] = paired.filter(lambda x: x['length'] <= max_length)

        paired.save_to_disk(f'./data/ds_RM_{key}')
        ds_RM_paired_filt[key].save_to_disk(f'./data/ds_RM_{key}_{max_length}')


In [None]:
# Build the top_2 / contrast / random paired datasets with the default tokenizer
# and write them under ./data.
create_paired_dataset()

## Weighted Answers

In [None]:
def multiple_pairs_of_answers(x, max_pairs=10):
    """
    Build every unordered pair of answers (with scores) for one question.

    At most `max_pairs` pairs are kept. When a question has more, a random
    subset of whole pairs is drawn, so each score tuple always belongs to the
    answer tuple at the same position.

    Args:
        x (dict): A dictionary containing 'answers.score' (a list of scores), 'answers.text' (a list of corresponding answers), and 'prompt' (the prompt text).
        max_pairs (int, optional): Upper bound on the number of pairs returned. Defaults to 10.

    Returns:
        dict: Parallel lists of equal length, one entry per pair:
            'prompt'  - the prompt, repeated,
            'scores'  - (score_a, score_b) tuples,
            'answers' - (answer_a, answer_b) tuples,
            'weight'  - 1 / number_of_pairs for every entry.
        All four lists are empty when fewer than two answers are given.
    """
    # Imported here because the notebook's import cell only provides
    # `random.sample`, not `random` itself or `itertools.combinations`.
    import random
    from itertools import combinations

    scored_answers = zip(x['answers.score'], x['answers.text'])
    pairs = list(combinations(scored_answers, 2))

    if len(pairs) > max_pairs:
        # Sample (score, answer) pairs jointly; sampling scores and answers
        # separately would pair scores with unrelated answers.
        pairs = random.sample(pairs, max_pairs)

    num_pairs = len(pairs)

    return {'prompt': [x['prompt'] for _ in range(num_pairs)],
            'scores': [(first[0], second[0]) for first, second in pairs],
            'answers': [(first[1], second[1]) for first, second in pairs],
            # Comprehension (not list * n) so nothing is divided when num_pairs == 0.
            'weight': [1. / num_pairs for _ in range(num_pairs)]}

In [None]:
# Expand every question into its answer pairs, then flatten so that each pair
# becomes its own row.
# NOTE(review): in the visible cells `ds_RM` is only assigned inside
# `create_paired_dataset`, so it has to exist as a notebook global already for
# this cell to run - confirm where it comes from.
# NOTE(review): map() is called without remove_columns here (the commented-out
# variant in the next cell passes it); verify the original columns do not get
# in the way of Dataset.from_dict below.
ds_RM_multiple_pairs = ds_RM.map(multiple_pairs_of_answers)
for split in ['train','validation','test']:
    print(f'working on split {split}')
    length = len(ds_RM_multiple_pairs[split])
    # Every mapped row holds parallel per-pair lists; each row is turned into a
    # small dataset and the pieces are concatenated back into one split.
    ds_RM_multiple_pairs[split] = datasets.concatenate_datasets([Dataset.from_dict(ds_RM_multiple_pairs[split][i])\
                               for i in range(length)])

tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf',
                                          token = hf_token)
# NOTE(review): `ds_RM_paired` is likewise only a local of
# `create_paired_dataset` in the visible cells, and the return value of
# reformat_dataset is discarded here. Presumably this was meant to process
# `ds_RM_multiple_pairs` and keep/save the result - TODO confirm.
reformat_dataset(ds_RM_paired,tokenizer)

In [None]:
#ds_RM_multiple_pairs = (ds_RM.map(multiple_pairs_of_answers,
#                          remove_columns=['answers.score',
#                                          'answers.text',
#                                          'title_body']))

#def add_length_index(ds, tokenizer):
#    """
#    Add a 'lengths' field to each example in the dataset, representing the total length of chosen and rejected answers along with the prompt.
#
#    Args:
#        ds (datasets.Dataset): The dataset to which the 'lengths' field will be added.
#        tokenizer (transformers.AutoTokenizer): The tokenizer used for tokenizing text.
#
#    Returns:
#        datasets.Dataset: The modified dataset with the 'lengths' field added to each example.
#    """
#    def tot_length(example):
#        longer_answer = max(len(tokenizer(example[key])['input_ids']) for key in ['chosen', 'rejected'])
#        tot_length = longer_answer + len(tokenizer(example['prompt'])['input_ids'])
#        return tot_length
#
#    ds = ds.map(lambda x: {'lengths': tot_length(x)})
#    return ds

#tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf',
#                                          token = hf_token)
#
#ds_RM_paired = ds_RM_paired.map(lambda x:chosen_rejected(x))
#ds_RM_paired = ds_RM_paired.remove_columns(['scores','answers'])
#ds_RM_paired = add_length_index(ds_RM_paired,tokenizer)
#ds_RM_paired.save_to_disk('./data/ds_RM_paired')

# Filter by Toxicity

In this section we'll filter the dataset by toxicity so that toxic content always appears in the rejected column. This is a work in progress and more code will appear here later.

# Training


In [None]:
# The Hugging Face model ids can get fairly long, so the following dictionary
# maps each one to a shorter name used for run, directory and repo names.

model_name_simplifier = {
    'dhmeltzer/llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged': 'llama-7b-SFT-qlora-eli5-wiki',
    'dhmeltzer/llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged': 'llama-7b-SFT-qlora-eli5',
    'dhmeltzer/llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged': 'llama-7b-SFT-qlora-wiki',
}

## Single Pair of Answers

### No Margin

In [None]:
# DPO run on the "top_2" pairs (length-filtered to 1024), launched through
# run_dpo.py. Run/repo/directory names are derived from the shortened model
# name and the last component of the dataset path.
model_id = 'dhmeltzer/llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged'
model_name = model_name_simplifier[model_id]+'_DPO'

dataset_path = './data/ds_RM_top_2_1024'
optim = 'paged_adamw_8bit'

# Timestamp makes the run name unique across repeated launches.
now = datetime.now()
time_stamp = now.strftime("%m.%d.%y-%H.%M.%S")

ds_name = dataset_path.split('/')[-1]
output_dir = f'./{model_name}_{ds_name}/models'
logging_dir = f'{output_dir}/logs'

run_name = f'{model_name}_{ds_name}_{time_stamp}'
# NOTE(review): `optim` is assigned the same value a second time here.
optim = 'paged_adamw_8bit'

from pathlib import Path
Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(logging_dir).mkdir(parents=True, exist_ok=True)

repo_id = f'{model_name}_{ds_name}'

# IPython substitutes the {name} placeholders with the notebook variables above.
# NOTE(review): the HF and W&B tokens are passed as plain command-line
# arguments - confirm this is acceptable for the environment it runs in.
!python ./run_dpo.py \
--output_dir {output_dir} \
--logging_dir {logging_dir} \
--model_id {model_id} \
--dataset_path {dataset_path} \
--run_name {run_name} \
--repo_id {repo_id} \
--report_to_wandb 1 \
--epochs 2 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--gradient_accumulation_steps 4 \
--optim {optim} \
--hf_token {hf_token} \
--wandb_token {wandb_token} \
--use_flash_attention 1 \
--logging_steps 10 \
--resume_from_checkpoint 0 \
--auto_find_batch_size 0 \
--beta .2

In [None]:
model_id = model_id
model_name = model_name_simplifier[model_id]+'_DPO'

dataset_path = './data/ds_RM_contrast_1024'
optim = 'paged_adamw_8bit'

now = datetime.now()
time_stamp = now.strftime("%m.%d.%y-%H.%M.%S")

ds_name = dataset_path.split('/')[-1]
output_dir = f'./{model_name}_{ds_name}/models'
logging_dir = f'{output_dir}/logs'

run_name = f'{model_name}_{ds_name}_{time_stamp}'
optim = 'paged_adamw_8bit'

from pathlib import Path
Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(logging_dir).mkdir(parents=True, exist_ok=True)

repo_id = f'{model_name}_{ds_name}'

!python ./run_dpo.py \
--output_dir {output_dir} \
--logging_dir {logging_dir} \
--model_id {model_id} \
--dataset_path {dataset_path} \
--run_name {run_name} \
--repo_id {repo_id} \
--report_to_wandb 1 \
--epochs 1 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--gradient_accumulation_steps 4 \
--optim {optim} \
--hf_token {hf_token} \
--wandb_token {wandb_token} \
--use_flash_attention 1 \
--logging_steps 10 \
--resume_from_checkpoint 0 \
--auto_find_batch_size 0 \
--beta .2

In [None]:
# Ask Colab to unassign (release) the current runtime once the training cells
# above have finished, so the GPU is not held while idle.
from google.colab import runtime
runtime.unassign()

### Include Margin

## Multiple Pairs of Answers

### No Margin

### Margin

In [None]:
# DPO run on the multi-pair dataset ('./data/ds_RM_paired'), launched through
# run_dpo.py with the extra --rho / --include_margin options. The settings
# below (batch size 1, no W&B reporting, no flash attention) look like a
# smoke-test configuration - TODO confirm before a real run.
model_id = 'dhmeltzer/llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged'
#model_id = 'distilgpt2'

# Use the short alias when one is registered; otherwise fall back to the last
# path component of the model id (no '_DPO' suffix is added in that case).
if model_id in model_name_simplifier:
    model_name = model_name_simplifier[model_id]+'_DPO'
else:
    model_name = model_id.split('/')[-1]

dataset_path = './data/ds_RM_paired'
optim = 'paged_adamw_8bit'
# 0/1 flag forwarded to run_dpo.py; when truthy, '_margin' is appended to the
# run and repo names below.
include_margin = 0

# Timestamp makes the run name unique across repeated launches.
now = datetime.now()
time_stamp = now.strftime("%m.%d.%y-%H.%M.%S")

ds_name = dataset_path.split('/')[-1]
output_dir = f'./{model_name}_{ds_name}/models'
logging_dir = f'{output_dir}/logs'

run_name = f'{model_name}_{ds_name}_{time_stamp}'
# NOTE(review): `optim` is assigned the same value a second time here.
optim = 'paged_adamw_8bit'

from pathlib import Path
Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(logging_dir).mkdir(parents=True, exist_ok=True)

repo_id = f'{model_name}_{ds_name}'

if include_margin:
    run_name += '_margin'
    repo_id += '_margin'

# IPython substitutes the {name} placeholders with the notebook variables above.
# NOTE(review): the HF and W&B tokens are passed as plain command-line
# arguments - confirm this is acceptable for the environment it runs in.
!python ./run_dpo.py \
--output_dir {output_dir} \
--logging_dir {logging_dir} \
--model_id {model_id} \
--dataset_path {dataset_path} \
--run_name {run_name} \
--repo_id {repo_id} \
--report_to_wandb 0 \
--epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 1 \
--optim {optim} \
--hf_token {hf_token} \
--wandb_token {wandb_token} \
--use_flash_attention 0 \
--logging_steps 10 \
--resume_from_checkpoint 0 \
--auto_find_batch_size 0 \
--beta .2 \
--rho 1. \
--include_margin {include_margin}

# Merging Weights

In [None]:
import torch
import peft
import json
import shutil
from peft.utils import _get_submodules
import os
import bitsandbytes as bnb
from bitsandbytes.functional import dequantize_4bit
from peft import PeftModel
from transformers import (AutoModelForCausalLM,
                          LlamaForCausalLM,
                          LlamaTokenizer,
                          BitsAndBytesConfig,
                          AutoTokenizer
)
import gc
import copy
from getpass import getpass

In [None]:
def dequantize_model(model, tokenizer, dtype=torch.bfloat16, device="cuda"):
    """
    Replace every bitsandbytes 4-bit linear layer of a model with a dense `torch.nn.Linear`.

    The model is modified in place: each `bnb.nn.Linear4bit` submodule is
    swapped for a regular linear layer holding the dequantized weights. The
    replacement layers are created without a bias term.

    Args:
        model (nn.Module): The model loaded with 4-bit (qlora-style) quantization.
        tokenizer: The corresponding Hugging Face tokenizer. Not used by this function.
        dtype (torch.dtype, optional): Data type of the dequantized weights. Default is torch.bfloat16.
        device (str, optional): Device the replacement layers are moved to. Default is "cuda".

    Returns:
        nn.Module: The same model object, now without 4-bit layers.
    """
    with torch.no_grad():
        for layer_name, layer in model.named_modules():
            # Only 4-bit quantized linear layers need converting.
            if not isinstance(layer, bnb.nn.Linear4bit):
                continue

            print(f"Dequantizing `{layer_name}`...")

            # Work on a copy so the layer's own quantization state is untouched,
            # and set the desired dtype for dequantization in it.
            state = copy.deepcopy(layer.weight.quant_state)
            state[2] = dtype

            dense_weight = dequantize_4bit(layer.weight.data, quant_state=state, quant_type="nf4").to(dtype)

            # Dense stand-in with the same shape, carrying the dequantized weights.
            dense_layer = torch.nn.Linear(layer.in_features, layer.out_features, bias=None, dtype=dtype)
            dense_layer.weight = torch.nn.Parameter(dense_weight)
            dense_layer.to(device=device, dtype=dtype)

            # Swap the quantized layer for the dense one on its parent module.
            parent, _, attr_name = _get_submodules(model, layer_name)
            setattr(parent, attr_name, dense_layer)

        # A hack to avoid Hugging Face's saving error, as it does not support
        # saving a model registered for 4-bit loading.
        model.is_loaded_in_4bit = False
        return model

def merge_weights(base_model_id,
                  adapter_model_id,
                  hf_token,
                  dtype=torch.bfloat16,
                  device="cuda"):
    """
    Merge a LoRA adapter into its base model and push the result to the Hugging Face Hub.

    The base model is loaded in 4-bit NF4, dequantized to `dtype`, combined
    with the adapter via `merge_and_unload`, and uploaded together with the
    base tokenizer to the repo '<adapter_model_id>_merged'.

    Args:
        base_model_id (str): Hugging Face model ID for the base model.
        adapter_model_id (str): Hugging Face model ID for the adapter model.
        hf_token (str): Hugging Face authentication token used to download the
            base model and tokenizer.
        dtype (torch.dtype, optional): Compute dtype for loading and target dtype
            of the dequantized weights. Default is torch.bfloat16.
        device (str, optional): Device the dequantized layers are placed on. Default is "cuda".

    Returns:
        None
    """
    # Repository that receives the merged model.
    repo_id = adapter_model_id+'_merged'

    # 4-bit NF4 with double quantization.
    quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=dtype,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

    # Quantization is requested through `quantization_config` only; passing
    # `load_in_4bit=True` as well is redundant, and recent transformers
    # releases reject the combination.
    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=dtype,
        quantization_config=quantization_config,
        device_map={"": 0},
        token=hf_token
    )

    # Tokenizer of the base model; it is pushed alongside the merged weights.
    tok = AutoTokenizer.from_pretrained(base_model_id,
                                        token=hf_token
                                        )

    # Turn the 4-bit layers back into dense ones, honouring the caller's
    # dtype/device (previously the defaults were always used).
    model = dequantize_model(model, tok, dtype=dtype, device=device)

    # Attach the adapter to the dequantized base model.
    model = PeftModel.from_pretrained(model=model, model_id=adapter_model_id)

    # Fold the adapter weights into the base weights and drop the PEFT wrapper.
    model = model.merge_and_unload()

    # Upload the merged model and tokenizer.
    model.push_to_hub(repo_id, safe_serialization=True)
    tok.push_to_hub(repo_id)

In [None]:
# Adapters to merge; judging by their names these are the DPO adapters trained
# on the contrast and top_2 datasets.
adapter_models = [
    'dhmeltzer/llama-7b-SFT-qlora-eli5-wiki_DPO_ds_RM_contrast_1024_r_64_alpha_16',
    'dhmeltzer/llama-7b-SFT-qlora-eli5-wiki_DPO_ds_RM_top_2_1024_r_64_alpha_16',
]

# Base model every adapter is merged into.
base_model_id = 'dhmeltzer/llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged'

# One merge (and hub upload) per adapter, always in bfloat16 on the GPU.
for adapter_model in adapter_models:
    merge_weights(
        base_model_id=base_model_id,
        adapter_model_id=adapter_model,
        hf_token=hf_token,
        dtype=torch.bfloat16,
        device="cuda",
    )

# Scratch

## older training

In [None]:
def DPO_training(model_id,
                ds_name,
                hf_token = None,
                wandb_token = None,
                gradient_checkpointing=True,
                r=64,
                lora_alpha=16,
                lora_dropout=0.1,
                beta=.1,
                bias='none',
                task_type='CAUSAL_LM',
                max_prompt_length=4096,
                max_length=4096,
                epochs = 1,
                max_steps = -1,
                lr=5e-4,
                weight_decay=.1,
                per_device_train_batch_size=16,
                per_device_eval_batch_size=32,
                gradient_accumulation_steps=8,
                optim='adamw_torch_fused',
                warmup_ratio=0.03,
                lr_scheduler_type='cosine',
                auto_find_batch_size = True,
                group_by_length=True,
                dataloader_num_workers=2,
                logging_steps=10,
                save_total_limit=3,
                save_strategy='steps',
                save_steps =.1,
                eval_steps=.1,
                load_best_model_at_end=True,
                project_name='DPO_training_dm',
                entity='ft-llmmm',
                torch_compile=False,
                length_column_name='lengths',
                truncation_mode='keep_start',
                repo_id = None,
                output_dir = None,
                hub_strategy = 'every_save'):
    """
    Fine-tune a 4-bit quantized causal LM with LoRA adapters using TRL's DPOTrainer.

    The model is loaded in 4-bit NF4, wrapped by `create_peft_model`, evaluated
    once, trained on the 'train' split, evaluated on 'validation', and saved
    to `output_dir`. When `repo_id` is given, checkpoints and the final model
    are also pushed to the Hugging Face Hub.

    Args:
        model_id (str): Hugging Face model ID of the model to fine-tune.
        ds_name (str or dataset): Either a key into the notebook-level `ds_dict`
            mapping, or a dataset object exposing 'train' and 'validation' splits.
        hf_token (str, optional): Hugging Face token used for hub uploads.
        wandb_token (str, optional): W&B API key. When omitted, no W&B run is
            started and nothing is reported.
        gradient_checkpointing (bool): Enables gradient checkpointing; the model's
            KV cache is disabled whenever this is on.
        r, lora_alpha, lora_dropout, bias, task_type: LoRA settings forwarded to
            `create_peft_model`.
        beta, max_prompt_length, max_length, truncation_mode: Forwarded to DPOTrainer.
        project_name, entity: W&B project and entity for the run.
        repo_id (str, optional): Hub repo to push to; also used as the W&B run name.
        output_dir (str, optional): Where checkpoints and the final model are written.
            Defaults to './{model_name}_DPO_{dataset label}_r_{r}_alpha_{lora_alpha}'.
        All remaining keyword arguments are passed through to `TrainingArguments`
        under the same name (`lr` as `learning_rate`, `epochs` as `num_train_epochs`).

    Returns:
        None
    """
    # bf16 on GPUs with compute capability 8 or newer, fp16 otherwise. The
    # previous `bf16=True,` had a trailing comma and produced the tuple (True,).
    bf16 = torch.cuda.get_device_capability()[0] >= 8
    fp16 = not bf16

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForCausalLM.from_pretrained(
            model_id,
            # the cache must be off for gradient checkpointing
            use_cache=not gradient_checkpointing,
            device_map="auto",
            quantization_config=bnb_config
        )

    model.train()

    # NOTE(review): `create_peft_model` is not defined in the visible notebook
    # cells; it has to be provided by the session before this function is called.
    model = create_peft_model(model,
                            r=r,
                            lora_alpha=lora_alpha,
                            lora_dropout=lora_dropout,
                            bias=bias,
                            task_type=task_type,
                            gradient_checkpointing=gradient_checkpointing,
                            bf16=bf16)

    tokenizer = AutoTokenizer.from_pretrained(
            model_id,
        )

    # Reuse EOS as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    model_name = model_id.split('/')[-1]

    # Resolve the dataset. A string is looked up in `ds_dict` (the previous code
    # indexed it with the literal 'ds_name'); anything else is taken to be the
    # dataset itself, which is how the calling cells use this function.
    if isinstance(ds_name, str):
        # NOTE(review): `ds_dict` is expected as a notebook global; it is not
        # defined in the visible cells.
        dataset = ds_dict[ds_name]
        ds_label = ds_name
    else:
        dataset = ds_name
        ds_label = 'custom'
    train_dataset = dataset['train']
    eval_dataset = dataset['validation']

    if output_dir is None:
        output_dir = f'./{model_name}_DPO_{ds_label}_r_{r}_alpha_{lora_alpha}'

    if wandb_token:
        wandb.login(key=wandb_token)

        wandb.init(
            job_type='training',
            project=project_name,
            entity=entity,
            name = repo_id
            )

    training_args = TrainingArguments(
        logging_dir=f'{output_dir}/logs',
        output_dir= output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        bf16=bf16,  # Use BF16 if available
        fp16=fp16,
        learning_rate=lr,
        num_train_epochs=epochs,
        max_steps = max_steps,
        gradient_checkpointing=gradient_checkpointing,
        optim=optim,
        warmup_ratio=warmup_ratio,
        weight_decay = weight_decay,
        gradient_accumulation_steps=gradient_accumulation_steps,
        group_by_length=group_by_length,
        # logging strategies
        logging_strategy="steps",
        logging_steps=logging_steps,
        save_strategy=save_strategy,
        # evaluation follows the save schedule so the best checkpoint can be reloaded
        evaluation_strategy = save_strategy,
        save_steps = save_steps,
        eval_steps = eval_steps,
        lr_scheduler_type=lr_scheduler_type,
    #   log_level = 'error',
        hub_token=hf_token,
        # 'none' switches reporting off; None would not.
        report_to='wandb' if wandb_token else 'none',
        dataloader_num_workers = dataloader_num_workers,
        load_best_model_at_end=load_best_model_at_end,
        save_total_limit = save_total_limit,
        remove_unused_columns=False,
        disable_tqdm=False,
        torch_compile=torch_compile,
        length_column_name=length_column_name,
        auto_find_batch_size=auto_find_batch_size,
        push_to_hub = True if repo_id else False,
        hub_strategy=hub_strategy,
        #max_grad_norm=0.3,
        hub_model_id=repo_id
        #max_grad_norm=0.3
    )

    dpo_trainer = DPOTrainer(
        model,
        args=training_args,
        beta=beta,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        max_prompt_length=max_prompt_length,
        max_length=max_length,
        truncation_mode=truncation_mode
    )

    # Baseline metrics before any training step.
    original_performance = dpo_trainer.evaluate()
    if wandb_token:
        # Only log when a W&B run was actually started above.
        wandb.log({'initial-performance': wandb.Table(dataframe=pd.DataFrame(original_performance, index=["Performance"]))})

    dpo_trainer.train()

    if repo_id:
        # Final evaluation, then publish the model card and weights.
        dpo_trainer.evaluate()
        dpo_trainer.create_model_card(model_name=repo_id)
        dpo_trainer.push_to_hub()

    #final_performance = dpo_trainer.evaluate()
    #run.log({'final-performance': wandb.Table(dataframe=pd.DataFrame(final_performance, index=["Performance"]))})

    dpo_trainer.save_model(output_dir)

In [None]:
# Older in-notebook DPO run on the top-2 pairs filtered to 1024 tokens.
model_id = 'dhmeltzer/llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged'
# NOTE(review): `ds_RM_top_2_filt` is not defined in the visible cells; it is
# presumably a {max_length: dataset} mapping left over from an earlier
# version of the notebook - confirm.
dataset = ds_RM_top_2_filt[1024]
epochs = 1
optim = 'paged_adamw_8bit'
per_device_train_batch_size=32
per_device_eval_batch_size = 32
gradient_accumulation_steps=4

# The dataset object is passed in the second positional slot, which
# DPO_training names `ds_name`.
DPO_training(model_id,
            dataset,
            hf_token = hf_token,
            wandb_token = wandb_token,
            epochs = epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=per_device_eval_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optim=optim,
            auto_find_batch_size=False,
            repo_id = 'dhmeltzer/llama-7b-SFT-eli5wiki1024-DPO_top2-1024-r64-alpha16')

Downloading (…)lve/main/config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

Found 7 modules to quantize: ['k_proj', 'gate_proj', 'o_proj', 'up_proj', 'q_proj', 'v_proj', 'down_proj']
trainable params: 159,907,840 || all params: 6,898,323,456 || trainable%: 2.3180681656919973


Downloading (…)okenizer_config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdmeltzer[0m ([33mft-llmmm[0m). Use [1m`wandb login --relogin`[0m to force relogin


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen
19,0.6887,0.682096,-0.375533,-0.47066,0.539983,0.095127,-202.567566,-206.854294,0.130077,0.156852
38,0.6852,0.683311,-0.408925,-0.54935,0.552083,0.140424,-203.354477,-207.188248,-0.001801,0.031421
57,0.6899,0.680961,-0.059625,-0.166149,0.566604,0.106524,-199.522461,-203.695221,0.033546,0.075674
76,0.6638,0.67197,-0.325149,-0.486091,0.575126,0.160942,-202.721878,-206.350449,0.030284,0.073753
95,0.6768,0.668999,0.063126,-0.058047,0.590173,0.121173,-198.441437,-202.467697,0.033054,0.084962
114,0.676,0.669399,-0.133009,-0.261908,0.594276,0.128899,-200.480042,-204.429077,0.054214,0.106292
133,0.6703,0.666981,-0.168998,-0.291161,0.593224,0.122163,-200.772583,-204.788971,0.056855,0.109225
152,0.6812,0.664921,-0.118525,-0.249208,0.595013,0.130684,-200.353043,-204.28421,0.047949,0.101074
171,0.6808,0.664863,-0.112347,-0.244159,0.588594,0.131813,-200.302597,-204.222443,0.045132,0.098196




Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.22k [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/320M [00:00<?, ?B/s]

In [None]:
# Free memory between training runs.
# Make sure automatic garbage collection is switched on, then force a full
# collection pass so unreachable Python objects are reclaimed right away.
gc.enable()
gc.collect()
# Runs after the collection above so that blocks freed by it can be handed
# back from PyTorch's CUDA caching allocator.
torch.cuda.empty_cache()

In [None]:
# DPO run on the "contrast" preference dataset, starting from the merged SFT
# checkpoint named in `model_id`.
model_id = 'dhmeltzer/llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged'
# NOTE(review): the 1024 key presumably selects the 1024-token variant of the
# dataset — confirm against the cell that builds ds_RM_contrast_filt.
dataset = ds_RM_contrast_filt[1024]
epochs = 1
optim = 'paged_adamw_8bit'
per_device_train_batch_size = 32
per_device_eval_batch_size = 32
gradient_accumulation_steps = 4

try:
    DPO_training(
        model_id,
        dataset,
        hf_token=hf_token,
        wandb_token=wandb_token,
        epochs=epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optim=optim,
        auto_find_batch_size=False,
        repo_id='dhmeltzer/llama-7b-SFT-eli5wiki1024-DPO_contrast-1024-r64-alpha16',
    )
finally:
    # Run the cleanup even when training fails or the cell is interrupted
    # (e.g. KeyboardInterrupt); otherwise these two calls are skipped.
    gc.collect()
    torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Found 7 modules to quantize: ['k_proj', 'gate_proj', 'o_proj', 'up_proj', 'q_proj', 'v_proj', 'down_proj']
trainable params: 159,907,840 || all params: 6,898,323,456 || trainable%: 2.3180681656919973




0,1
eval/logits/chosen,██▁▃▃▄▅▅▅▅▅
eval/logits/rejected,▇█▁▃▃▃▄▄▄▃▃
eval/logps/chosen,▇▁▁▆▂█▅▅▅▅▅
eval/logps/rejected,█▂▁▆▂▇▅▄▅▅▅
eval/loss,█▅▆▅▃▂▂▂▁▁▁
eval/rewards/accuracies,▁▇▇████████
eval/rewards/chosen,▇▁▁▆▂█▅▅▅▅▅
eval/rewards/margins,▁▅▇▆█▆▇▆▇▇▇
eval/rewards/rejected,█▂▁▆▂▇▅▄▅▅▅
eval/runtime,█▁▂▁▂▁▂▂▂▁▂

0,1
eval/logits/chosen,0.0982
eval/logits/rejected,0.04513
eval/logps/chosen,-204.22244
eval/logps/rejected,-200.3026
eval/loss,0.66486
eval/rewards/accuracies,0.58859
eval/rewards/chosen,-0.11235
eval/rewards/margins,0.13181
eval/rewards/rejected,-0.24416
eval/runtime,153.5012


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669247816662392, max=1.0…

KeyboardInterrupt: ignored

In [None]:
# DPO run on the "random" preference dataset, starting from the merged SFT
# checkpoint named in `model_id`.
model_id = 'dhmeltzer/llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged'
# NOTE(review): the 1024 key presumably selects the 1024-token variant of the
# dataset — confirm against the cell that builds ds_RM_random_filt.
dataset = ds_RM_random_filt[1024]
epochs = 1
optim = 'paged_adamw_8bit'
per_device_train_batch_size = 32
per_device_eval_batch_size = 32
gradient_accumulation_steps = 4

try:
    DPO_training(
        model_id,
        dataset,
        hf_token=hf_token,
        wandb_token=wandb_token,
        epochs=epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optim=optim,
        auto_find_batch_size=False,
        repo_id='dhmeltzer/llama-7b-SFT-eli5wiki1024-DPO_random-1024-r64-alpha16',
    )
finally:
    # Run the cleanup even when training fails or the cell is interrupted
    # (e.g. KeyboardInterrupt); otherwise these two calls are skipped.
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Manual set-up cell: defines the run configuration in the notebook namespace,
# loads the SFT checkpoint in 4-bit, wraps it with create_peft_model and loads
# the tokenizer. Only part of the configuration is consumed in this cell; the
# remaining names are presumably read by later cells (confirm).

# --- Adapter settings (forwarded to create_peft_model below) ---
gradient_checkpointing = True
r = 64
lora_alpha = 16
lora_dropout = 0.1
bias = 'none'
task_type = 'CAUSAL_LM'

# --- Sequence length, schedule and optimiser ---
max_seq_length = 512
epochs = 1
max_steps = -1
lr = 2e-4
weight_decay = 0.01
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 1
optim = 'paged_adamw_32bit'
warmup_ratio = 0.03

# --- Data loading ---
group_by_length = True
dataloader_num_workers = 2
length_column_name = 'lengths'

# --- Logging, checkpointing and evaluation ---
logging_steps = 10
save_total_limit = 3
save_strategy = 'steps'
# NOTE(review): fractional values — presumably interpreted as a fraction of
# the total number of training steps by the trainer; confirm.
save_steps = 0.2
eval_steps = 0.2
load_best_model_at_end = True
project_name = 'DPO_training_dm'
entity = 'ft-llmmm'
torch_compile = False

# --- Checkpoints ---
SFT_model_id = 'dhmeltzer/Llama-2-7b-hf-wiki-no-gl-r-64-alpha-16-full'
base_model_id = 'meta-llama/Llama-2-7b-hf'

# Small stand-in model, kept disabled:
#SFT_model_id = 'distilgpt2'
#base_model_id = SFT_model_id

# 4-bit NF4 quantisation with double quantisation; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    SFT_model_id,
    # The KV cache is turned off whenever gradient checkpointing is on
    # (this is needed for gradient checkpointing).
    use_cache=not gradient_checkpointing,
    device_map="auto",
    quantization_config=bnb_config,
    #use_auth_token=hf_token
)

# NOTE(review): `bf16` is not assigned in this cell; it has to exist in the
# notebook namespace already, otherwise this call raises NameError.
model = create_peft_model(
    model,
    r=r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=bias,
    task_type=task_type,
    gradient_checkpointing=gradient_checkpointing,
    bf16=bf16,
)

# Switch the wrapped model to training mode.
model.train()

# The tokenizer comes from the base model id, not from the SFT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    #use_auth_token=hf_token
)

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token



In [None]:
# Output directory path for this run; a plain string literal, since the path
# contains no interpolated values.
# NOTE(review): presumably consumed by the trainer set-up in a later cell.
output_dir = './SFT_wiki_no_gl_DPO/models'

# Training and validation splits taken from ds_RM_top_2.
train_dataset = ds_RM_top_2['train']
eval_dataset = ds_RM_top_2['validation']

In [None]:
# Unbind both names from the notebook namespace so the objects they refer to
# can be garbage-collected. Raises NameError if either name is not defined
# (for example when this cell is run twice).
# NOTE(review): `DataCollator` is the name imported from `transformers` at the
# top of the notebook; unless it was rebound to a collator instance elsewhere,
# this removes the import itself — confirm that is intended.
del DataCollator, dpo_trainer

In [None]:
# Force a garbage-collection pass first so unreachable Python objects are
# reclaimed, then release PyTorch's cached, unused CUDA memory.
gc.collect()
torch.cuda.empty_cache()