# Dependencies

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/LLMs/Fine-tuning/SFT

# installations

!pip install bitsandbytes==0.41.1
!pip install safetensors>=0.3.1
!pip install trl
!pip install wandb
!pip install tokenizers>=0.13.3
!pip install accelerate==0.21.0
!pip install datasets
!pip install -U torch
!pip install evaluate
!pip install rouge_score
!pip install nltk
!pip install bert_score

!pip install git+https://github.com/huggingface/peft.git
!pip install git+https://github.com/huggingface/transformers.git


!python -c "import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'"
!pip install ninja packaging
!pip install flash-attn --no-build-isolation

In [None]:
# Import necessary libraries and modules
import gc  # Garbage collector module for memory management
import os  # Operating system module for file operations
import torch  # PyTorch library for deep learning
from google.colab import runtime  # Colab runtime module for environment management
import pandas as pd  # Pandas library for data manipulation

import datasets  # Huggingface datasets library for easy access to datasets
import accelerate  # Huggingface accelerate for training acceleration
import transformers  # Huggingface transformers for pre-trained models and training pipelines
from transformers import (  # Specific transformers components needed for the task
    AutoTokenizer, AutoModelForCausalLM,
    Trainer, TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig, TrainerCallback)
import bitsandbytes as bnb  # Custom library for bits and bytes quantization
import wandb  # Weights and Biases library for experiment tracking
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training  # PEFT library components
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM  # TRL library components
from datetime import datetime  # Standard Python library for date and time operations
from huggingface_hub import login  # Huggingface Hub login for model uploads and management

from peft.tuners.lora import LoraLayer  # LORA tuner for quantization
import evaluate  # Custom module for evaluation (not specified in the provided code)

In [None]:
# Import the 'getpass' function to securely input sensitive information
from getpass import getpass

# Both inputs are hidden, so give each prompt an explicit label; with two
# identical default prompts it is easy to enter the tokens in the wrong order.
hf_token = getpass('Hugging Face token: ')
wandb_token = getpass('Weights & Biases API key: ')

# Login to Hugging Face using the provided token
login(hf_token)

# Login to Weights and Biases (wandb) using the provided token
wandb.login(key=wandb_token)

··········
··········
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: Currently logged in as: [33mdmeltzer[0m ([33mft-llmmm[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Definitions

In [None]:
# Function to format prompts for question-answer pairs
def formatting_prompts_func(example):
    """
    Build one training prompt per question/answer pair of a batch.

    Args:
        example (dict): Batch with parallel sequences stored under the
            'question' and 'answer' keys.

    Returns:
        list: One string per question, laid out as
            "### Human: {question}\\n ### Assistant: {answer}".
    """
    questions = example['question']
    return [
        f"### Human: {questions[idx]}\n ### Assistant: {example['answer'][idx]}"
        for idx in range(len(questions))
    ]

# Function to create a collator for SFT
def sft_collator(tokenizer, response_template = " ### Assistant:"):
    """
    Build the completion-only data collator used for supervised fine-tuning.

    Args:
        tokenizer (transformers.Tokenizer): Tokenizer handed to the collator.
        response_template (str, optional): Text marking where the assistant's
            reply starts. Default is " ### Assistant:".

    Returns:
        DataCollatorForCompletionOnlyLM: Collator configured with the given
            tokenizer and response template.
    """
    collator_kwargs = {
        'response_template': response_template,
        'tokenizer': tokenizer,
    }
    return DataCollatorForCompletionOnlyLM(**collator_kwargs)

# Function to combine question and answer in the dataset
def combine_question_answer(ds, formatting_func):
    """
    Add a 'QA' column holding the formatted question/answer prompt.

    Args:
        ds (datasets.DatasetDict or datasets.Dataset): Input data containing
            'question' and 'answer' columns.
        formatting_func (function): Batched formatting function; it receives a
            batch and returns a list with one prompt string per example.

    Returns:
        The input unchanged if every split already has a 'QA' column,
        otherwise a mapped copy with the 'QA' column added.
    """
    # Membership must be tested against the column names: `'QA' in dataset`
    # iterates over rows instead, so it is always False and scans every example.
    column_names = ds.column_names
    if isinstance(column_names, dict):
        # DatasetDict: mapping of split name -> list of columns.
        already_combined = all('QA' in cols for cols in column_names.values())
    else:
        # Single Dataset: flat list of columns.
        already_combined = 'QA' in column_names

    if not already_combined:
        ds = ds.map(lambda x: {'QA': formatting_func(x)}, batched=True)
    return ds

# Function to prepare the dataset for training by combining question/answer into one column.
# filter resulting columns so number of tokens is less than max_seq_length.
def prepare_dataset(ds, tokenizer, formatting_func, max_seq_length='auto'):
    """
    Format, tokenize and length-filter a dataset for training.

    Args:
        ds (datasets.Dataset): Input dataset.
        tokenizer (transformers.Tokenizer): Tokenizer applied to the 'QA' text.
        formatting_func (function): Formatting function used to build 'QA'.
        max_seq_length (int or 'auto', optional): Maximum number of input ids
            an example may have. 'auto' (default) uses
            tokenizer.model_max_length.

    Returns:
        datasets.Dataset: Dataset with a 'tokens' field, restricted to
            examples whose input ids fit within the length limit.
    """
    length_limit = tokenizer.model_max_length if max_seq_length == 'auto' else max_seq_length

    def _add_tokens(row):
        # Store the tokenizer output for the combined prompt under 'tokens'.
        return {'tokens': tokenizer(row['QA'], return_length=False)}

    def _fits(row):
        return len(row['tokens']['input_ids']) <= length_limit

    with_prompts = combine_question_answer(ds, formatting_func)
    return with_prompts.map(_add_tokens).filter(_fits)

# Detoxify and Combine Datasets

## Download datasets

In [None]:
def download_datasets():
    """
    Download the Simple Wikipedia QA and ELI5 artifacts from Weights & Biases
    and bring both into a common question/answer/source layout.

    Both artifacts are fetched inside a single wandb run (project
    'ELI5_analysis', entity 'ft-llmmm'). The Simple Wikipedia data is read from
    three CSV files; the ELI5 data is loaded from the 'ds_SFT' directory of its
    artifact and expanded so that each answer becomes its own row.

    Returns:
        tuple: (simplewiki_QA_ds, ELI5_ds). Each split of both datasets gets a
            'source' column ('simple_wiki' or 'ELI5'). The Simple Wikipedia
            splits are named 'train', 'test' and 'validation'.
    """
    # Open a wandb run so the artifact usage is recorded; the run is closed
    # automatically when the `with` block exits.
    with wandb.init(project='ELI5_analysis',
                    entity='ft-llmmm',
                    job_type='training',
                    name='SFT_training') as run:

        # Download the Simple Wikipedia QA dataset artifact.
        artifact_wiki_QA = run.use_artifact('ft-llmmm/ELI5_analysis/simple_wiki_QA:latest',
                                            type='dataset')
        artifact_dir_wiki_QA = artifact_wiki_QA.download()

        # Download the ELI5 dataset artifact.
        artifact_ELI5 = run.use_artifact('ft-llmmm/ELI5_analysis/ELI5_cleaned:latest',
                                         type='dataset')
        artifact_dir_ELI5 = artifact_ELI5.download()

    # Load the Simple Wikipedia QA dataset from the CSV files in the artifact.
    simplewiki_QA_ds = datasets.load_dataset("csv",
                                         data_files={"train": artifact_dir_wiki_QA + '/simple_wiki_QA_combined_train.csv',
                                                    "test": artifact_dir_wiki_QA +  '/simple_wiki_QA_combined_test.csv',
                                                    "val": artifact_dir_wiki_QA + '/simple_wiki_QA_combined_validation.csv'
                                        }
                                             )
    # Drop columns that are not needed and expose the truncated text as 'answer'.
    simplewiki_QA_ds = simplewiki_QA_ds.remove_columns(['id','system_message','prompt_template'])
    simplewiki_QA_ds = simplewiki_QA_ds.rename_columns({'trunc_text':'answer'})

    # Rename the 'val' split to 'validation' (the split name combine_datasets iterates over).
    simplewiki_QA_ds['validation'] = simplewiki_QA_ds['val']
    del simplewiki_QA_ds['val']

    for split in simplewiki_QA_ds:
        # Add a constant 'source' column by concatenating a one-column dataset side by side (axis=1).
        dset_source = datasets.Dataset.from_dict({'source':['simple_wiki']*len(simplewiki_QA_ds[split])})
        simplewiki_QA_ds[split] = datasets.concatenate_datasets([simplewiki_QA_ds[split],dset_source],axis=1)

    # Load the ELI5 dataset from disk and flatten nested fields into dotted
    # column names (e.g. 'answers.text'), then drop everything not needed.
    ELI5_ds = datasets.load_from_disk(f'{artifact_dir_ELI5}/ds_SFT')
    ELI5_ds = ELI5_ds.flatten()
    ELI5_ds = ELI5_ds.remove_columns(['document','q_id','title','selftext','subreddit','url','title_urls','selftext_urls','answers_urls','pref_idxs','dupl_scores_idxs','qu_emb',
                                    'answers.a_id','answers.fkg','answers.fre','answers.score'])
    # Re-wrap each row's answers as a plain Python list ahead of the explode below.
    ELI5_ds = ELI5_ds.map(lambda x: {'answers.text':list(x['answers.text'])})

    # View batches as pandas DataFrames and explode 'answers.text' so that
    # every answer gets its own row (the question is repeated per answer).
    ELI5_ds = ELI5_ds.with_format("pandas").map(lambda df:
                                                    df.explode("answers.text"),
                                                    batched=True)

    # Switch back to the default (Python object) output format.
    ELI5_ds = ELI5_ds.with_format(None)

    # Clean up columns and rename them for consistency with the wiki dataset.
    # NOTE(review): '__index_level_0__' is presumably the pandas index column
    # left behind by the batched explode — confirm.
    ELI5_ds = ELI5_ds.remove_columns(['__index_level_0__'])
    ELI5_ds = ELI5_ds.rename_columns({'answers.text':'answer',
                                    'title_body':'question'})

    for split in ELI5_ds:
        # Add a constant 'source' column, same technique as for the wiki dataset.
        dset_source = datasets.Dataset.from_dict({'source':['ELI5']*len(ELI5_ds[split])})
        ELI5_ds[split] = datasets.concatenate_datasets([ELI5_ds[split],dset_source],axis=1)

    return simplewiki_QA_ds, ELI5_ds


## Detoxify ELI5

In [None]:
!pip install detoxify
#!pip install -U torch
#!pip install -U transformers

def filter_toxicity(ELI5_ds,
                     cutoff=0.1,
                     batch_size=64,
                     output_path='./data/ELI5_non_toxic',
                     keep_columns=('answer', 'question')):
    """
    Remove toxic answers from the ELI5 dataset using the Detoxify model.

    Every answer is scored with Detoxify's 'unbiased' model. An example is kept
    only if all of its scores (toxicity, severe_toxicity, obscene,
    identity_attack, insult, threat, sexual_explicit) are <= cutoff.

    Args:
        ELI5_ds (datasets.DatasetDict): ELI5 QA pairs; must contain an 'answer'
            column and a 'train' split.
        cutoff (float, optional): Maximum allowed score per metric. Default is 0.1.
        batch_size (int, optional): Batch size used when scoring. Default is 64.
        output_path (str, optional): Directory the filtered dataset is saved to.
            Default is './data/ELI5_non_toxic'.
        keep_columns (iterable of str, optional): Columns retained in the
            result; every other column (including the score columns) is
            dropped. The default ('answer', 'question') also drops the 'source'
            column added by download_datasets; pass
            ('answer', 'question', 'source') to retain it.

    Returns:
        datasets.DatasetDict: The filtered dataset, which is also saved to
            output_path. (Previously nothing was returned, so callers that
            assigned the result received None.)
    """
    # Imported here because detoxify is installed at the top of this cell.
    from detoxify import Detoxify

    # Run the model on the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize the Detoxify model
    detoxify_model = Detoxify('unbiased')
    detoxify_model.model.to(device)

    # Score every answer; predict() returns one list of scores per metric,
    # which map() stores as new columns.
    scored_ds = ELI5_ds.map(lambda x: detoxify_model.predict(x['answer']),
                            batched=True,
                            batch_size=batch_size)

    # Toxicity metrics that must all stay at or below the cutoff
    metrics = ['toxicity', 'severe_toxicity',
               'obscene', 'identity_attack',
               'insult', 'threat', 'sexual_explicit']

    # Keep only the answers whose every score is within the cutoff
    ELI5_non_toxic = scored_ds.filter(lambda x: all(x[metric] <= cutoff
                                                    for metric in metrics))

    # Drop every column that was not explicitly requested
    keep = set(keep_columns)
    ELI5_non_toxic = ELI5_non_toxic.remove_columns(
        [col for col in ELI5_non_toxic['train'].features if col not in keep])

    # Save the non-toxic ELI5 dataset and hand it back to the caller
    ELI5_non_toxic.save_to_disk(output_path)
    return ELI5_non_toxic


## Combine Datasets

In [None]:
#simplewiki_QA_ds
#ELI5_non_toxic = datasets.load_from_disk('./data/ELI5_non_toxic')

In [None]:
def combine_datasets(wiki_ds,
                     ELI5_ds,
                     output_path='./data/SFT_QA_ds',
                     artifact_name='combined_dataset',
                     seed=12321):
    """
    Combine the Simple Wikipedia and ELI5 QA datasets into one SFT dataset.

    The 'train', 'validation' and 'test' splits of both inputs are
    concatenated and shuffled, a 'QA' prompt column is added, and the result
    is saved to disk and logged as a Weights & Biases artifact.

    Args:
        wiki_ds (datasets.DatasetDict): Simple Wikipedia QA pairs.
        ELI5_ds (datasets.DatasetDict): ELI5 QA pairs.
        output_path (str, optional): Directory the combined dataset is saved
            to. Default is './data/SFT_QA_ds'.
        artifact_name (str, optional): Name of the logged artifact. Default is
            'combined_dataset'.
        seed (int, optional): Seed for shuffling. Default is 12321.

    Returns:
        datasets.DatasetDict: The combined dataset (also saved to output_path).
    """
    # Initialize a DatasetDict to store the combined dataset
    SFT_QA_dataset = datasets.DatasetDict()

    # Combine datasets for 'train', 'validation', and 'test' splits
    for split in ['train', 'validation', 'test']:
        SFT_QA_dataset[split] = datasets.concatenate_datasets([wiki_ds[split],
                                                              ELI5_ds[split]])

    # Shuffle the combined dataset
    SFT_QA_dataset = SFT_QA_dataset.shuffle(seed=seed)

    # Combine question and answer into a single column
    SFT_QA_dataset = combine_question_answer(SFT_QA_dataset, formatting_prompts_func)

    # Drop the stray 'Unnamed: 0' column where present. The removal is guarded
    # per split so that inputs without that column do not raise.
    for split in SFT_QA_dataset:
        if 'Unnamed: 0' in SFT_QA_dataset[split].column_names:
            SFT_QA_dataset[split] = SFT_QA_dataset[split].remove_columns('Unnamed: 0')

    # Save the combined dataset to the specified output path
    SFT_QA_dataset.save_to_disk(output_path)

    # Log the combined dataset as a Weights and Biases artifact
    time_stamp = datetime.now().strftime("%m.%d.%y-%H.%M.%S")
    with wandb.init(project='ELI5_analysis',
                    entity='ft-llmmm',
                    job_type='upload_data',
                    name=f'SFT_QA_dataset_{time_stamp}') as run:

        clean_data_art = wandb.Artifact(artifact_name, 'dataset')
        clean_data_art.add_dir(output_path)
        run.log_artifact(clean_data_art)

    return SFT_QA_dataset


In [None]:
# Run the above functions to download data, remove toxic content,
# and then combine datasets.

simple_wiki_ds, ELI5_ds = download_datasets()

# filter_toxicity persists its result to disk, so reload it from the default
# output path instead of relying on a return value.
filter_toxicity(ELI5_ds)
ELI5_non_toxic = datasets.load_from_disk('./data/ELI5_non_toxic')

# Pass the Simple Wikipedia dataset under the name it was assigned above
# (the previous `wiki_ds` was never defined and raised a NameError).
combine_datasets(simple_wiki_ds,
                 ELI5_non_toxic)

## Tokenizing

In [None]:
# Reload the combined dataset from combine_datasets' default output path.
SFT_QA_dataset = datasets.load_from_disk('./data/SFT_QA_ds')

In [None]:
def tokenize_and_filter_dataset(ds,
                                model_id="meta-llama/Llama-2-7b-hf",
                                output_path='./data/SFT_QA_dataset_llama',
                                artifact_name='llama_QA_tokenized',
                                max_length=1024):
    """
    Tokenize the 'QA' column and save a full and a length-limited dataset.

    Two datasets are written and logged as Weights & Biases artifacts: the
    full tokenized dataset, and a subset containing only the examples with at
    most `max_length` input ids. The subset uses the suffix `_{max_length}`
    for both its directory and its artifact name.

    Args:
        ds (datasets.Dataset): The input dataset containing a 'QA' field.
        model_id (str, optional): Hugging Face model ID whose tokenizer is
            used. Default is "meta-llama/Llama-2-7b-hf".
        output_path (str, optional): Directory for the full tokenized dataset.
            Default is './data/SFT_QA_dataset_llama'.
        artifact_name (str, optional): Artifact name for the full dataset.
            Default is 'llama_QA_tokenized'.
        max_length (int, optional): Maximum number of input ids kept in the
            filtered subset. Default is 1024.

    Returns:
        None
    """
    filtered_output_path = f'{output_path}_{max_length}'
    filtered_artifact_name = f'{artifact_name}_{max_length}'

    # Initialize tokenizer from Hugging Face model and register a pad token
    tok = AutoTokenizer.from_pretrained(model_id)
    tok.add_special_tokens({'pad_token': '[PAD]'})

    # Tokenize the dataset and record the number of input ids per example
    tok_ds = ds.map(lambda x: tok(x['QA']))
    tok_ds = tok_ds.map(lambda x: {'length': len(x['input_ids'])})

    # Keep only the examples that fit within max_length input ids
    tok_ds_filtered = tok_ds.filter(lambda x: x['length'] <= max_length)

    # Save both variants to disk
    tok_ds.save_to_disk(output_path)
    tok_ds_filtered.save_to_disk(filtered_output_path)

    # Log the tokenized datasets as Weights and Biases artifacts
    with wandb.init(project='ELI5_analysis',
                    entity='ft-llmmm',
                    job_type='upload_data',
                    name='llama_QA_tokenized_dataset_clean') as run:

        # Log the full tokenized dataset as an artifact
        clean_data_art = wandb.Artifact(artifact_name, 'dataset')
        clean_data_art.add_dir(output_path)
        run.log_artifact(clean_data_art)

        # Log the length-limited dataset as an artifact
        clean_data_art_filtered = wandb.Artifact(filtered_artifact_name, 'dataset')
        clean_data_art_filtered.add_dir(filtered_output_path)
        run.log_artifact(clean_data_art_filtered)

In [None]:
# Load the combined dataset from disk and tokenize it with the function's
# defaults (Llama-2-7b tokenizer, output under './data/SFT_QA_dataset_llama').
SFT_QA_dataset = datasets.load_from_disk('./data/SFT_QA_ds')

tokenize_and_filter_dataset(SFT_QA_dataset)

# Training Experiments

## Original ELI5 + ELI5-wiki

### Redownload Data

Below we download the dataset artifacts to be used for training experiments. We also split the dataset into the ELI5 and Simple Wikipedia subsets again so that we can train on these datasets separately.

In [None]:
import wandb

# Download the tokenized dataset artifact. The context manager closes the
# wandb run as soon as the download has finished instead of leaving it open.
with wandb.init(project='SFT_training_dm',
                entity='ft-llmmm') as run:
    artifact = run.use_artifact(
        'ft-llmmm/ELI5_analysis/llama_QA_tokenized_1024:v1',
        type='dataset')
    artifact_dir = artifact.download()

# Load from the directory wandb actually downloaded to, rather than from a
# hard-coded copy of its default location.
ds_full = datasets.load_from_disk(artifact_dir)

# Split the combined dataset back into its two sources via the 'source' column.
ds_wiki_1024_full = ds_full.filter(
    lambda x: x['source'] == 'simple_wiki')

ds_eli5_1024 = ds_full.filter(
    lambda x: x['source'] != 'simple_wiki')

# Persist both subsets so the training cells can reference them by path.
ds_wiki_1024_full.save_to_disk('./data/ds_wiki_1024_full')
ds_eli5_1024.save_to_disk('./data/ds_eli5_1024')

The rest of this subsection is a collection of cells in which we run various training experiments by calling the `run_clm.py` script.

### Training Cells

In [None]:
# Experiment configuration: Llama-2-13b on the tokenized ELI5 + Simple
# Wikipedia artifact. The launch cell below interpolates these variables.
from pathlib import Path

model_id = "meta-llama/Llama-2-13b-hf"  # sharded weights
model_name = model_id.rsplit('/', 1)[-1]
dataset_path = './artifacts/llama_QA_tokenized_1024:v1'
ds_name = 'eli5-wiki-1024'
optim = 'paged_adamw_8bit'

# Timestamp that makes every run name unique.
now = datetime.now()
time_stamp = now.strftime("%m.%d.%y-%H.%M.%S")

run_name = '_'.join([model_name, ds_name, time_stamp])
repo_id = f'{model_name}-{ds_name}-test'

# Output locations; created up front so the training script can write to them.
output_dir = f'./{model_name}_{ds_name}/models'
logging_dir = f'{output_dir}/logs'
Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(logging_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Launch run_clm.py with the configuration defined in the previous cell;
# {name} placeholders are filled in from notebook variables by IPython.
# NOTE(review): hf_token and wandb_token are passed on the command line, and
# the script echoes its parsed arguments, so both secrets end up in this
# cell's saved output. Clear the output before sharing the notebook and
# consider passing the tokens through environment variables instead.
!python ./run_clm.py \
--output_dir {output_dir} \
--logging_dir {logging_dir} \
--model_id {model_id} \
--dataset_path {dataset_path} \
--run_name {run_name} \
--repo_id {repo_id} \
--report_to_wandb 1 \
--epochs 3 \
--max_steps -1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 8 \
--lr 2e-4 \
--entity 'ft-llmmm' \
--project_name 'SFT_training_dm' \
--hub_strategy 'every_save' \
--torch_compile 0 \
--gradient_checkpointing 1 \
--optim 'paged_adamw_8bit' \
--group_by_length 1 \
--hf_token {hf_token} \
--wandb_token {wandb_token} \
--use_flash_attention 1 \
--logging_steps 10 \
--resume_from_checkpoint 1 \
--auto_find_batch_size 0

args is Namespace(model_id='meta-llama/Llama-2-13b-hf', repo_id='Llama-2-13b-hf-eli5-wiki-1024-test', hub_strategy='every_save', output_dir='./Llama-2-13b-hf_eli5-wiki-1024/models', output_data_dir=None, dataset_path='./artifacts/llama_QA_tokenized_1024:v1', hf_token='[REDACTED]', report_to_wandb=1, wandb_token='[REDACTED]', epochs=3, max_steps=-1, per_device_train_batch_size=16, per_device_eval_batch_size=16, gradient_accumulation_steps=8, max_seq_length=4096, logging_steps=10, optim='paged_adamw_8bit', lr=0.0002, lora_r=64, lora_alpha=16, weight_decay=0.1, lora_dropout=0.1, load_in_4bit=1, load_in_8bit=0, use_peft=1, gradient_checkpointing=1, bf16=1, group_by_length=1, merge_weights=0, seed=42, warmup_ratio=0.03, project_name='SFT_training_dm', entity='ft-llmmm', run_name='Llama-2-13b-hf_eli5-wiki-1024_09.16.23-18.19.32', load_best_model_at_end=1, use_sagemaker=1, torch_compile=0, use_flash_attention=1, resume_from_checkpoint=1

In [None]:
model_id = "meta-llama/Llama-2-13b-hf" # sharded weights
model_name = model_id.split('/')[-1]

dataset_path = './data/ds_wiki_1024_full'
ds_name = dataset_path.split('/')[-1]

now = datetime.now()
time_stamp = now.strftime("%m.%d.%y-%H.%M.%S")

#model_name = model_id.replace('/','-')
#ds_name = dataset_path.split('/')[-1].replace('llama','combined_large').replace(':','-')

output_dir = f'./{model_name}_{ds_name}/models'
logging_dir = f'{output_dir}/logs'

run_name = f'{ds_name}_{time_stamp}'
optim = 'paged_adamw_8bit'

from pathlib import Path
Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(logging_dir).mkdir(parents=True, exist_ok=True)

repo_id = f'{model_name}-{ds_name}'

!python ./run_clm.py \
--output_dir {output_dir} \
--logging_dir {logging_dir} \
--model_id {model_id} \
--dataset_path {dataset_path} \
--run_name {run_name} \
--repo_id {repo_id} \
--report_to_wandb 1 \
--epochs 3 \
--max_steps -1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 8 \
--lr 2e-4 \
--entity 'ft-llmmm' \
--project_name 'SFT_training_dm' \
--hub_strategy 'every_save' \
--torch_compile 0 \
--gradient_checkpointing 1 \
--optim 'paged_adamw_8bit' \
--group_by_length 1 \
--hf_token {hf_token} \
--wandb_token {wandb_token} \
--use_flash_attention 1 \
--logging_steps 10 \
--resume_from_checkpoint 0 \
--auto_find_batch_size 0

In [None]:
# Experiment: Llama-2-13b on the ELI5-only subset saved by the
# "Redownload Data" cell (./data/ds_eli5_1024).
# The Python lines derive the output/log directories, run name and repo id
# from the model and dataset names; the shell command then launches run_clm.py.
# NOTE(review): unlike the neighbouring training cells, no
# --resume_from_checkpoint flag is passed, so the script's default applies —
# confirm this is intended.
# NOTE(review): hf_token and wandb_token are passed on the command line; clear
# this cell's output before sharing if the script echoes its arguments.
model_id = "meta-llama/Llama-2-13b-hf" # sharded weights
model_name = model_id.split('/')[-1]

dataset_path = './data/ds_eli5_1024'
ds_name = dataset_path.split('/')[-1]

# Timestamp that makes the run name unique.
now = datetime.now()
time_stamp = now.strftime("%m.%d.%y-%H.%M.%S")

#model_name = model_id.replace('/','-')
#ds_name = dataset_path.split('/')[-1].replace('llama','combined_large').replace(':','-')

output_dir = f'./{model_name}_{ds_name}/models'
logging_dir = f'{output_dir}/logs'

run_name = f'{model_name}_{ds_name}_{time_stamp}'
optim = 'paged_adamw_8bit'

# Create the output and logging directories before launching the script.
from pathlib import Path
Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(logging_dir).mkdir(parents=True, exist_ok=True)

repo_id = f'{model_name}-{ds_name}'

!python ./run_clm.py \
--output_dir {output_dir} \
--logging_dir {logging_dir} \
--model_id {model_id} \
--dataset_path {dataset_path} \
--run_name {run_name} \
--repo_id {repo_id} \
--report_to_wandb 1 \
--epochs 3 \
--max_steps -1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 8 \
--lr 2e-4 \
--entity 'ft-llmmm' \
--project_name 'SFT_training_dm' \
--hub_strategy 'every_save' \
--torch_compile 0 \
--gradient_checkpointing 1 \
--optim 'paged_adamw_8bit' \
--group_by_length 1 \
--hf_token {hf_token} \
--wandb_token {wandb_token} \
--use_flash_attention 1 \
--logging_steps 10 \
--auto_find_batch_size 0

In [None]:
# Unassign the Colab runtime after the training cells above have run.
# NOTE(review): presumably here to stop consuming GPU time once the long
# training runs are done — confirm.
from google.colab import runtime
runtime.unassign()

## Cleaned ELI5 and ELI5-wiki

After running the previous experiments we realized many of the trained models were ending their answer with things like "edit: sorry, caught typos". To avoid this, we reclean the ELI5 dataset to remove any posts that contain "edits" in them.

### Remove Edits

In [None]:
import string
from pprint import pprint

In [None]:
def reclean_ELI5(input_path='./data/llama_tokenized_1024',
                 combined_output_path='./data/ds_SFT_cleaned_edits',
                 ELI5_output_path='./data/ds_ELI5_cleaned_edits',
                 artifact_name='llama_QA_tokenized_1024',
                 seed=213):
    """
    Re-clean the ELI5 part of the tokenized SFT dataset by dropping answers
    that contain edit/update/thanks style remarks.

    The dataset at `input_path` is split by its 'source' column. The ELI5
    examples are filtered, then concatenated again with the untouched Simple
    Wikipedia examples and shuffled per split. Both the combined dataset and
    the cleaned ELI5 subset are saved to disk, and the combined dataset is
    logged as a Weights & Biases artifact.

    Args:
        input_path (str, optional): Path to the tokenized combined dataset.
            Default is './data/llama_tokenized_1024'.
            NOTE(review): no other cell in this notebook writes to this
            default path — confirm it exists before relying on the default.
        combined_output_path (str, optional): Output path for the combined SFT
            dataset. Default is './data/ds_SFT_cleaned_edits'.
        ELI5_output_path (str, optional): Output path for the cleaned ELI5
            dataset. Default is './data/ds_ELI5_cleaned_edits'.
        artifact_name (str, optional): Name for the generated artifact.
            Default is 'llama_QA_tokenized_1024'.
        seed (int, optional): Seed for shuffling. Default is 213.

    Returns:
        None
    """
    # Load the tokenized combined dataset
    SFT_QA_dataset_llama_1024 = datasets.load_from_disk(input_path)

    # Keep only the examples sourced from Simple Wikipedia (left unfiltered)
    SFT_simple_wiki = SFT_QA_dataset_llama_1024.filter(
        lambda x: x['source'] == 'simple_wiki'
    )

    # Keep only the remaining (ELI5) examples; these are the ones cleaned below
    SFT_ELI5 = SFT_QA_dataset_llama_1024.filter(
        lambda x: x['source'] != 'simple_wiki'
    )

    # Substrings that mark an answer as containing an edit remark:
    # 'edit' next to any punctuation character plus a list of explicit phrases.
    edit_words = ['edit' + ch for ch in string.punctuation]
    edit_words.extend([ch + 'edit' for ch in string.punctuation])
    edit_words.extend(['edit:', ' edit', 'edit ', 'edit-', 'edit,', 'update:',
                      ' update', 'update ', 'thanks', 'thank you', 'typo',
                      ' edited', 'edited ', '[edit]', '[edited]',
                      '[edit', '(edit', '^edit'])
    edit_words = list(set(edit_words))

    # Substrings in which 'edit' is part of an ordinary word (e.g. 'credit')
    keep_words = [ch + 'edit' for ch in string.ascii_lowercase]
    keep_words.append('editor')

    # Apply both checks in a single pass over the ELI5 examples
    SFT_ELI5 = SFT_ELI5.filter(
        lambda x: _answer_is_free_of_edits(x['answer'], edit_words, keep_words)
    )

    # Initialize a DatasetDict to store the combined SFT dataset
    ds_SFT_filtered = datasets.DatasetDict()

    # Concatenate ELI5 and Simple Wikipedia datasets and shuffle each split
    for key in SFT_ELI5:
        ds_SFT_filtered[key] = datasets.concatenate_datasets([
            SFT_ELI5[key], SFT_simple_wiki[key]
        ])
        ds_SFT_filtered[key] = ds_SFT_filtered[key].shuffle(seed=seed)

    # Save the combined SFT dataset and cleaned ELI5 dataset to disk
    ds_SFT_filtered.save_to_disk(combined_output_path)
    SFT_ELI5.save_to_disk(ELI5_output_path)

    # Log the combined cleaned dataset as a Weights and Biases artifact
    with wandb.init(project='ELI5_analysis',
                    entity='ft-llmmm',
                    job_type='upload_data',
                    name='llama_QA_cleaned_edits') as run:

        clean_data_art = wandb.Artifact(artifact_name, 'dataset')
        clean_data_art.add_dir(combined_output_path)
        run.log_artifact(clean_data_art)


def _answer_is_free_of_edits(answer, edit_words, keep_words):
    """Return True if `answer` should be kept by the edit-remark filter.

    An answer is rejected when its lower-cased text contains any of
    `edit_words`, or when it contains 'edit' without also containing one of
    `keep_words`.
    """
    lowered = answer.lower()  # lower-case once instead of once per pattern
    if any(marker in lowered for marker in edit_words):
        return False
    return 'edit' not in lowered or any(word in lowered for word in keep_words)

In [None]:
reclean_ELI5()

Saving the dataset (0/1 shards):   0%|          | 0/103847 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5871 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7090 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/38595 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/880 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2093 [00:00<?, ? examples/s]

### Redo Training for ELI5 & ELI5+Wiki

Below we redo the training experiments, but now with the cleaned ELI5 and combined datasets.

In [None]:
# Experiment: Llama-2-7b on the combined dataset with edit remarks removed
# (./data/ds_SFT_cleaned_edits, the default combined output of reclean_ELI5).
# The Python lines derive the output/log directories, run name and repo id;
# the shell command then launches run_clm.py with those values.
# NOTE(review): hf_token and wandb_token are passed on the command line and
# the script echoes its parsed arguments, so both secrets end up in this
# cell's saved output — clear it before sharing the notebook.
model_id = "meta-llama/Llama-2-7b-hf" # sharded weights
dataset_path = './data/ds_SFT_cleaned_edits'
ds_name = 'eli5-cleaned-wiki65k-1024'

# Timestamp that makes the run name unique.
now = datetime.now()
time_stamp = now.strftime("%m.%d.%y-%H.%M.%S")

#model_name = model_id.replace('/','-')
model_name = model_id.split('/')[-1]
#ds_name = dataset_path.split('/')[-1].replace('llama','combined_large').replace(':','-')

#ds_name = dataset_path.split('/')[-1]
output_dir = f'./{model_name}_{ds_name}/models'
logging_dir = f'{output_dir}/logs'

run_name = f'{model_name}_{ds_name}_{time_stamp}'
optim = 'paged_adamw_8bit'

# Create the output and logging directories before launching the script.
from pathlib import Path
Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(logging_dir).mkdir(parents=True, exist_ok=True)

repo_id = f'{model_name}-{ds_name}'

!python ./run_clm.py \
--output_dir {output_dir} \
--logging_dir {logging_dir} \
--model_id {model_id} \
--dataset_path {dataset_path} \
--run_name {run_name} \
--repo_id {repo_id} \
--report_to_wandb 1 \
--epochs 3 \
--max_steps -1 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--gradient_accumulation_steps 4 \
--lr 2e-4 \
--entity 'ft-llmmm' \
--project_name 'SFT_cleaned_training_dm' \
--hub_strategy 'every_save' \
--torch_compile 0 \
--gradient_checkpointing 1 \
--optim 'paged_adamw_8bit' \
--group_by_length 1 \
--hf_token {hf_token} \
--wandb_token {wandb_token} \
--use_flash_attention 1 \
--logging_steps 10 \
--resume_from_checkpoint 1 \
--auto_find_batch_size 0

args is Namespace(model_id='meta-llama/Llama-2-7b-hf', repo_id='Llama-2-7b-hf-eli5-cleaned-wiki65k-1024', hub_strategy='every_save', output_dir='./Llama-2-7b-hf_eli5-cleaned-wiki65k-1024/models', output_data_dir=None, dataset_path='./data/ds_SFT_cleaned_edits', hf_token='[REDACTED]', report_to_wandb=1, wandb_token='[REDACTED]', epochs=3, max_steps=-1, per_device_train_batch_size=32, per_device_eval_batch_size=32, gradient_accumulation_steps=4, max_seq_length=4096, logging_steps=10, optim='paged_adamw_8bit', lr=0.0002, lora_r=64, lora_alpha=16, weight_decay=0.1, lora_dropout=0.1, load_in_4bit=1, load_in_8bit=0, use_peft=1, gradient_checkpointing=1, bf16=1, group_by_length=1, merge_weights=0, seed=42, warmup_ratio=0.03, project_name='SFT_cleaned_training_dm', entity='ft-llmmm', run_name='eli5-cleaned-wiki65k-1024_09.10.23-03.00.20', load_best_model_at_end=1, use_sagemaker=1, torch_compile=0, use_flash_attention=1, resume_from_check

In [None]:
# Experiment: Llama-2-7b on the cleaned ELI5-only dataset
# (./data/ds_ELI5_cleaned_edits, the default ELI5 output of reclean_ELI5).
# The Python lines derive the output/log directories, run name and repo id;
# the shell command then launches run_clm.py with those values.
# NOTE(review): hf_token and wandb_token are passed on the command line and
# the script echoes its parsed arguments, so both secrets end up in this
# cell's saved output — clear it before sharing the notebook.
model_id = "meta-llama/Llama-2-7b-hf" # sharded weights
dataset_path = './data/ds_ELI5_cleaned_edits'
ds_name = 'eli5-cleaned-1024'

# Timestamp that makes the run name unique.
now = datetime.now()
time_stamp = now.strftime("%m.%d.%y-%H.%M.%S")

#model_name = model_id.replace('/','-')
model_name = model_id.split('/')[-1]
#ds_name = dataset_path.split('/')[-1].replace('llama','combined_large').replace(':','-')

#ds_name = dataset_path.split('/')[-1]
output_dir = f'./{model_name}_{ds_name}/models'
logging_dir = f'{output_dir}/logs'

run_name = f'{model_name}_{ds_name}_{time_stamp}'
optim = 'paged_adamw_8bit'

# Create the output and logging directories before launching the script.
from pathlib import Path
Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(logging_dir).mkdir(parents=True, exist_ok=True)

repo_id = f'{model_name}-{ds_name}'

!python ./run_clm.py \
--output_dir {output_dir} \
--logging_dir {logging_dir} \
--model_id {model_id} \
--dataset_path {dataset_path} \
--run_name {run_name} \
--repo_id {repo_id} \
--report_to_wandb 1 \
--epochs 3 \
--max_steps -1 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--gradient_accumulation_steps 4 \
--lr 2e-4 \
--entity 'ft-llmmm' \
--project_name 'SFT_cleaned_training_dm' \
--hub_strategy 'every_save' \
--torch_compile 0 \
--gradient_checkpointing 1 \
--optim 'paged_adamw_8bit' \
--group_by_length 1 \
--hf_token {hf_token} \
--wandb_token {wandb_token} \
--use_flash_attention 1 \
--logging_steps 10 \
--resume_from_checkpoint 0 \
--auto_find_batch_size 0

args is Namespace(model_id='meta-llama/Llama-2-7b-hf', repo_id='Llama-2-7b-hf-eli5-cleaned-1024', hub_strategy='every_save', output_dir='./Llama-2-7b-hf_eli5-cleaned-1024/models', output_data_dir=None, dataset_path='./data/ds_ELI5_cleaned_edits', hf_token='[REDACTED]', report_to_wandb=1, wandb_token='[REDACTED]', epochs=3, max_steps=-1, per_device_train_batch_size=32, per_device_eval_batch_size=32, gradient_accumulation_steps=4, max_seq_length=4096, logging_steps=10, optim='paged_adamw_8bit', lr=0.0002, lora_r=64, lora_alpha=16, weight_decay=0.1, lora_dropout=0.1, load_in_4bit=1, load_in_8bit=0, use_peft=1, gradient_checkpointing=1, bf16=1, group_by_length=1, merge_weights=0, seed=42, warmup_ratio=0.03, project_name='SFT_cleaned_training_dm', entity='ft-llmmm', run_name='Llama-2-7b-hf_eli5-cleaned-1024_09.10.23-13.27.07', load_best_model_at_end=1, use_sagemaker=1, torch_compile=0, use_flash_attention=1, resume_from_checkpoint=0, 

In [None]:
# Experiment: Llama-2-13b on the combined dataset with edit remarks removed
# (./data/ds_SFT_cleaned_edits). This run uses a single epoch, whereas the
# 7b runs above use three.
# The Python lines derive the output/log directories, run name and repo id;
# the shell command then launches run_clm.py with those values.
# NOTE(review): hf_token and wandb_token are passed on the command line and
# the script echoes its parsed arguments, so both secrets end up in this
# cell's saved output — clear it before sharing the notebook.
model_id = "meta-llama/Llama-2-13b-hf" # sharded weights
dataset_path = './data/ds_SFT_cleaned_edits'
ds_name = 'eli5-cleaned-wiki65k-1024'

# Timestamp that makes the run name unique.
now = datetime.now()
time_stamp = now.strftime("%m.%d.%y-%H.%M.%S")

#model_name = model_id.replace('/','-')
model_name = model_id.split('/')[-1]
#ds_name = dataset_path.split('/')[-1].replace('llama','combined_large').replace(':','-')

#ds_name = dataset_path.split('/')[-1]
output_dir = f'./{model_name}_{ds_name}/models'
logging_dir = f'{output_dir}/logs'

run_name = f'{model_name}_{ds_name}_{time_stamp}'
optim = 'paged_adamw_8bit'

# Create the output and logging directories before launching the script.
from pathlib import Path
Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(logging_dir).mkdir(parents=True, exist_ok=True)

repo_id = f'{model_name}-{ds_name}'

!python ./run_clm.py \
--output_dir {output_dir} \
--logging_dir {logging_dir} \
--model_id {model_id} \
--dataset_path {dataset_path} \
--run_name {run_name} \
--repo_id {repo_id} \
--report_to_wandb 1 \
--epochs 1 \
--max_steps -1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 8 \
--lr 2e-4 \
--entity 'ft-llmmm' \
--project_name 'SFT_cleaned_training_dm' \
--hub_strategy 'every_save' \
--torch_compile 0 \
--gradient_checkpointing 1 \
--optim 'paged_adamw_8bit' \
--group_by_length 1 \
--hf_token {hf_token} \
--wandb_token {wandb_token} \
--use_flash_attention 1 \
--logging_steps 10 \
--resume_from_checkpoint 1 \
--auto_find_batch_size 0

args is Namespace(model_id='meta-llama/Llama-2-13b-hf', repo_id='Llama-2-13b-hf-eli5-cleaned-wiki65k-1024', hub_strategy='every_save', output_dir='./Llama-2-13b-hf_eli5-cleaned-wiki65k-1024/models', output_data_dir=None, dataset_path='./data/ds_SFT_cleaned_edits', hf_token='<REDACTED>', report_to_wandb=1, wandb_token='<REDACTED>', epochs=1, max_steps=-1, per_device_train_batch_size=16, per_device_eval_batch_size=16, gradient_accumulation_steps=8, max_seq_length=4096, logging_steps=10, optim='paged_adamw_8bit', lr=0.0002, lora_r=64, lora_alpha=16, weight_decay=0.1, lora_dropout=0.1, load_in_4bit=1, load_in_8bit=0, use_peft=1, gradient_checkpointing=1, bf16=1, group_by_length=1, merge_weights=0, seed=42, warmup_ratio=0.03, project_name='SFT_cleaned_training_dm', entity='ft-llmmm', run_name='Llama-2-13b-hf_eli5-cleaned-wiki65k-1024_09.10.23-21.54.03', load_best_model_at_end=1, use_sagemaker=1, torch_compile=0, use_flash_attention=1,

In [None]:
# Configure and launch a run_clm.py training run for Llama-2-13B on the dataset at `dataset_path`.
model_id = "meta-llama/Llama-2-13b-hf" # sharded weights
dataset_path = './data/ds_ELI5_cleaned_edits'
ds_name = 'eli5-cleaned-1024'  # short dataset tag reused in directory, run and repo names

# Timestamp the run so that repeated launches get distinct run names.
now = datetime.now()
time_stamp = now.strftime("%m.%d.%y-%H.%M.%S")

# The model tag is the last path component of the Hub id; the commented line is a disabled alternative.
#model_name = model_id.replace('/','-')
model_name = model_id.split('/')[-1]
# Disabled alternatives that derive ds_name from dataset_path instead of setting it by hand:
#ds_name = dataset_path.split('/')[-1].replace('llama','combined_large').replace(':','-')

#ds_name = dataset_path.split('/')[-1]
output_dir = f'./{model_name}_{ds_name}/models'
logging_dir = f'{output_dir}/logs'  # logs are nested inside the output directory

run_name = f'{model_name}_{ds_name}_{time_stamp}'
# NOTE(review): `optim` is not interpolated into the command below, which hard-codes --optim.
optim = 'paged_adamw_8bit'

from pathlib import Path
# Create both directories up front; exist_ok makes re-running the cell harmless.
Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(logging_dir).mkdir(parents=True, exist_ok=True)

repo_id = f'{model_name}-{ds_name}'

# IPython expands each {name} to the value of that Python variable before running the command.
# hf_token and wandb_token must already be defined by an earlier cell.
# NOTE(review): both tokens are passed as plain command-line arguments, and the script's printed
# `args is Namespace(...)` line includes them, so clear this cell's output before sharing.
!python ./run_clm.py \
--output_dir {output_dir} \
--logging_dir {logging_dir} \
--model_id {model_id} \
--dataset_path {dataset_path} \
--run_name {run_name} \
--repo_id {repo_id} \
--report_to_wandb 1 \
--epochs 1 \
--max_steps -1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 8 \
--lr 2e-4 \
--entity 'ft-llmmm' \
--project_name 'SFT_cleaned_training_dm' \
--hub_strategy 'every_save' \
--torch_compile 0 \
--gradient_checkpointing 1 \
--optim 'paged_adamw_8bit' \
--group_by_length 1 \
--hf_token {hf_token} \
--wandb_token {wandb_token} \
--use_flash_attention 1 \
--logging_steps 10 \
--resume_from_checkpoint 0 \
--auto_find_batch_size 0

args is Namespace(model_id='meta-llama/Llama-2-13b-hf', repo_id='Llama-2-13b-hf-eli5-cleaned-1024', hub_strategy='every_save', output_dir='./Llama-2-13b-hf_eli5-cleaned-1024/models', output_data_dir=None, dataset_path='./data/ds_ELI5_cleaned_edits', hf_token='<REDACTED>', report_to_wandb=1, wandb_token='<REDACTED>', epochs=1, max_steps=-1, per_device_train_batch_size=16, per_device_eval_batch_size=16, gradient_accumulation_steps=8, max_seq_length=4096, logging_steps=10, optim='paged_adamw_8bit', lr=0.0002, lora_r=64, lora_alpha=16, weight_decay=0.1, lora_dropout=0.1, load_in_4bit=1, load_in_8bit=0, use_peft=1, gradient_checkpointing=1, bf16=1, group_by_length=1, merge_weights=0, seed=42, warmup_ratio=0.03, project_name='SFT_cleaned_training_dm', entity='ft-llmmm', run_name='Llama-2-13b-hf_eli5-cleaned-1024_09.10.23-23.22.01', load_best_model_at_end=1, use_sagemaker=1, torch_compile=0, use_flash_attention=1, resume_from_checkpoint

In [None]:
# Hand the Colab runtime back once the cells above have finished.
# NOTE(review): presumably this disconnects the session and discards kernel state — confirm.
from google.colab import runtime
runtime.unassign()

# Merging Weights

In this section, we define functions that merge the QLoRA adapters into their base models. There are two ways to do this: either we quantize and then dequantize the base model and merge the result with the LoRA layers, or we merge the original (unquantized) base model directly with the LoRA layers. The first is the "careful" merge; the second is the "naive" merge, which is called the "simple" merge below.

In [None]:
import torch
import peft
import json
import shutil
from peft.utils import _get_submodules
import os
import bitsandbytes as bnb
from bitsandbytes.functional import dequantize_4bit
from peft import PeftModel
from transformers import (AutoModelForCausalLM,
                          LlamaForCausalLM,
                          LlamaTokenizer,
                          BitsAndBytesConfig,
                          AutoTokenizer
)
import gc
import copy
from getpass import getpass

## Careful Merge

In [None]:
def dequantize_model(model, tokenizer, dtype=torch.bfloat16, device="cuda"):
    """
    Replace every bitsandbytes 4-bit linear layer of `model` with a plain ``torch.nn.Linear``.

    Each ``bnb.nn.Linear4bit`` module is dequantized with ``dequantize_4bit`` (NF4) and
    swapped, in place, for a ``torch.nn.Linear`` holding the dequantized weights. A bias
    on the quantized layer, if present, is carried over to the replacement.

    Args:
        model (nn.Module): Model whose linear layers were loaded in 4-bit. Modified in place.
        tokenizer: Unused; kept so existing callers keep working.
        dtype (torch.dtype, optional): Data type of the dequantized weights. Default is torch.bfloat16.
        device (str, optional): Device the replacement layers are moved to. Default is "cuda".

    Returns:
        nn.Module: The same `model` object, now free of 4-bit linear layers.
    """
    # Define the class for 4-bit quantization
    cls = bnb.nn.Linear4bit

    with torch.no_grad():
        # Snapshot the quantized layers first: the module tree is edited while we walk it.
        quantized_layers = [
            (name, module)
            for name, module in model.named_modules()
            if isinstance(module, cls)
        ]

        for name, module in quantized_layers:
            print(f"Dequantizing `{name}`...")
            # Work on a copy so the layer's own quantization state is left untouched.
            quant_state = copy.deepcopy(module.weight.quant_state)

            # Set the desired dtype for dequantization. Older bitsandbytes releases keep
            # the state as a list with the dtype at index 2; other state objects are
            # assumed to expose it as a `dtype` attribute instead.
            if isinstance(quant_state, list):
                quant_state[2] = dtype
            else:
                quant_state.dtype = dtype

            # Dequantize the weights
            weights = dequantize_4bit(module.weight.data, quant_state=quant_state, quant_type="nf4").to(dtype)

            # Build the full-precision replacement, preserving the bias when there is one.
            has_bias = module.bias is not None
            new_module = torch.nn.Linear(module.in_features, module.out_features, bias=has_bias, dtype=dtype)
            new_module.weight = torch.nn.Parameter(weights)
            if has_bias:
                new_module.bias = torch.nn.Parameter(module.bias.data.to(dtype))
            new_module.to(device=device, dtype=dtype)

            # Replace the original module with the dequantized one
            parent, target, target_name = _get_submodules(model, name)
            setattr(parent, target_name, new_module)

        # A hack to avoid Hugging Face's saving error, as it does not support saving a model registered for 4-bit loading.
        model.is_loaded_in_4bit = False
        return model

def merge_weights(base_model_id,
                  adapter_model_id,
                  hf_token,
                  dtype=torch.bfloat16,
                  device="cuda"):
    """
    Perform the "careful" merge of one LoRA adapter and push the result to the Hugging Face Hub.

    The base model is loaded in 4-bit NF4 (double quantization), dequantized with
    `dequantize_model`, wrapped with the adapter, merged, and pushed together with the
    base tokenizer to the repository ``adapter_model_id + '_merged'``.

    Args:
        base_model_id (str): Hugging Face model ID for the base model.
        adapter_model_id (str): Hugging Face model ID for the adapter model.
        hf_token (str): Hugging Face authentication token used to download the base model and tokenizer.
        dtype (torch.dtype, optional): Data type for loading, 4-bit compute and dequantization.
            Default is torch.bfloat16.
        device (str, optional): Device the dequantized layers are moved to. Default is "cuda".
            The quantized base model itself is always placed on GPU 0.

    Returns:
        None
    """
    # Create a unique repository ID for the merged model
    repo_id = adapter_model_id+'_merged'

    # Define quantization configuration
    quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=dtype,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

    # Load the base model with 4-bit quantization. The quantization mode comes from
    # `quantization_config` alone; the separate `load_in_4bit` keyword is not repeated here.
    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=dtype,
        quantization_config=quantization_config,
        device_map={"": 0},
        use_auth_token=hf_token
    )

    # Initialize tokenizer for the base model
    tok = AutoTokenizer.from_pretrained(base_model_id,
                                         use_auth_token=hf_token
                                        )

    # Dequantize the base model, honouring the caller's dtype and device
    model = dequantize_model(model, tok, dtype=dtype, device=device)

    # Load the adapter model
    model = PeftModel.from_pretrained(model=model, model_id=adapter_model_id)

    # Merge and unload the models
    model = model.merge_and_unload()

    # Push the merged model and tokenizer to the Hugging Face Model Hub
    model.push_to_hub(repo_id, safe_serialization=True)
    tok.push_to_hub(repo_id)

Below, we perform the careful merge for the 7B and 13B models we fine-tuned.

In [None]:
# Adapter repositories to merge into the 7B base model with the careful merge.
adapter_models = [
    'dhmeltzer/llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16',
    'dhmeltzer/llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16',
    'dhmeltzer/llama-7b-SFT_ds_eli5_1024_r_64_alpha_16',
    'dhmeltzer/Llama-2-7b-hf-eli5-cleaned-1024_qlora',
    'dhmeltzer/Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora',
]
base_model_id = 'meta-llama/Llama-2-7b-hf'

# One merge_weights call per adapter; each call reloads the base model and pushes
# its result to '<adapter repo>_merged'. hf_token must be defined by an earlier cell.
for adapter_model in adapter_models:
    merge_weights(base_model_id,
                  adapter_model,
                  hf_token,
                  dtype=torch.bfloat16,
                  device="cuda")

In [None]:
# Adapter repositories to merge into the 13B base model with the careful merge.
adapter_models = [
    'dhmeltzer/Llama-2-13b-hf-eli5-cleaned-1024_qlora',
    'dhmeltzer/Llama-2-13b-hf-eli5-cleaned-wiki65k-1024_qlora',
    'dhmeltzer/Llama-2-13b-hf-ds_wiki_1024_full_r_64_alpha_16',
    'dhmeltzer/Llama-2-13b-hf-ds_eli5_1024_r_64_alpha_16',
    'dhmeltzer/Llama-2-13b-hf-eli5-wiki-1024_r_64_alpha_16'
]

base_model_id = 'meta-llama/Llama-2-13b-hf'

# One merge_weights call per adapter; each call reloads the base model and pushes
# its result to '<adapter repo>_merged'. hf_token must be defined by an earlier cell.
for adapter_model in adapter_models:
    merge_weights(base_model_id,
                  adapter_model,
                  hf_token,
                  dtype=torch.bfloat16,
                  device="cuda")

## Simple Merge

Next, we define a function to perform a simple merge and use that on our fine-tuned LoRA models.

In [None]:
def simple_merge(base_model_id,
                  adapter_model_ids,
                  hf_token,
                  dtype=torch.bfloat16,
                  device="cuda"):
    """
    Merge each adapter directly into the unquantized base model and push every result to the Hub.

    A fresh copy of the base model is loaded for every adapter. Attaching and merging an
    adapter with PEFT rewrites the wrapped model's layers in place, so reusing one base
    model across the loop would stack each adapter on top of the previous ones.

    Args:
        base_model_id (str): Hugging Face model ID for the base model.
        adapter_model_ids (list): List of Hugging Face model IDs for the adapter models.
        hf_token (str): Hugging Face authentication token used to download the base model and tokenizer.
        dtype (torch.dtype, optional): Data type the base model is loaded in. Default is torch.bfloat16.
        device (str, optional): Unused; the model is loaded without a device map. Kept so
            existing callers keep working.

    Returns:
        None
    """
    # Initialize tokenizer for the base model (shared by every merged repository)
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_auth_token=hf_token)

    # Iterate through adapter models and perform simple merge
    for adapter_model_id in adapter_model_ids:
        # Start from pristine base weights for every adapter
        model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            return_dict=True,
            torch_dtype=dtype,
            use_auth_token=hf_token,
        )

        # Load adapter model and perform merge
        peft_model = PeftModel.from_pretrained(model, adapter_model_id)
        peft_model.eval()
        merged_model = peft_model.merge_and_unload()

        # Push the merged model and tokenizer to the Hugging Face Model Hub
        merged_repo_id = f'{adapter_model_id}_simple_merge'
        merged_model.push_to_hub(merged_repo_id)
        tokenizer.push_to_hub(merged_repo_id)

        # Drop this iteration's weights before the next base model is loaded
        del peft_model, merged_model, model
        gc.collect()

Below, we perform the simple merge for several 7B models. We did not perform it for the 13B models because the simple merge leads to worse results.

In [None]:
# Adapter repositories to merge into the 7B base model with the simple merge.
base_model_id='meta-llama/Llama-2-7b-hf'
adapter_model_ids=[
    'dhmeltzer/Llama-2-7b-hf-eli5-cleaned-1024_qlora',
    'dhmeltzer/Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora',
    'dhmeltzer/llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16',
    'dhmeltzer/llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16',
    'dhmeltzer/llama-7b-SFT_ds_eli5_1024_r_64_alpha_16'
]

# A single call handles the whole list; each result is pushed to '<adapter repo>_simple_merge'.
# hf_token must be defined by an earlier cell.
simple_merge(base_model_id,
                  adapter_model_ids,
                  hf_token,
                  dtype=torch.bfloat16,
                  device="cuda")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)/adapter_config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

Downloading (…)er_model.safetensors:   0%|          | 0.00/640M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)/adapter_config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

Downloading (…)er_model.safetensors:   0%|          | 0.00/640M [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

# Scratch (ignore)

## Inference

### Computing Predictions

In [None]:
from huggingface_hub import login
from collections import defaultdict
import transformers
from transformers import AutoTokenizer
from tqdm import tqdm
from peft import PeftModel
import pickle
import os
import pandas as pd
from transformers import pipeline

In [None]:
def inference_formatting(example):
    """Wrap `example` in the '### Human: ... ### Assistant:' prompt template used for inference."""
    prompt_template = "### Human: {}\n ### Assistant:"
    return prompt_template.format(example)

def generate_examples(model,
                      tokenizer,
                      data,
                      padding=True,
                      max_new_tokens=512):
    """
    Sample one completion per prompt in `data` and return the decoded texts.

    Args:
        model: Model exposing `generate` and `device`.
        tokenizer: Tokenizer used to encode the prompts and decode the outputs.
        data: Mapping with a 'prompt' entry holding the batch of prompt strings.
        padding: Forwarded to the tokenizer's `padding` argument. Default is True.
        max_new_tokens (int): Upper bound on generated tokens per prompt. Default is 512.

    Returns:
        list[str]: One decoded string per sequence returned by `model.generate`, with
        special tokens stripped. The full returned sequences are decoded
        (NOTE(review): for causal LMs this presumably includes the prompt text — confirm).
    """
    generation_config = transformers.GenerationConfig(num_beams = 1,
                                         max_new_tokens = max_new_tokens,
                                         do_sample = True,
                                         temperature = .6,
                                         top_p = 0.9,
                                         repetition_penalty = 1.2,
                                         #pad_token_id = model.config.eos_token_id
                                        )

    prompts = data['prompt']

    # Put the inputs on the model's own device rather than assuming CUDA, so the
    # CPU fallback chosen by the caller keeps working.
    encoded = tokenizer(prompts, return_tensors = 'pt', padding = padding).to(model.device)

    output_ids = model.generate(input_ids = encoded['input_ids'],
                                attention_mask = encoded['attention_mask'],
                                generation_config = generation_config,
                                )

    return [tokenizer.decode(ids, skip_special_tokens = True) for ids in output_ids]

def _select_eval_subsets(ds, seed, size):
    """Build the fixed evaluation sample (with a 'prompt' column) for every dataset in `ds`."""
    subsets = {}
    for ds_name in ds:
        subset = ds[ds_name]['validation'].map(
            lambda x: {'prompt': inference_formatting(x['question'])})
        subset = subset.shuffle(seed=seed)
        subsets[ds_name] = subset.select(range(size))
    return subsets


def _read_cached_predictions(file_pkl):
    """Return the prediction list pickled at `file_pkl`, or an empty list if the file is absent."""
    if not os.path.exists(file_pkl):
        return []
    # NOTE: pickle.load can execute arbitrary code; only read files this notebook wrote.
    with open(file_pkl, 'rb') as f:
        return list(pickle.load(f))


def _load_model_for_inference(base_model, model_id):
    """Load the tokenizer for `model_id` and the bfloat16 model (adapter-wrapped if `base_model` is set)."""
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # A '[PAD]' token is registered, then padding is pointed at EOS; batches are left-padded.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    if base_model:
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            device_map="auto",
            torch_dtype = torch.bfloat16
            )
        model = PeftModel.from_pretrained(model = model,
                        model_id = model_id,
                        torch_dtype = torch.bfloat16,
                        is_trainable = False)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype = torch.bfloat16,
            )

    # Keep the embedding matrix in step with the tokenizer after the token was added.
    model.resize_token_embeddings(len(tokenizer))
    model.eval()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        model.to(device)
    except Exception as err:
        # Best effort only: the model was already placed by device_map="auto".
        print(f'could not move model to {device} ({err}); keeping its current placement')

    return model, tokenizer


def generate_df_predictions(model_ids,
                            ds,
                            output_dir,
                            batch_size=16,
                            seed = 50,
                            size = 100,
                            padding=True,
                            predictions_dir = './val_results_new_merge'):
    """
    Generate predictions for every (model, dataset) pair and write predictions and scores to CSV.

    For each dataset a fixed validation sample is drawn once (shuffle with `seed`, first
    `size` rows). Predictions are pickled to `predictions_dir` after every batch; a later
    call resumes from a partial pickle instead of treating it as complete. ROUGE and
    BERTScore are computed against the 'QA' column of the sample belonging to the same
    dataset as the predictions.

    Args:
        model_ids: Iterable of ``(base_model, model_id)`` pairs. When `base_model` is truthy,
            `model_id` is loaded as a PEFT adapter on top of it; otherwise `model_id` is
            loaded directly.
        ds (dict): Maps a dataset name to a dataset with a 'validation' split that has
            'question' and 'QA' columns.
        output_dir (str): Directory for the ``*_predictions.csv``, ``*_rouge.csv`` and
            ``*_bertscore.csv`` files.
        batch_size (int): Number of prompts per generation call. Default is 16.
        seed (int): Shuffle seed for the validation sample. Default is 50.
        size (int): Number of validation rows evaluated per dataset. Default is 100.
        padding: Forwarded to the tokenizer by `generate_examples`. Default is True.
        predictions_dir (str): Directory for the per-(model, dataset) prediction pickles.

    Returns:
        None

    Note:
        Existing score CSVs are never overwritten; delete them to force rescoring.
    """
    os.makedirs(output_dir,exist_ok=True)
    os.makedirs(predictions_dir,exist_ok=True)
    rouge = evaluate.load('rouge')
    bertscore = evaluate.load("bertscore")

    # One sample per dataset, built once, so prompts and references always correspond.
    eval_subsets = _select_eval_subsets(ds, seed, size)

    predictions = defaultdict(list)
    rouge_scores = {}
    bert_scores = {}

    for base_model, model_id in model_ids:
        model_name = model_id.split('/')[-1]
        print(f'working on model {model_name}')

        # Pick up whatever earlier runs already produced for this model.
        for ds_name in eval_subsets:
            file_pkl = f'{predictions_dir}/{model_name}_{ds_name}.pkl'
            predictions[model_name,ds_name] = _read_cached_predictions(file_pkl)

        # Skip loading the model only when every dataset is fully covered.
        if all(len(predictions[model_name,ds_name]) >= len(eval_subsets[ds_name])
               for ds_name in eval_subsets):
            continue

        model, tokenizer = _load_model_for_inference(base_model, model_id)

        for ds_name, ds_small in eval_subsets.items():
            print(f'working on dataset {ds_name}')

            file_pkl = f'{predictions_dir}/{model_name}_{ds_name}.pkl'
            # Resume after the rows that already have predictions.
            n_done = len(predictions[model_name,ds_name])

            for k in tqdm(range(n_done,len(ds_small),batch_size)):
                prediction = generate_examples(model,tokenizer, ds_small[k:k+batch_size],padding=padding)
                predictions[model_name,ds_name].extend(prediction)

                # Checkpoint after every batch so an interrupted run loses little work.
                with open(file_pkl, 'wb') as f:
                    pickle.dump(predictions[model_name,ds_name], f)

        # Release this model's memory before the next one is loaded.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    for model_name, ds_name in predictions:
        print(f'computing predictions for {(model_name,ds_name)}')

        preds = predictions[(model_name,ds_name)]
        # References come from the sample of the same dataset the predictions were made on.
        references = eval_subsets[ds_name]['QA']

        preds_file = output_dir+f'/{model_name}_{ds_name}_predictions.csv'
        rouge_file = output_dir+f'/{model_name}_{ds_name}_rouge.csv'
        bertscore_file = output_dir+f'/{model_name}_{ds_name}_bertscore.csv'

        if not os.path.exists(rouge_file):

            rouge_scores[(model_name,ds_name)] = rouge.compute(
                predictions = preds,
                references = references
            )
            df_rouge = pd.DataFrame(rouge_scores[(model_name,ds_name)],
                                    index=[0])
            df_rouge.to_csv(rouge_file)

        if not os.path.exists(bertscore_file):

            bert_scores[(model_name,ds_name)] = bertscore.compute(
                predictions = preds,
                references = references,
                lang='en')

            df_bert = pd.DataFrame(bert_scores[(model_name,ds_name)],
                                   )
            df_bert.to_csv(bertscore_file)

        gc.collect()
        torch.cuda.empty_cache()

        df_preds = pd.DataFrame(preds)
        df_preds.to_csv(preds_file,index=True)


In [None]:
# Fetch the dataset artifact inside a short-lived W&B run; the run is closed when the
# `with` block exits, while `artifact_dir` stays available afterwards.
with wandb.init(project='SFT_training_DM',
                entity='ft-llmmm',
                job_type='download_data',
                name=f'download_combined_data') as run:

    artifact = run.use_artifact('ft-llmmm/ELI5_analysis/llama_QA_tokenized_1024:v1', type='dataset')
    artifact_dir = artifact.download()

# Three views of the same data keyed by name: everything, rows whose 'source' is
# 'simple_wiki', and rows from any other source.
ds = {}
ds['full'] = datasets.load_from_disk(artifact_dir)
ds['wiki'] = ds['full'].filter(lambda x: x['source']=='simple_wiki')
ds['eli5'] = ds['full'].filter(lambda x: x['source']!='simple_wiki')

[34m[1mwandb[0m: Tracking run with wandb version 0.15.10
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/drive/MyDrive/LLMs/Fine-tuning/SFT/wandb/run-20230919_202615-uhqd5g6s[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdownload_combined_data[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/ft-llmmm/SFT_training_dm[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/ft-llmmm/SFT_training_dm/runs/uhqd5g6s[0m
[34m[1mwandb[0m: Downloading large artifact llama_QA_tokenized_1024:v1, 263.39MB. 10 files... 
[34m[1mwandb[0m:   10 of 10 files downloaded.  
Done. 0:0:1.8
[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m: 🚀 View run [33mdownload_combined_data[0m at: [34m[4mhttps://wandb.ai/ft-llmmm/SFT_training_dm/runs/uhqd5g6s[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other fil

In [None]:
# (base_model, model_id) pairs for the 7B evaluation; base_model is None because
# every entry is loaded directly from its own repository.
model_ids = [
    (None, 'meta-llama/Llama-2-7b-hf'),
    (None, 'dhmeltzer/llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged'),
    (None, 'dhmeltzer/llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged'),
    (None, 'dhmeltzer/llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged'),
    (None, 'dhmeltzer/Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged'),
    (None, 'dhmeltzer/Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged'),
]

In [None]:
# Generate and score predictions for the models in `model_ids`: prediction pickles go to
# ./val_results_512, CSV outputs to ./llama-2-inference-512.
generate_df_predictions(model_ids,
                        ds,
                        './llama-2-inference-512',
                        batch_size=8,
                        padding=True,
                        predictions_dir = './val_results_512')

In [None]:
# Hand the Colab runtime back once the cells above have finished.
# NOTE(review): presumably this disconnects the session and discards kernel state — confirm.
from google.colab import runtime
runtime.unassign()

In [None]:
# Evaluate the 13B base model and one merged fine-tune, one model per call, with the
# same output locations and batch size for both.
for hub_id in ('meta-llama/Llama-2-13b-hf',
               'dhmeltzer/Llama-2-13b-hf-eli5-cleaned-wiki65k-1024_qlora_merged'):
    model_ids = [(None, hub_id)]
    generate_df_predictions(
        model_ids,
        ds,
        './llama-2-inference-512',
        batch_size=4,
        padding=True,
        predictions_dir='./val_results_512',
    )

# Hand the Colab runtime back only after both evaluations have completed.
from google.colab import runtime
runtime.unassign()

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

working on model Llama-2-13b-hf


Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


Downloading (…)lve/main/config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

working on dataset full


100%|██████████| 25/25 [10:12<00:00, 24.51s/it]


working on dataset wiki


100%|██████████| 25/25 [10:07<00:00, 24.29s/it]


working on dataset eli5


100%|██████████| 25/25 [10:07<00:00, 24.30s/it]


computing predictions for ('Llama-2-13b-hf', 'full')


Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


computing predictions for ('Llama-2-13b-hf', 'wiki')
computing predictions for ('Llama-2-13b-hf', 'eli5')
working on model Llama-2-13b-hf-eli5-cleaned-wiki65k-1024_qlora_merged


Downloading (…)okenizer_config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/29.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

working on dataset full


100%|██████████| 25/25 [10:13<00:00, 24.55s/it]


working on dataset wiki


100%|██████████| 25/25 [10:13<00:00, 24.55s/it]


working on dataset eli5


100%|██████████| 25/25 [10:14<00:00, 24.59s/it]


computing predictions for ('Llama-2-13b-hf-eli5-cleaned-wiki65k-1024_qlora_merged', 'full')


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


computing predictions for ('Llama-2-13b-hf-eli5-cleaned-wiki65k-1024_qlora_merged', 'wiki')
computing predictions for ('Llama-2-13b-hf-eli5-cleaned-wiki65k-1024_qlora_merged', 'eli5')


In [None]:
# Merged 13B fine-tunes to evaluate in a single call; base_model is None because each
# repository is loaded directly.
model_ids = [
    (None, 'dhmeltzer/Llama-2-13b-hf-eli5-cleaned-1024_qlora_merged'),
    (None, 'dhmeltzer/Llama-2-13b-hf-ds_wiki_1024_full_r_64_alpha_16_merged'),
    (None, 'dhmeltzer/Llama-2-13b-hf-ds_eli5_1024_r_64_alpha_16_merged'),
    (None, 'dhmeltzer/Llama-2-13b-hf-eli5-wiki-1024_r_64_alpha_16_merged'),
]

generate_df_predictions(
    model_ids,
    ds,
    './llama-2-inference-512',
    batch_size=2,
    padding=True,
    predictions_dir='./val_results_512',
)

# Hand the Colab runtime back only after the evaluation has completed.
from google.colab import runtime
runtime.unassign()

working on model Llama-2-13b-hf-eli5-cleaned-1024_qlora_merged
working on model Llama-2-13b-hf-ds_wiki_1024_full_r_64_alpha_16_merged


Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

working on dataset full


100%|██████████| 50/50 [20:35<00:00, 24.71s/it]


working on dataset wiki


100%|██████████| 50/50 [20:27<00:00, 24.56s/it]


working on dataset eli5


100%|██████████| 50/50 [20:36<00:00, 24.73s/it]


working on model Llama-2-13b-hf-ds_eli5_1024_r_64_alpha_16_merged


Downloading (…)okenizer_config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/29.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

working on dataset full


  0%|          | 0/50 [03:34<?, ?it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-79fc989ea668>", line 8, in <cell line: 8>
    generate_df_predictions(model_ids,
  File "<ipython-input-5-0833b95545d8>", line 115, in generate_df_predictions
    prediction = generate_examples(model,tokenizer, ds_small[k:k+batch_size],padding=padding)
  File "<ipython-input-5-0833b95545d8>", line 22, in generate_examples
    output_ids = model.generate(input_ids = input['input_ids'],
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 1652, in generate
    return self.sample(
  File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2734, in sample
    outputs = self

In [None]:
# Manual memory cleanup between runs.
# NOTE(review): the CUDA cache is emptied before the garbage collector runs, so tensors
# freed by gc.collect() are presumably not returned to the driver by this cell — confirm
# whether the two calls should be swapped.
torch.cuda.empty_cache()
import gc
gc.collect()

0

In [None]:
# Hand the Colab runtime back once the cells above have finished.
# NOTE(review): presumably this disconnects the session and discards kernel state — confirm.
from google.colab import runtime
runtime.unassign()

## Analyzing Results

### 7B models

In [None]:
import numpy as np

In [None]:
model_names = []
model_names.append('Llama-2-7b-hf')

model_names.append('llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged')
model_names.append('llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged')
model_names.append('llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged')

model_names.append('Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged')
model_names.append('Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged')

In [None]:
# Load every per-model / per-dataset predictions file and stack them into one
# table indexed by (model_name, dataset).
# Frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
prediction_frames = []

for model_name in model_names:
    for ds_name in ['full','wiki','eli5']:
        predictions_file = f'./llama-2-inference-512/{model_name}_{ds_name}_predictions.csv'

        temp = pd.read_csv(predictions_file,
                           index_col='Unnamed: 0')
        # Transpose so the CSV's row labels become the columns.
        temp = temp.T
        temp['model_name'] = model_name
        temp['dataset'] = ds_name

        prediction_frames.append(temp)

df_predictions = pd.concat(prediction_frames)
df_predictions = df_predictions.set_index(['model_name','dataset'])

In [None]:
# Load every per-model / per-dataset ROUGE file and stack them into one table
# indexed by (model_name, dataset).
# Frames are collected in a list and concatenated once instead of growing a
# DataFrame with pd.concat on every iteration.
rouge_frames = []

for model_name in model_names:
    for ds_name in ['full','wiki','eli5']:
        file_name = f'./llama-2-inference-512/{model_name}_{ds_name}_rouge.csv'

        df_temp = pd.read_csv(file_name,
                              index_col='Unnamed: 0')
        df_temp['model_name'] = model_name
        df_temp['dataset'] = ds_name

        rouge_frames.append(df_temp)

df_rouge = pd.concat(rouge_frames)
df_rouge = df_rouge.set_index(['model_name','dataset'])

In [None]:
# Load every per-model / per-dataset BERTScore file, reduce it to the mean
# precision / recall / f1, and stack the one-row summaries into a table
# indexed by (model_name, dataset).
# Rows are collected in a list and concatenated once instead of growing a
# DataFrame with pd.concat on every iteration.
bertscore_frames = []

for model_name in model_names:
    for ds_name in ['full','wiki','eli5']:
        file_name = f'./llama-2-inference-512/{model_name}_{ds_name}_bertscore.csv'

        temp = pd.read_csv(file_name,
                           index_col='Unnamed: 0')

        # Column means as a single-row frame (one row per model/dataset pair).
        temp = pd.DataFrame(temp[['precision','recall','f1']].mean()).T
        temp['model_name'] = model_name
        temp['dataset'] = ds_name

        bertscore_frames.append(temp)

df_bertscore = pd.concat(bertscore_frames)
df_bertscore = df_bertscore.set_index(['model_name','dataset'])

In [None]:
!pip install textstat --quiet
from textstat import flesch_reading_ease as fre
from textstat import flesch_kincaid_grade as fkg

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m706.1 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from pprint import pprint

In [None]:
df_predictions.applymap(fkg).mean(axis=1)

model_name                                            dataset
Llama-2-7b-hf                                         full       11.636
                                                      wiki       11.013
                                                      eli5       10.799
llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged     full       10.681
                                                      wiki       10.695
                                                      eli5       13.777
llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged   full       10.314
                                                      wiki        9.982
                                                      eli5        9.237
llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged        full        7.855
                                                      wiki        7.606
                                                      eli5        8.509
Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged  full       10.054
  

In [None]:
df_predictions.applymap(fre).mean(axis=1)

model_name                                            dataset
Llama-2-7b-hf                                         full       48.9020
                                                      wiki       51.4365
                                                      eli5       52.6770
llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged     full       55.5700
                                                      wiki       55.8206
                                                      eli5       44.7725
llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged   full       56.8602
                                                      wiki       57.5070
                                                      eli5       61.9205
llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged        full       68.6513
                                                      wiki       69.1257
                                                      eli5       65.0010
Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged  full    

In [None]:
for idx in df_predictions.index:
    if idx[1]!='full':
        continue
    pprint(f'Model: {idx[0]}')
    print('-'*100)
    pprint(df_predictions.loc[idx][70])
    print('-'*100)

In [None]:
test=df_predictions.loc[idx][70]
list(map(lambda x:x.strip(),test))

### GPT-4 As Judge at 7B

In [None]:
os.environ['OPENAI_API_KEY']=getpass()
!pip install openai
import openai

Installing collected packages: openai
Successfully installed openai-0.28.0


In [None]:
import time
API_MAX_RETRY = 16
API_RETRY_SLEEP = 10
API_ERROR_OUTPUT = "$ERROR$"
system_message = 'You are a helpful assistant.'


def get_GPT_judgement(QA_pair,
                       model='gpt-4',
                       temperature = 0,
                       max_tokens = 2048,
                       system_message='You are a helpful assistant.'):
    """Ask an OpenAI chat model to grade one question/answer pair.

    Args:
        QA_pair: Text containing a '### Human:' marker followed by the
            question and a '### Assistant:' marker followed by the answer.
            Only the text after the first '### Human:' is used, and the
            answer is the segment between the first and (if present) second
            '### Assistant:' marker.
        model: Model name passed to openai.ChatCompletion.create.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion token limit passed to the API.
        system_message: Content of the system message sent with the prompt.

    Returns:
        The message content of the first returned choice, or
        API_ERROR_OUTPUT if all API_MAX_RETRY attempts raised
        openai.error.OpenAIError.

    Raises:
        IndexError: If QA_pair has no '### Human:' or no '### Assistant:'
            marker.
    """

    # Split into [question, answer, ...] and trim surrounding whitespace.
    QA_pair = QA_pair.split('### Human:')[1].split('### Assistant:')
    QA_pair = list(map(lambda x:x.strip(),QA_pair))

    question = QA_pair[0]
    answer = QA_pair[1]

    # NOTE: the physical line breaks and leading indentation inside this
    # triple-quoted literal are part of the prompt text sent to the model.
    prompt = f"""[Instruction]\nPlease act as an impartial judge and
    evaluate the quality of the response provided by an AI assistant
    to the user question displayed below. Your evaluation should consider
    factors such as the following:\n\n- **Simplicity**: Is the response
    clear and straightforward enough for middle or high school students to
    understand?\n- **Helpfulness**: Does the response effectively address
    the query?\n- **Relevance**: Does the response directly pertain to the
    question?\n- **Accuracy**: Is the response factually correct?\n-
    **Depth & Creativity**: Assess richness without favoring excessive
    detail.\n\nAim for a holistic assessment. Begin your evaluation by
    providing a short explanation. Be as objective as possible. After
    providing your explanation, you must rate the response on a scale
    of 1 to 10 by strictly following this format: \'[[rating]]\',
    for example: \'Rating: [[5]]\'.\n\n[Question]\n{question}\n\n
    [The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's
    Answer]"""

    # Default to the error placeholder; it is only replaced on a successful call.
    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": prompt}
                    ],
                n=1,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            output = response["choices"][0]["message"]["content"]
            break
        except openai.error.OpenAIError as e:
            # Report the failure and wait before the next attempt.
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)

    return output

In [None]:
from collections import defaultdict
from tqdm import tqdm

In [None]:
df_GPT_35_judgements = defaultdict(dict)

In [None]:
# Judge the first 100 'full'-dataset predictions of every model with
# gpt-3.5-turbo, resuming from whatever is already in df_GPT_35_judgements.
model_names = sorted(list(set([idx[0] for
                   idx in df_predictions.index])))

for model_name in model_names:
    print(f'working on model {model_name}')
    for j in tqdm(range(100)):
        # Skip prompts that already have a real judgement, but retry entries
        # that only hold the API_ERROR_OUTPUT placeholder left by a failed
        # call; otherwise a failure would never be retried on re-run.
        cached = df_GPT_35_judgements[model_name].get(j, API_ERROR_OUTPUT)
        if cached != API_ERROR_OUTPUT:
            continue
        QA_pair = df_predictions.loc[(model_name,'full')][j]
        df_GPT_35_judgements[model_name][j] = get_GPT_judgement(QA_pair,
                                                                model='gpt-3.5-turbo')

In [None]:
# Scratch probe for whether `x` exists. Only an undefined name is expected,
# so catch NameError rather than swallowing every exception.
try:
    x is not None
except NameError:
    print('not defined')

not defined


In [None]:
# Keep judgements already collected in this session; create a fresh store
# only when the name has never been defined (NameError), instead of hiding
# every possible exception behind a bare except.
try:
    df_GPT_4_judgements
except NameError:
    print('reinitialize GPT-4 judgements')
    df_GPT_4_judgements = defaultdict(dict)

In [None]:

# Judge the first 100 'full'-dataset predictions of every model with gpt-4,
# resuming from whatever is already in df_GPT_4_judgements.
model_names = sorted(list(set([idx[0] for
                   idx in df_predictions.index])))

for model_name in model_names:
    print(f'working on model {model_name}')
    for j in tqdm(range(100)):
        # Skip prompts that already have a real judgement, but retry entries
        # that only hold the API_ERROR_OUTPUT placeholder left by a failed
        # call; otherwise a failure would never be retried on re-run.
        cached = df_GPT_4_judgements[model_name].get(j, API_ERROR_OUTPUT)
        if cached != API_ERROR_OUTPUT:
            continue
        QA_pair = df_predictions.loc[(model_name,'full')][j]
        df_GPT_4_judgements[model_name][j] = get_GPT_judgement(QA_pair,
                                                                model='gpt-4')

In [None]:
GPT35_judgements = pd.DataFrame(df_GPT_35_judgements)

In [None]:
GPT35_judgements.to_csv('./llama-2-inference-512/GPT35_judgements.csv')

In [None]:
GPT4_judgements = pd.DataFrame(df_GPT_4_judgements)

In [None]:
GPT4_judgements.to_csv('./llama-2-inference-512/GPT4_judgements.csv')

In [None]:
def find_rating(string):
    """Extract the numeric rating from a judge response.

    The judge is asked to end with a rating of 1 to 10 written as ``[[N]]``.
    Returns the integer digits of the first such rating as a string (callers
    convert it with ``int``).

    Raises:
        ValueError: If the text contains no ``[[`` followed by digits.
    """
    import re  # local import so this cell does not depend on other cells

    # Capture every digit after "[[" so that "[[10]]" yields "10" rather than
    # just its first character "1".
    match = re.search(r'\[\[\s*(\d+)', string)
    if match is None:
        raise ValueError('no [[rating]] marker found in judge response')
    return match.group(1)

In [None]:
df_predictions.to_csv('./llama-2-inference-512/predictions_combined.csv')

In [None]:
df_predictions.loc[
    ('Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged','full')][13]

In [None]:
GPT35_judgements['Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged'][13]

In [None]:
GPT35_judgements=pd.read_csv(
    './llama-2-inference-512/GPT35_judgements.csv',
    index_col='Unnamed: 0')

In [None]:
GPT35_judgements

In [None]:
scores_35 = GPT35_judgements.T.applymap(find_rating).applymap(int).mean(axis=1)

sorted(list(scores_35.items()),key = lambda x:x[1])

[('llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged', 2.97),
 ('Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged', 3.5),
 ('Llama-2-7b-hf', 4.34),
 ('llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged', 4.53),
 ('llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged', 4.75),
 ('Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged', 5.05)]

In [None]:
scores4 = GPT4_judgements.T.applymap(find_rating).applymap(int).mean(axis=1)

In [None]:
sorted(list(scores4.items()),key = lambda x:x[1])

[('llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged', 2.43),
 ('Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged', 2.58),
 ('Llama-2-7b-hf', 2.92),
 ('llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged', 3.45),
 ('llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged', 3.58),
 ('Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged', 3.89)]

### Analysis at 13B

In [None]:
import numpy as np

In [None]:
# Load the combined 13B predictions table and reshape it so that rows are
# indexed by (dataset, model) and each remaining column is one example.
df_predictions = pd.read_csv('./llama-2-13b-inference/predictions.csv')
# Shorten column headers: keep the text before the first '.' and before
# '_1024'. (Presumably this strips pandas' '.1', '.2' duplicate-column
# suffixes and a training-config suffix -- TODO confirm against the CSV.)
df_predictions.columns = [col.split('.')[0].split('_1024')[0]
                    for col in df_predictions.columns]
# Transpose so each original column becomes a row; reset_index() turns the
# old column headers into an ordinary column.
df_predictions = df_predictions.T.reset_index()
# Name the second cell of the first row so that, once that row is promoted
# to the header below, the column can be addressed as 'dataset'.
df_predictions.iloc[0,1] = 'dataset'
# Promote the first row to column headers, then drop it from the data.
df_predictions.columns = df_predictions.iloc[0,:]
df_predictions = df_predictions.iloc[1:,:].rename(columns={"Unnamed: 0":'model'})
df_predictions = df_predictions.set_index(['model','dataset'])
# Make 'dataset' the outer index level and order the rows eli5, wiki, full.
df_predictions = df_predictions.swaplevel().unstack().loc[['eli5','wiki','full']].stack()

In [None]:
# Load the 13B ROUGE table and reshape it to a (dataset, model) index.
df_rouge = pd.read_csv('./llama-2-13b-inference/rouge.csv')
df_rouge.columns = [col.split('.')[0].split('_1024')[0]
                    for col in df_rouge.columns]
df_rouge = df_rouge.T.reset_index()
df_rouge.iloc[0,1] = 'dataset'
df_rouge.columns = df_rouge.iloc[0,:]
df_rouge = df_rouge.iloc[1:,:].rename(columns={"Unnamed: 0":'model'})
df_rouge = df_rouge.set_index(['model','dataset'])
df_rouge = df_rouge.swaplevel().unstack().loc[['eli5','wiki','full']].stack()
# The scores are read back as numeric strings; parse them with float()
# rather than eval() so file contents are never executed as code.
df_rouge.applymap(lambda x:np.round(float(x),4))

Unnamed: 0_level_0,Unnamed: 1_level_0,rouge1,rouge2,rougeL,rougeLsum
dataset,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
eli5,Llama-2-13b-hf-eli5-wiki-1024_qlora_merged,0.3731,0.213,0.2753,0.2853
wiki,Llama-2-13b-hf-eli5-wiki-1024_qlora_merged,0.181,0.0089,0.0895,0.118
full,Llama-2-13b-hf-eli5-wiki-1024_qlora_merged,0.1875,0.0089,0.0902,0.1209


In [None]:
import ast

# Load the 13B BERTScore table and reshape it to a (dataset, model) index.
df_bertscore = pd.read_csv('./llama-2-13b-inference/bertscore.csv')
df_bertscore.columns = [col.split('.')[0].split('_1024')[0]
                    for col in df_bertscore.columns]
df_bertscore = df_bertscore.T.reset_index()
df_bertscore.iloc[0,1] = 'dataset'
df_bertscore.columns = df_bertscore.iloc[0,:]
df_bertscore = df_bertscore.iloc[1:,:].rename(columns={"Unnamed: 0":'model'})
df_bertscore = df_bertscore.set_index(['model','dataset'])
df_bertscore = df_bertscore.swaplevel().unstack().loc[['eli5','wiki','full']].stack()
# Drop the hash column(s); only the score columns are averaged.
df_bertscore = df_bertscore[[col for col in df_bertscore.columns if 'hash' not in col]]
# Each cell is a stringified Python literal; ast.literal_eval parses it
# without executing arbitrary code the way eval() would.
df_bertscore = df_bertscore.applymap(ast.literal_eval).applymap(np.mean)
df_bertscore.applymap(lambda x:np.round(x,4))

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1
dataset,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
eli5,Llama-2-13b-hf-eli5-wiki-1024_qlora_merged,0.8414,0.8808,0.8603
wiki,Llama-2-13b-hf-eli5-wiki-1024_qlora_merged,0.779,0.8009,0.7897
full,Llama-2-13b-hf-eli5-wiki-1024_qlora_merged,0.7822,0.8039,0.7928


In [None]:
!pip install textstat --quiet
from textstat import flesch_reading_ease as fre
from textstat import flesch_kincaid_grade as fkg

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/105.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m102.4/105.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/2.0 MB[0m [31m19.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m1.4/2.0 MB[0m [31m21.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.0/2.0 MB[0m [31m23.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m18.1 MB/s[0m eta [

In [None]:
df_predictions.applymap(fre).mean(axis=1)

dataset  model                                     
eli5     Llama-2-13b-hf-eli5-wiki-1024_qlora_merged    69.0032
wiki     Llama-2-13b-hf-eli5-wiki-1024_qlora_merged    69.4186
full     Llama-2-13b-hf-eli5-wiki-1024_qlora_merged    68.6850
dtype: float64

In [None]:
df_predictions.applymap(fkg).mean(axis=1)

dataset  model                                     
eli5     Llama-2-13b-hf-eli5-wiki-1024_qlora_merged    8.006
wiki     Llama-2-13b-hf-eli5-wiki-1024_qlora_merged    7.269
full     Llama-2-13b-hf-eli5-wiki-1024_qlora_merged    7.592
dtype: float64

In [None]:
df_predictions.loc[('full','Llama-2-13b-hf-eli5-wiki-1024_qlora_merged')].to_frame()


Unnamed: 0_level_0,full
Unnamed: 0_level_1,Llama-2-13b-hf-eli5-wiki-1024_qlora_merged
0,Unnamed: 1_level_2
0.0,### Human: What was the purpose of Apollo 10?\...
1.0,"### Human: Who is Hervé Barulea, also known as..."
2.0,### Human: Who was Danny Murphy and what were ...
3.0,### Human: Who was David Azulai and when did h...
4.0,### Human: What is a song and what are some di...
...,...
95.0,### Human: Who is Michael Blunden?\n ### Assis...
96.0,### Human: What is a damson and how is it diff...
97.0,### Human: Who was Nat King Cole?\n ### Assist...
98.0,### Human: What is the official currency of Si...


### Simple merge

In [None]:
import numpy as np

In [None]:
df_predictions = pd.read_csv('./llama-2-7b-inference-simple-merge/predictions.csv')
df_predictions.columns = [col.split('.')[0].split('_1024')[0]
                    for col in df_predictions.columns]
df_predictions = df_predictions.T.reset_index()
df_predictions.iloc[0,1] = 'dataset'
df_predictions.columns = df_predictions.iloc[0,:]
df_predictions = df_predictions.iloc[1:,:].rename(columns={"Unnamed: 0":'model'})
df_predictions = df_predictions.set_index(['model','dataset'])
df_predictions = df_predictions.swaplevel().unstack().loc[['eli5','wiki','full']].stack()
pd.DataFrame(df_predictions.loc[('wiki','llama-7b-SFT_ds_eli5')])

Unnamed: 0_level_0,wiki
Unnamed: 0_level_1,llama-7b-SFT_ds_eli5
0,Unnamed: 1_level_2
0.0,### Human: What is the purpose of the 7.62 x 3...
1.0,### Human: Who is Dave Sims?\n ### Assistant: ...
2.0,### Human: What was the name of the French col...
3.0,### Human: What is the name and some character...
4.0,### Human: What is the Rambla de las Ovejas an...
...,...
95.0,### Human: What happens to the body during sta...
96.0,### Human: Who is Alfred George James Hayes?\n...
97.0,### Human: What is the chemical formula and pr...
98.0,### Human: Who was Mary Holt and what were her...


In [None]:
# Load the simple-merge ROUGE table and reshape it to a (dataset, model) index.
df_rouge = pd.read_csv('./llama-2-7b-inference-simple-merge/rouge.csv')
df_rouge.columns = [col.split('.')[0].split('_1024')[0]
                    for col in df_rouge.columns]
df_rouge = df_rouge.T.reset_index()
df_rouge.iloc[0,1] = 'dataset'
df_rouge.columns = df_rouge.iloc[0,:]
df_rouge = df_rouge.iloc[1:,:].rename(columns={"Unnamed: 0":'model'})
df_rouge = df_rouge.set_index(['model','dataset'])
df_rouge = df_rouge.swaplevel().unstack().loc[['eli5','wiki','full']].stack()
# The scores are read back as numeric strings; parse them with float()
# rather than eval() so file contents are never executed as code.
df_rouge.applymap(lambda x:np.round(float(x),4))

Unnamed: 0_level_0,Unnamed: 1_level_0,rouge1,rouge2,rougeL,rougeLsum
dataset,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
eli5,Llama-2-7b-hf,0.3797,0.2433,0.2987,0.3232
eli5,llama-7b-SFT_ds_eli5,0.3694,0.2087,0.2693,0.2784
eli5,llama-7b-SFT_ds_wiki65k,0.3608,0.2071,0.266,0.2761
eli5,llama-7b-SFT_eli5_wiki65k,0.3666,0.2102,0.2708,0.2794
wiki,Llama-2-7b-hf,0.1923,0.0103,0.0938,0.1394
wiki,llama-7b-SFT_ds_eli5,0.1916,0.0094,0.092,0.1222
wiki,llama-7b-SFT_ds_wiki65k,0.1897,0.0091,0.0899,0.1197
wiki,llama-7b-SFT_eli5_wiki65k,0.1774,0.0081,0.0875,0.1138
full,Llama-2-7b-hf,0.1948,0.0087,0.0905,0.14
full,llama-7b-SFT_ds_eli5,0.1948,0.0097,0.0903,0.1224


In [None]:
import ast

# Load the simple-merge BERTScore table and reshape it to a (dataset, model)
# index.
df_bertscore = pd.read_csv('./llama-2-7b-inference-simple-merge/bertscore.csv')
df_bertscore.columns = [col.split('.')[0].split('_1024')[0]
                    for col in df_bertscore.columns]
df_bertscore = df_bertscore.T.reset_index()
df_bertscore.iloc[0,1] = 'dataset'
df_bertscore.columns = df_bertscore.iloc[0,:]
df_bertscore = df_bertscore.iloc[1:,:].rename(columns={"Unnamed: 0":'model'})
df_bertscore = df_bertscore.set_index(['model','dataset'])
df_bertscore = df_bertscore.swaplevel().unstack().loc[['eli5','wiki','full']].stack()
# Drop the hash column(s); only the score columns are averaged.
df_bertscore = df_bertscore[[col for col in df_bertscore.columns if 'hash' not in col]]
# Each cell is a stringified Python literal; ast.literal_eval parses it
# without executing arbitrary code the way eval() would.
df_bertscore = df_bertscore.applymap(ast.literal_eval).applymap(np.mean)
df_bertscore.applymap(lambda x:np.round(x,4))

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1
dataset,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
eli5,Llama-2-7b-hf,0.8429,0.8754,0.8583
eli5,llama-7b-SFT_ds_eli5,0.841,0.8804,0.8599
eli5,llama-7b-SFT_ds_wiki65k,0.8419,0.8811,0.8607
eli5,llama-7b-SFT_eli5_wiki65k,0.8395,0.8811,0.8594
wiki,Llama-2-7b-hf,0.7879,0.8067,0.797
wiki,llama-7b-SFT_ds_eli5,0.7835,0.8027,0.7929
wiki,llama-7b-SFT_ds_wiki65k,0.7806,0.8018,0.7909
wiki,llama-7b-SFT_eli5_wiki65k,0.7765,0.801,0.7884
full,Llama-2-7b-hf,0.794,0.8092,0.8014
full,llama-7b-SFT_ds_eli5,0.788,0.8055,0.7966


In [None]:
!pip install textstat
from textstat import flesch_reading_ease as fre
from textstat import flesch_kincaid_grade as fkg



In [None]:
df_predictions.applymap(fre).mean(axis=1)

dataset  model                    
eli5     Llama-2-7b-hf                58.2425
         llama-7b-SFT_ds_eli5         70.7408
         llama-7b-SFT_ds_wiki65k      65.6577
         llama-7b-SFT_eli5_wiki65k    63.1168
wiki     Llama-2-7b-hf                64.1040
         llama-7b-SFT_ds_eli5         71.6032
         llama-7b-SFT_ds_wiki65k      70.5064
         llama-7b-SFT_eli5_wiki65k    64.1275
full     Llama-2-7b-hf                65.0452
         llama-7b-SFT_ds_eli5         71.6479
         llama-7b-SFT_ds_wiki65k      69.9379
         llama-7b-SFT_eli5_wiki65k    65.2472
dtype: float64

In [None]:
df_predictions.applymap(fkg).mean(axis=1)

dataset  model                    
eli5     Llama-2-7b-hf                10.075
         llama-7b-SFT_ds_eli5          7.562
         llama-7b-SFT_ds_wiki65k       8.926
         llama-7b-SFT_eli5_wiki65k     9.465
wiki     Llama-2-7b-hf                 8.172
         llama-7b-SFT_ds_eli5          7.114
         llama-7b-SFT_ds_wiki65k       7.371
         llama-7b-SFT_eli5_wiki65k     8.790
full     Llama-2-7b-hf                 8.472
         llama-7b-SFT_ds_eli5          7.223
         llama-7b-SFT_ds_wiki65k       7.587
         llama-7b-SFT_eli5_wiki65k     8.584
dtype: float64

### Predictions old

In [None]:
# Load the combined predictions table for the older 7B runs and reshape it to
# a (dataset, model) index.
# The file is read from ./llama-2-7b-inference/, the same directory the ROUGE
# and BERTScore cells of this section use; the previous path under
# ./llama-2-inference-512/ raised FileNotFoundError.
df_predictions = pd.read_csv('./llama-2-7b-inference/predictions.csv')
df_predictions.columns = [col.split('.')[0].split('_1024')[0]
                    for col in df_predictions.columns]
df_predictions = df_predictions.T.reset_index()
df_predictions.iloc[0,1] = 'dataset'
df_predictions.columns = df_predictions.iloc[0,:]
df_predictions = df_predictions.iloc[1:,:].rename(columns={"Unnamed: 0":'model'})
df_predictions = df_predictions.set_index(['model','dataset'])
df_predictions = df_predictions.swaplevel().unstack().loc[['eli5','wiki','full']].stack()
#pd.DataFrame(df_predictions.loc[('full','llama-7b-SFT_ds_eli5')])

FileNotFoundError: ignored

In [None]:
df_predictions.loc[('full','llama-7b-SFT_eli5_wiki65k')].to_frame()

Unnamed: 0_level_0,full
Unnamed: 0_level_1,llama-7b-SFT_eli5_wiki65k
0,Unnamed: 1_level_2
0.0,### Human: What was the purpose of Apollo 10?\...
1.0,"### Human: Who is Hervé Barulea, also known as..."
2.0,### Human: Who was Danny Murphy and what were ...
3.0,### Human: Who was David Azulai and when did h...
4.0,### Human: What is a song and what are some di...
...,...
95.0,### Human: Who is Michael Blunden?\n ### Assis...
96.0,### Human: What is a damson and how is it diff...
97.0,### Human: Who was Nat King Cole?\n ### Assist...
98.0,### Human: What is the official currency of Si...


In [None]:
# Load the older 7B ROUGE table and reshape it to a (dataset, model) index.
df_rouge = pd.read_csv('./llama-2-7b-inference/rouge.csv')
df_rouge.columns = [col.split('.')[0].split('_1024')[0]
                    for col in df_rouge.columns]
df_rouge = df_rouge.T.reset_index()
df_rouge.iloc[0,1] = 'dataset'
df_rouge.columns = df_rouge.iloc[0,:]
df_rouge = df_rouge.iloc[1:,:].rename(columns={"Unnamed: 0":'model'})
df_rouge = df_rouge.set_index(['model','dataset'])
df_rouge = df_rouge.swaplevel().unstack().loc[['eli5','wiki','full']].stack()
# The scores are read back as numeric strings; parse them with float()
# rather than eval() so file contents are never executed as code.
df_rouge.applymap(lambda x:np.round(float(x),4))

Unnamed: 0_level_0,Unnamed: 1_level_0,rouge1,rouge2,rougeL,rougeLsum
dataset,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
eli5,Llama-2-7b-hf,0.3796,0.2432,0.3,0.3222
eli5,llama-7b-SFT_ds_eli5,0.3701,0.214,0.2736,0.2821
eli5,llama-7b-SFT_ds_wiki65k,0.3575,0.2083,0.266,0.2762
eli5,llama-7b-SFT_eli5_wiki65k,0.3702,0.2126,0.2733,0.2819
wiki,Llama-2-7b-hf,0.1923,0.0103,0.0937,0.1392
wiki,llama-7b-SFT_ds_eli5,0.2203,0.0125,0.0964,0.1271
wiki,llama-7b-SFT_ds_wiki65k,0.1826,0.0073,0.0879,0.1165
wiki,llama-7b-SFT_eli5_wiki65k,0.1811,0.0076,0.0885,0.1153
full,Llama-2-7b-hf,0.1944,0.0087,0.0905,0.14
full,llama-7b-SFT_ds_eli5,0.2243,0.0118,0.0971,0.1312


In [None]:
import ast

# Load the older 7B BERTScore table and reshape it to a (dataset, model) index.
df_bertscore = pd.read_csv('./llama-2-7b-inference/bertscore.csv')
df_bertscore.columns = [col.split('.')[0].split('_1024')[0]
                    for col in df_bertscore.columns]
df_bertscore = df_bertscore.T.reset_index()
df_bertscore.iloc[0,1] = 'dataset'
df_bertscore.columns = df_bertscore.iloc[0,:]
df_bertscore = df_bertscore.iloc[1:,:].rename(columns={"Unnamed: 0":'model'})
df_bertscore = df_bertscore.set_index(['model','dataset'])
df_bertscore = df_bertscore.swaplevel().unstack().loc[['eli5','wiki','full']].stack()
# Drop the hash column(s); only the score columns are averaged.
df_bertscore = df_bertscore[[col for col in df_bertscore.columns if 'hash' not in col]]
# Each cell is a stringified Python literal; ast.literal_eval parses it
# without executing arbitrary code the way eval() would.
df_bertscore = df_bertscore.applymap(ast.literal_eval).applymap(np.mean)
df_bertscore.applymap(lambda x:np.round(x,4))

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1
dataset,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
eli5,Llama-2-7b-hf,0.8429,0.8754,0.8583
eli5,llama-7b-SFT_ds_eli5,0.8372,0.8799,0.8575
eli5,llama-7b-SFT_ds_wiki65k,0.8415,0.8796,0.8598
eli5,llama-7b-SFT_eli5_wiki65k,0.8399,0.8798,0.859
wiki,Llama-2-7b-hf,0.7879,0.8067,0.797
wiki,llama-7b-SFT_ds_eli5,0.7859,0.8093,0.7971
wiki,llama-7b-SFT_ds_wiki65k,0.7782,0.8019,0.7898
wiki,llama-7b-SFT_eli5_wiki65k,0.7783,0.8014,0.7896
full,Llama-2-7b-hf,0.794,0.8092,0.8014
full,llama-7b-SFT_ds_eli5,0.7933,0.8116,0.8022


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/105.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m81.9/105.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.8/2.0 MB[0m [31m25.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
pd.DataFrame(df_predictions.loc[('full','llama-7b-SFT_ds_eli5')])

In [None]:
df_predictions.applymap(fre).mean(axis=1)

dataset  model                    
eli5     Llama-2-7b-hf                58.2425
         llama-7b-SFT_ds_eli5         67.6357
         llama-7b-SFT_ds_wiki65k      58.7145
         llama-7b-SFT_eli5_wiki65k    65.6862
wiki     Llama-2-7b-hf                64.1040
         llama-7b-SFT_ds_eli5         72.5682
         llama-7b-SFT_ds_wiki65k      66.5824
         llama-7b-SFT_eli5_wiki65k    65.7558
full     Llama-2-7b-hf                65.0452
         llama-7b-SFT_ds_eli5         72.5278
         llama-7b-SFT_ds_wiki65k      66.8147
         llama-7b-SFT_eli5_wiki65k    65.8295
dtype: float64

In [None]:
df_predictions.applymap(fkg).mean(axis=1)

dataset  model                    
eli5     Llama-2-7b-hf                10.075
         llama-7b-SFT_ds_eli5          8.148
         llama-7b-SFT_ds_wiki65k      10.526
         llama-7b-SFT_eli5_wiki65k     8.704
wiki     Llama-2-7b-hf                 8.172
         llama-7b-SFT_ds_eli5          6.988
         llama-7b-SFT_ds_wiki65k       8.055
         llama-7b-SFT_eli5_wiki65k     8.239
full     Llama-2-7b-hf                 8.472
         llama-7b-SFT_ds_eli5          7.092
         llama-7b-SFT_ds_wiki65k       8.252
         llama-7b-SFT_eli5_wiki65k     8.417
dtype: float64

## Redo Inference for cleaned ELI5 & ELI5 + Wiki (OLD)

In [None]:
ds_cleaned = {}
ds_cleaned['full'] = datasets.load_from_disk('./data/ds_SFT_cleaned_edits')
ds_cleaned['eli5'] = ds_cleaned['full'].filter(lambda x: x['source']!='simple_wiki')

In [None]:
model_ids = []
model_ids.append((None,'dhmeltzer/Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged'))
model_ids.append((None,'dhmeltzer/Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged'))

In [None]:
generate_df_predictions(model_ids,
                        ds_cleaned,
                        './llama-2-7b-inference-cleaned',
                        batch_size=2,
                        padding=True,
                        predictions_dir= './val_results_new_merge_eli5_cleaned')

In [None]:
from google.colab import runtime
runtime.unassign()

In [None]:
import numpy as np

# Directory holding the result CSVs; also used by the rouge and bertscore cells.
pred_dir = './llama-2-7b-inference-cleaned'

# Load the predictions table and reshape it so rows are indexed by
# (dataset, model).
df_predictions = pd.read_csv(f'./{pred_dir}/predictions.csv')
# Normalise the header: keep the text before the first '.' and before '_1024'.
# NOTE(review): the '.' split presumably strips the '.1', '.2', ... suffixes
# pandas appends to duplicated column names — confirm against the CSV.
df_predictions.columns = [col.split('.')[0].split('_1024')[0]
                    for col in df_predictions.columns]
# Transpose so every former column becomes a row; reset_index moves the former
# column labels into the first column.
df_predictions = df_predictions.T.reset_index()
# Row 0 is the former "Unnamed: 0" column. Overwrite its first value so that,
# once row 0 is promoted to the header, the second column is named 'dataset'.
df_predictions.iloc[0,1] = 'dataset'
df_predictions.columns = df_predictions.iloc[0,:]
# Drop the promoted row and give the label column a meaningful name.
df_predictions = df_predictions.iloc[1:,:].rename(columns={"Unnamed: 0":'model'})
df_predictions = df_predictions.set_index(['model','dataset'])
# Reorder the index to (dataset, model) and keep the 'eli5' and 'full'
# datasets, in that order.
df_predictions = df_predictions.swaplevel().unstack().loc[['eli5','full']].stack()

# Display the row for one (dataset, model) pair as a single-column frame.
pd.DataFrame(df_predictions.loc[
    ('eli5',
     'Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged')])

In [None]:
# Load the ROUGE table and reshape it so rows are indexed by (dataset, model).
# `pred_dir` comes from the predictions cell above.
df_rouge = pd.read_csv(f'./{pred_dir}/rouge.csv')
# Normalise the header: keep the text before the first '.' and before '_1024'.
# NOTE(review): the '.' split presumably strips the '.1', '.2', ... suffixes
# pandas appends to duplicated column names — confirm against the CSV.
df_rouge.columns = [col.split('.')[0].split('_1024')[0]
                    for col in df_rouge.columns]
# Transpose so every former column becomes a row; reset_index moves the former
# column labels into the first column.
df_rouge = df_rouge.T.reset_index()
# Row 0 is the former "Unnamed: 0" column. Overwrite its first value so that,
# once row 0 is promoted to the header, the second column is named 'dataset'.
df_rouge.iloc[0,1] = 'dataset'
df_rouge.columns = df_rouge.iloc[0,:]
# Drop the promoted row and give the label column a meaningful name.
df_rouge = df_rouge.iloc[1:,:].rename(columns={"Unnamed: 0":'model'})
df_rouge = df_rouge.set_index(['model','dataset'])
# Reorder the index to (dataset, model) and keep the 'eli5' and 'full'
# datasets, in that order.
df_rouge = df_rouge.swaplevel().unstack().loc[['eli5','full']].stack()
df_rouge

In [None]:
# Load the BERTScore table and reshape it so rows are indexed by
# (dataset, model). `pred_dir` comes from the predictions cell above.
df_bertscore = pd.read_csv(f'./{pred_dir}/bertscore.csv')
# Normalise the header: keep the text before the first '.' and before '_1024'.
# NOTE(review): the '.' split presumably strips the '.1', '.2', ... suffixes
# pandas appends to duplicated column names — confirm against the CSV.
df_bertscore.columns = [col.split('.')[0].split('_1024')[0]
                    for col in df_bertscore.columns]
# Transpose so every former column becomes a row; reset_index moves the former
# column labels into the first column.
df_bertscore = df_bertscore.T.reset_index()
# Row 0 is the former "Unnamed: 0" column. Overwrite its first value so that,
# once row 0 is promoted to the header, the second column is named 'dataset'.
df_bertscore.iloc[0,1] = 'dataset'
df_bertscore.columns = df_bertscore.iloc[0,:]
# Drop the promoted row and give the label column a meaningful name.
df_bertscore = df_bertscore.iloc[1:,:].rename(columns={"Unnamed: 0":'model'})
df_bertscore = df_bertscore.set_index(['model','dataset'])
# Reorder the index to (dataset, model) and keep the 'eli5' and 'full'
# datasets, in that order.
df_bertscore = df_bertscore.swaplevel().unstack().loc[['eli5','full']].stack()
# Discard every column whose name contains 'hash'.
df_bertscore = df_bertscore[[col for col in df_bertscore.columns if 'hash' not in col]]
# Parse each cell back into a Python object, then average it with np.mean
# (`np` is imported in the predictions cell above).
# NOTE(review): cells are presumably stringified lists of per-example scores —
# confirm against the CSV.
# NOTE(review): eval() executes whatever expression the CSV contains. That is
# acceptable only while bertscore.csv is produced by this notebook; if the file
# can come from elsewhere, parse with ast.literal_eval instead.
df_bertscore.applymap(eval).applymap(np.mean)

In [None]:
# Score every prediction with `fre`, then average across columns so that each
# (dataset, model) row collapses to a single mean value.
# NOTE(review): `fre` is defined outside this section — presumably a Flesch
# reading-ease scorer; confirm.
# NOTE(review): DataFrame.applymap is deprecated from pandas 2.1 in favour of
# DataFrame.map; switch once the environment's pandas version allows it.
df_predictions.applymap(fre).mean(axis=1)

dataset  model                                               
eli5     Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged            65.5466
         Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged    64.8685
full     Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged            66.0918
         Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged    68.1486
dtype: float64

In [None]:
# Score every prediction with `fkg`, then average across columns so that each
# (dataset, model) row collapses to a single mean value.
# NOTE(review): `fkg` is defined outside this section — presumably a
# Flesch-Kincaid grade scorer; confirm.
# NOTE(review): DataFrame.applymap is deprecated from pandas 2.1 in favour of
# DataFrame.map; switch once the environment's pandas version allows it.
df_predictions.applymap(fkg).mean(axis=1)

dataset  model                                               
eli5     Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged            8.897
         Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged    8.815
full     Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged            8.841
         Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged    7.926
dtype: float64

## combining datasets

In [None]:
# Base checkpoint on the Hugging Face Hub; model_name is the repo name without
# the organisation prefix.
model_id = "meta-llama/Llama-2-7b-hf"
model_name = model_id.rsplit('/', 1)[-1]

# Load the matching tokenizer and register '[PAD]' as its padding token.
# The return value of add_special_tokens is left as the cell's output.
llama_tokenizer = AutoTokenizer.from_pretrained(model_id)
llama_tokenizer.add_special_tokens(dict(pad_token='[PAD]'))

In [None]:
# Two passes over SFT_QA_dataset (defined outside this section):
#   1. tokenize the 'QA' text of each example with the Llama tokenizer;
#   2. record the number of tokens of each example in a 'length' column.
# The result is then written to disk so later cells can reload it.
SFT_QA_dataset_llama = (
    SFT_QA_dataset
    .map(lambda example: llama_tokenizer(example['QA']))
    .map(lambda example: {'length': len(example['input_ids'])})
)

SFT_QA_dataset_llama.save_to_disk('../data/SFT_QA_dataset_llama')

In [None]:
# Upload the tokenized QA dataset directory to Weights & Biases as a
# 'dataset'-type artifact. wandb.init is used as a context manager so the run
# is closed when the block exits.
with wandb.init(
    project='ELI5_analysis',
    entity='ft-llmmm',
    job_type='upload_data',
    # Plain literal: the original f-string had no placeholders to interpolate.
    name='llama_QA_tokenized_dataset_clean',
) as run:

    # Keyword arguments make explicit which string is the artifact name and
    # which is its type.
    clean_data_art = wandb.Artifact(name='llama_QA_tokenized', type='dataset')
    # Attach the dataset directory written by save_to_disk.
    clean_data_art.add_dir('../data/SFT_QA_dataset_llama')
    run.log_artifact(clean_data_art)

In [None]:
# Reload the tokenized QA dataset from the directory written by save_to_disk.
ds_llama = datasets.load_from_disk('../data/SFT_QA_dataset_llama')

In [None]:
# Reload the tokenized QA dataset under its original variable name, so the
# filtering cells below can run without redoing tokenization.
# NOTE(review): this reads the same directory as `ds_llama` in the previous
# cell; one of the two loads looks redundant — confirm which name is still used.
SFT_QA_dataset_llama = datasets.load_from_disk('../data/SFT_QA_dataset_llama')

In [None]:
# Length-capped views of the tokenized dataset: keep only the examples whose
# precomputed 'length' is at most 1024 or at most 2048, respectively.
SFT_QA_dataset_llama_1024 = SFT_QA_dataset_llama.filter(
    lambda example: example['length'] <= 1024
)
SFT_QA_dataset_llama_2048 = SFT_QA_dataset_llama.filter(
    lambda example: example['length'] <= 2048
)

In [None]:
# Persist each length-capped dataset to its own directory, 1024 first.
for _subset, _target_dir in (
    (SFT_QA_dataset_llama_1024, '../data/llama_tokenized_1024'),
    (SFT_QA_dataset_llama_2048, '../data/llama_tokenized_2048'),
):
    _subset.save_to_disk(_target_dir)

In [None]:
# Upload both length-filtered dataset directories to Weights & Biases within a
# single run, each as its own 'dataset'-type artifact.
with wandb.init(project='ELI5_analysis',
                entity='ft-llmmm',
                job_type='upload_data',
                name=f'llama_QA_tokenized_dataset_clean_short') as run:

    # Directory saved from the length <= 1024 filter.
    clean_data_art_1024 = wandb.Artifact('llama_QA_tokenized_1024', 'dataset')
    clean_data_art_1024.add_dir('../data/llama_tokenized_1024')
    run.log_artifact(clean_data_art_1024)

    # Directory saved from the length <= 2048 filter.
    clean_data_art_2048 = wandb.Artifact('llama_QA_tokenized_2048', 'dataset')
    clean_data_art_2048.add_dir('../data/llama_tokenized_2048')
    run.log_artifact(clean_data_art_2048)