# Dependencies

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/LLMs/Fine-tuning/SFT

# installations

!pip install bitsandbytes==0.41.1
!pip install safetensors>=0.3.1
!pip install trl
!pip install wandb
!pip install tokenizers>=0.13.3
!pip install accelerate==0.21.0
!pip install datasets
!pip install -U torch
!pip install evaluate
!pip install rouge_score
!pip install nltk
!pip install bert_score
!pip install huggingface_hub
!pip install textstat --quiet
!pip install openai


!pip install git+https://github.com/huggingface/peft.git
!pip install git+https://github.com/huggingface/transformers.git

In [68]:
from huggingface_hub import login  # Import login function from huggingface_hub
from collections import defaultdict  # Import defaultdict from collections module
import transformers  # Import transformers library
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM)
from tqdm import tqdm  # Import tqdm for progress bars
from peft import PeftModel  # Import PeftModel from peft library
import pickle  # Import pickle for serialization and deserialization
import os  # Import os for file operations
import pandas as pd  # Import pandas for data manipulation
from transformers import pipeline  # Import pipeline from transformers library
import numpy as np # import numpy using standard alias.
from textstat import flesch_reading_ease as fre # fre and fkg are shorthands for the Flesch Readability Ease
from textstat import flesch_kincaid_grade as fkg # and Flesch-Kincaid readability, respectively.
from pprint import pprint # used to display text in a more readable format.
import openai # Used to access GPT-4, which will judge the model's responses.
import time
import torch
from getpass import getpass
import evaluate
import wandb
import datasets

# Computing Predictions

In [None]:
def inference_formatting(example):
    """
    Wrap a raw question in the "### Human / ### Assistant" prompt template.

    Args:
        example (str): The question to place in the human turn.

    Returns:
        str: The prompt, ending with the assistant header so that the model
            continues with its answer.

    Example:
        >>> inference_formatting("How does photosynthesis work?")
        '### Human: How does photosynthesis work?\\n ### Assistant:'
    """
    template = "### Human: {}\n ### Assistant:"
    return template.format(example)

def generate_examples(model,
                      tokenizer,
                      data,
                      num_beams=1,
                      do_sample=True,
                      temperature=0.6,
                      top_p=0.9,
                      repetition_penalty=1.2,
                      padding=True,
                      max_new_tokens=512):
    """
    Generates responses using a given model and tokenizer with configurable generation settings.

    Args:
        model (PreTrainedModel): The pre-trained model for generation.
        tokenizer (PreTrainedTokenizerBase): The tokenizer for encoding and decoding text.
        data (dict): The input batch; its 'prompt' entry holds the list of prompts.
        num_beams (int, optional): Number of beams for beam search. Default is 1.
        do_sample (bool, optional): Whether to use sampling for generation. Default is True.
        temperature (float, optional): The temperature for sampling. Default is 0.6.
        top_p (float, optional): Top p value for nucleus sampling. Default is 0.9.
        repetition_penalty (float, optional): Penalty for generating repeating tokens. Default is 1.2.
        padding (bool, optional): Whether to apply padding during tokenization. Default is True.
        max_new_tokens (int, optional): Maximum number of tokens to generate. Default is 512.

    Returns:
        list of str: One decoded string per prompt (empty list for an empty batch).
            The full output sequence is decoded, special tokens removed.

    Example:
        >>> generate_examples(model, tokenizer, {'prompt': ['How does photosynthesis work?']})
        ["### Human: How does photosynthesis work? ..."]
    """
    prompts = data['prompt']
    # Nothing to generate for an empty batch; the tokenizer would fail on it.
    if len(prompts) == 0:
        return []

    generation_config = transformers.GenerationConfig(
        num_beams=num_beams,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        pad_token_id=model.config.eos_token_id
    )

    # Send the inputs to the device the model lives on rather than assuming
    # CUDA, so the function also works on CPU or on a non-default device.
    device = getattr(model, 'device', None)
    if device is None:
        device = next(model.parameters()).device

    encoded = tokenizer(prompts, return_tensors='pt', padding=padding).to(device)
    output_ids = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        generation_config=generation_config,
    )

    predictions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]

    return predictions

def _build_eval_subsets(ds, seed, size):
    """Return {ds_name: validation subset} with a 'prompt' column, shuffled and capped at `size` rows."""
    subsets = {}
    for ds_name in ds:
        subset = ds[ds_name]['validation'].map(
            lambda x: {'prompt': inference_formatting(x['question'])})
        subset = subset.shuffle(seed=seed)
        # Cap at the split length so a validation split smaller than `size` does not raise.
        subsets[ds_name] = subset.select(range(min(size, len(subset))))
    return subsets


def _load_model_and_tokenizer(base_model, model_id):
    """Load the tokenizer and the (optionally adapter-wrapped) causal LM for `model_id`, in eval mode."""
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Set pad_token to eos token if it does not exist.
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token
    # Set padding side to left since we are doing inference.
    # Setting padding to right will hurt model's performance.
    tokenizer.padding_side = "left"

    if base_model:
        # A base model was given: load it, then attach the adapter layers.
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            device_map="auto",
            torch_dtype=torch.bfloat16
        )
        model = PeftModel.from_pretrained(model=model,
                                          model_id=model_id,
                                          torch_dtype=torch.bfloat16,
                                          is_trainable=False)
    else:
        # No base model given: load the model directly.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )

    # Ensure model is in inference mode.
    model.eval()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Best effort: some models (e.g. bitsandbytes-quantized ones) cannot be moved.
    # Only ordinary errors are tolerated, so Ctrl-C still interrupts the run.
    try:
        model.to(device)
    except Exception as err:
        print(f'could not move model to {device}; leaving it where it was loaded ({err})')

    return model, tokenizer


def generate_df_predictions(model_ids,
                            ds,
                            output_dir,
                            batch_size=16,
                            seed=50,
                            size=100,
                            padding=True,
                            predictions_dir='./inference_results'):
    """
    Generates and evaluates predictions for a set of models on specified datasets.

    For every model and dataset, prompts built from the 'question' column of the
    'validation' split are run through the model in batches. Predictions are
    pickled to `predictions_dir` after every batch; on a later call, complete
    pickles are reused and partial ones are resumed. Afterwards ROUGE and
    BERTScore are computed against the 'QA' column of the matching subset and
    written, together with the predictions, as CSV files to `output_dir`.

    Args:
        model_ids (list): List of (base_model, model_id) tuples. `base_model` may be
            None, in which case `model_id` is loaded directly.
        ds (dict): Dictionary of datasets, each with a 'validation' split.
        output_dir (str): Directory to save the CSV files (predictions, ROUGE, BERTScore).
        batch_size (int, optional): Batch size for prediction. Default is 16.
        seed (int, optional): Seed used to shuffle the validation split. Default is 50.
        size (int, optional): Maximum number of validation rows to evaluate. Default is 100.
        padding (bool, optional): Whether to apply padding during tokenization. Default is True.
        predictions_dir (str, optional): Directory to save pickled prediction results. Default is './inference_results'.

    Returns:
        None

    Example:
        >>> generate_df_predictions(model_ids, datasets, 'output_dir')
    """

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(predictions_dir, exist_ok=True)
    rouge = evaluate.load('rouge')  # Load ROUGE evaluation tool
    bertscore = evaluate.load("bertscore")  # Load BERTScore evaluation tool

    # One evaluation subset per dataset, shared by all models. Keeping them keyed
    # by dataset name lets the metrics below use the references that belong to
    # each set of predictions.
    ds_small = _build_eval_subsets(ds, seed, size)
    # predictions[(model_name, ds_name)] is the list of generated texts.
    predictions = defaultdict(list)

    for base_model, model_id in model_ids:
        model_name = model_id.split('/')[-1]
        print(f'working on model {model_name}')

        # Load cached predictions and collect the datasets that still need inference.
        pending = []
        for ds_name in ds:
            file_pkl = f'{predictions_dir}/{model_name}_{ds_name}.pkl'
            if os.path.exists(file_pkl):
                # NOTE: pickle.load can execute arbitrary code; only point
                # predictions_dir at files written by this notebook.
                with open(file_pkl, 'rb') as f:
                    predictions[model_name, ds_name] = pickle.load(f)
            # A pickle left behind by an interrupted run is shorter than the subset.
            if len(predictions[model_name, ds_name]) < len(ds_small[ds_name]):
                pending.append(ds_name)

        # Everything is cached: no need to load the model at all.
        if not pending:
            continue

        model, tokenizer = _load_model_and_tokenizer(base_model, model_id)

        for ds_name in pending:
            print(f'working on dataset {ds_name}')

            subset = ds_small[ds_name]
            preds = predictions[model_name, ds_name]
            file_pkl = f'{predictions_dir}/{model_name}_{ds_name}.pkl'

            # Batched inference, resuming after any predictions that already exist.
            for k in tqdm(range(len(preds), len(subset), batch_size)):
                preds.extend(generate_examples(model, tokenizer, subset[k:k + batch_size], padding=padding))

                # Checkpoint after every batch so an interruption loses little work.
                with open(file_pkl, 'wb') as f:
                    pickle.dump(preds, f)

        # Release the model before loading the next one.
        del model, tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # For each model and dataset, we compute the ROUGE and BERTScore.
    # NOTE(review): metric CSVs that already exist are not recomputed. Files written
    # by an earlier version that scored every dataset against the last dataset's
    # references should be deleted so they are regenerated.
    for model_name, ds_name in predictions:
        print(f'computing predictions for {(model_name, ds_name)}')

        preds = predictions[(model_name, ds_name)]

        # Output for predictions, ROUGE scores and BERTScores.
        preds_file = output_dir + f'/{model_name}_{ds_name}_predictions.csv'
        rouge_file = output_dir + f'/{model_name}_{ds_name}_rouge.csv'
        bertscore_file = output_dir + f'/{model_name}_{ds_name}_bertscore.csv'

        need_rouge = not os.path.exists(rouge_file)
        need_bertscore = not os.path.exists(bertscore_file)

        if need_rouge or need_bertscore:
            # References come from the subset of the same dataset as the predictions.
            references = ds_small[ds_name]['QA']

            if need_rouge:
                rouge_scores = rouge.compute(predictions=preds, references=references)
                pd.DataFrame(rouge_scores, index=[0]).to_csv(rouge_file)

            if need_bertscore:
                bert_scores = bertscore.compute(predictions=preds,
                                                references=references,
                                                lang='en')
                pd.DataFrame(bert_scores).to_csv(bertscore_file)

        # Convert predictions to Pandas Dataframe and save.
        df_preds = pd.DataFrame(preds)
        df_preds.to_csv(preds_file, index=True)


In [None]:
# Fetch the dataset artifact from Weights & Biases.

with wandb.init(
    project='SFT_training_DM',
    entity='ft-llmmm',
    job_type='download_data',
    name='download_combined_data',
) as run:
    artifact = run.use_artifact(
        'ft-llmmm/ELI5_analysis/llama_QA_tokenized_1024:v1',
        type='dataset',
    )
    artifact_dir = artifact.download()

# Three views of the data: 'full' is the combined SFT dataset, 'wiki' keeps only
# the rows whose source is 'simple_wiki' (Simple Wikipedia), and 'eli5' keeps
# all remaining rows (ELI5).
ds = {'full': datasets.load_from_disk(artifact_dir)}
ds['wiki'] = ds['full'].filter(lambda row: row['source'] == 'simple_wiki')
ds['eli5'] = ds['full'].filter(lambda row: row['source'] != 'simple_wiki')

In [None]:
# Inference for the original Llama-2-7B model and our 5 fine-tuned 7B models.
# Naming: "eli5-cleaned" models were trained on ELI5 with posts containing "edit:" removed;
# plain "eli5" models were trained with those posts left in.
# Models trained on the uncleaned data are more likely to end an answer with
# something like "edit: caught typos", which we want to avoid.
# Each entry is (base_model, model_id); base_model is None because the checkpoints are loaded directly.

model_ids = [
    (None, 'meta-llama/Llama-2-7b-hf'),

    (None, 'dhmeltzer/llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged'),
    (None, 'dhmeltzer/llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged'),
    (None, 'dhmeltzer/llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged'),

    (None, 'dhmeltzer/Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged'),
    (None, 'dhmeltzer/Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged'),
]

# Run inference for all models.
generate_df_predictions(
    model_ids,
    ds,
    './llama-2-inference-512',
    batch_size=8,
    padding=True,
    predictions_dir='./val_results_512',
)

In [None]:
# Inference for the original Llama-2-13B model and the fine-tuned 13B models,
# using a smaller batch size than for the 7B models.
# Each entry is (base_model, model_id); base_model is None because the checkpoints are loaded directly.

model_ids = [
    (None, 'meta-llama/Llama-2-13b-hf'),

    (None, 'dhmeltzer/Llama-2-13b-hf-ds_wiki_1024_full_r_64_alpha_16_merged'),
    (None, 'dhmeltzer/Llama-2-13b-hf-ds_eli5_1024_r_64_alpha_16_merged'),
    (None, 'dhmeltzer/Llama-2-13b-hf-eli5-wiki-1024_r_64_alpha_16_merged'),

    (None, 'dhmeltzer/Llama-2-13b-hf-eli5-cleaned-wiki65k-1024_qlora_merged'),
    (None, 'dhmeltzer/Llama-2-13b-hf-eli5-cleaned-1024_qlora_merged'),
]

generate_df_predictions(
    model_ids,
    ds,
    './llama-2-inference-512',
    batch_size=4,
    padding=True,
    predictions_dir='./val_results_512',
)

# Analyzing Results

## Definitions

The functions defined below are used to clean up the Pandas dataframes so they are easier to read and understand.

In [75]:
def fix_df_predictions(model_names,
                       pred_directory='llama-2-inference-512',
                       ds_names=('full', 'wiki', 'eli5')):
    """
    Concatenates and formats prediction data from multiple models and datasets.

    Reads './{pred_directory}/{model_name}_{ds_name}_predictions.csv' for every
    model/dataset pair, transposes it so that each file becomes one row (one
    column per sample), and stacks the rows.

    Args:
        model_names (list): List of model names.
        pred_directory (str, optional): Directory containing prediction files. Default is 'llama-2-inference-512'.
        ds_names (iterable of str, optional): Dataset names to load. Default is ('full', 'wiki', 'eli5').

    Returns:
        pd.DataFrame: Predictions indexed by ('model_name', 'dataset'). Empty (with
            that index) when there is nothing to load.

    Example:
        >>> fix_df_predictions(['model1', 'model2'])
    """
    # Collect the per-file frames and concatenate once; growing a DataFrame
    # inside the loop copies all previous rows on every iteration.
    frames = []

    for model_name in model_names:
        for ds_name in ds_names:
            predictions_file = f'./{pred_directory}/{model_name}_{ds_name}_predictions.csv'

            # The CSV has one row per sample; transpose to one row per file.
            temp = pd.read_csv(predictions_file, index_col='Unnamed: 0').T
            temp['model_name'] = model_name
            temp['dataset'] = ds_name

            frames.append(temp)

    if not frames:
        empty_index = pd.MultiIndex.from_arrays([[], []], names=['model_name', 'dataset'])
        return pd.DataFrame(index=empty_index)

    # Set the model name and dataset as the multi-index.
    return pd.concat(frames).set_index(['model_name', 'dataset'])

def fix_df_metric(model_names, metric, pred_directory='llama-2-inference-512',
                  ds_names=('full', 'wiki', 'eli5')):
    """
    Concatenates and formats metric data from multiple models and datasets.

    Reads './{pred_directory}/{model_name}_{ds_name}_{metric}.csv' for every
    model/dataset pair. ROUGE files are used as stored; BERTScore files are
    reduced to the mean of their 'precision', 'recall' and 'f1' columns.

    Args:
        model_names (list): List of model names.
        metric (str): Name of the metric (either 'bertscore' or 'rouge').
        pred_directory (str, optional): Directory containing metric files. Default is 'llama-2-inference-512'.
        ds_names (iterable of str, optional): Dataset names to load. Default is ('full', 'wiki', 'eli5').

    Returns:
        pd.DataFrame: Metric values indexed by ('model_name', 'dataset'). Empty (with
            that index) when there is nothing to load.

    Raises:
        ValueError: If metric is not 'bertscore' or 'rouge'.

    Example:
        >>> fix_df_metric(['model1', 'model2'], 'bertscore')
    """
    # Validate before touching the file system.
    if metric not in ['bertscore', 'rouge']:
        raise ValueError('metric must be either bertscore or rouge')

    # Collect the per-file frames and concatenate once; growing a DataFrame
    # inside the loop copies all previous rows on every iteration.
    frames = []

    for model_name in model_names:
        for ds_name in ds_names:
            file_name = f'./{pred_directory}/{model_name}_{ds_name}_{metric}.csv'

            temp = pd.read_csv(file_name, index_col='Unnamed: 0')

            # BERTScore is stored per sample: average precision, recall and f1
            # into a single row.
            if metric == 'bertscore':
                temp = pd.DataFrame(temp[['precision', 'recall', 'f1']].mean()).T

            temp['model_name'] = model_name
            temp['dataset'] = ds_name

            frames.append(temp)

    if not frames:
        empty_index = pd.MultiIndex.from_arrays([[], []], names=['model_name', 'dataset'])
        return pd.DataFrame(index=empty_index)

    # Set the model name and dataset as the index.
    return pd.concat(frames).set_index(['model_name', 'dataset'])


## Automatic Metrics

In this section we will use automatic metrics to measure our models. Specifically, we will use ROUGE and BERTScore to evaluate how similar the model's answers are to the original human's answer. We will also use the Flesch readability metrics to see which answers are "simpler".

In [55]:
# Names of the 7B models whose results we analyze (base model first).
model_names_7B = [
    'Llama-2-7b-hf',

    'llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged',
    'llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged',
    'llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged',

    'Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged',
    'Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged',
]

df_predictions_7B = fix_df_predictions(model_names_7B)
df_bertscore_7B = fix_df_metric(model_names_7B,'bertscore')
# ROUGE gets its own variable; it used to be assigned to df_bertscore_7B,
# which discarded the BERTScore table loaded on the previous line.
df_rouge_7B = fix_df_metric(model_names_7B,'rouge')

In [87]:
# Below we see that the trained models generally have a lower Flesch-Kincaid grade level than the original model.
# The two exceptions are the model trained on ELI5-cleaned and the model trained on Simple Wikipedia,
# both evaluated on the ELI5 dataset.
df_predictions_7B.applymap(fkg).mean(axis=1).to_frame('fkg').swaplevel('model_name','dataset').sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,fkg
dataset,model_name,Unnamed: 2_level_1
eli5,Llama-2-7b-hf,10.799
eli5,Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged,11.015
eli5,Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged,9.635
eli5,llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged,8.509
eli5,llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged,13.777
eli5,llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged,9.237
full,Llama-2-7b-hf,11.636
full,Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged,11.27
full,Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged,10.054
full,llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged,7.855


In [86]:
# Below we see that the trained models have a higher Flesch reading-ease score than the original model.
# The one exception is the model trained on just Simple Wikipedia, which has a lower score on the ELI5 dataset.

df_predictions_7B.applymap(fre).mean(axis=1).to_frame('fre').swaplevel('model_name','dataset').sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,fre
dataset,model_name,Unnamed: 2_level_1
eli5,Llama-2-7b-hf,52.677
eli5,Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged,55.2385
eli5,Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged,61.0496
eli5,llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged,65.001
eli5,llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged,44.7725
eli5,llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged,61.9205
full,Llama-2-7b-hf,48.902
full,Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged,54.0539
full,Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged,57.969
full,llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged,68.6513


In [58]:
# Names of the 13B models whose results we analyze (base model first).
model_names_13B = [
    'Llama-2-13b-hf',
    'Llama-2-13b-hf-ds_eli5_1024_r_64_alpha_16_merged',
    'Llama-2-13b-hf-ds_wiki_1024_full_r_64_alpha_16_merged',
    'Llama-2-13b-hf-eli5-cleaned-1024_qlora_merged',
    'Llama-2-13b-hf-eli5-cleaned-wiki65k-1024_qlora_merged',
    'Llama-2-13b-hf-eli5-wiki-1024_r_64_alpha_16_merged',
]

df_predictions_13B = fix_df_predictions(model_names_13B)
df_bertscore_13B = fix_df_metric(model_names_13B,'bertscore')
# ROUGE gets its own variable; it used to be assigned to df_bertscore_13B,
# which discarded the BERTScore table loaded on the previous line.
df_rouge_13B = fix_df_metric(model_names_13B,'rouge')

In [85]:
# For the 13B models we see more cases where the fine-tuned models have a higher Flesch-Kincaid grade level
# than the original model. These cases are:
#   1) The models trained on just ELI5 or just Simple Wikipedia, evaluated on the ELI5 validation set.
#   2) The models trained on just Simple Wikipedia or ELI5-cleaned, evaluated on the full dataset.
#   3) The model trained on just Simple Wikipedia, evaluated on the Simple Wikipedia validation set.

df_predictions_13B.applymap(fkg).mean(axis=1).to_frame('fkg').swaplevel('model_name','dataset').sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,fkg
dataset,model_name,Unnamed: 2_level_1
eli5,Llama-2-13b-hf,9.932
eli5,Llama-2-13b-hf-ds_eli5_1024_r_64_alpha_16_merged,10.142
eli5,Llama-2-13b-hf-ds_wiki_1024_full_r_64_alpha_16_merged,13.763
eli5,Llama-2-13b-hf-eli5-cleaned-1024_qlora_merged,8.907
eli5,Llama-2-13b-hf-eli5-cleaned-wiki65k-1024_qlora_merged,9.298
eli5,Llama-2-13b-hf-eli5-wiki-1024_r_64_alpha_16_merged,8.433
full,Llama-2-13b-hf,10.028
full,Llama-2-13b-hf-ds_eli5_1024_r_64_alpha_16_merged,8.895
full,Llama-2-13b-hf-ds_wiki_1024_full_r_64_alpha_16_merged,11.296
full,Llama-2-13b-hf-eli5-cleaned-1024_qlora_merged,9.339


In [88]:
# Here we see that most fine-tuned 13B models have a higher Flesch reading-ease score than the original model.
# The exception is the model trained on just the Simple Wikipedia dataset, on all three evaluation datasets.

df_predictions_13B.applymap(fre).mean(axis=1).to_frame('fre').swaplevel('model_name','dataset').sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,fre
dataset,model_name,Unnamed: 2_level_1
eli5,Llama-2-13b-hf,55.9093
eli5,Llama-2-13b-hf-ds_eli5_1024_r_64_alpha_16_merged,55.9616
eli5,Llama-2-13b-hf-ds_wiki_1024_full_r_64_alpha_16_merged,43.5825
eli5,Llama-2-13b-hf-eli5-cleaned-1024_qlora_merged,62.4666
eli5,Llama-2-13b-hf-eli5-cleaned-wiki65k-1024_qlora_merged,61.6679
eli5,Llama-2-13b-hf-eli5-wiki-1024_r_64_alpha_16_merged,64.5636
full,Llama-2-13b-hf,55.5964
full,Llama-2-13b-hf-ds_eli5_1024_r_64_alpha_16_merged,63.1428
full,Llama-2-13b-hf-ds_wiki_1024_full_r_64_alpha_16_merged,52.3647
full,Llama-2-13b-hf-eli5-cleaned-1024_qlora_merged,61.2361


## GPT-4 As Judge

In this section we will use GPT-4 as a judge to score each model's answer. Inspired by [MT-bench](https://arxiv.org/abs/2306.05685) we use the following prompt:



> [Instruction]\nPlease act as an impartial judge and
    evaluate the quality of the response provided by an AI assistant
    to the user question displayed below. Your evaluation should consider
    factors such as the following:\n\n- **Simplicity**: Is the response
    clear and straightforward enough for middle or high school students to
    understand?\n- **Helpfulness**: Does the response effectively address
    the query?\n- **Relevance**: Does the response directly pertain to the
    question?\n- **Accuracy**: Is the response factually correct?\n-
    **Depth & Creativity**: Assess richness without favoring excessive
    detail.\n\nAim for a holistic assessment. Begin your evaluation by
    providing a short explanation. Be as objective as possible. After
    providing your explanation, you must rate the response on a scale
    of 1 to 10 by strictly following this format: \'[[rating]]\',
    for example: \'Rating: [[5]]\'.\n\n[Question]\n{question}\n\n
    [The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's
    Answer]

This prompt was chosen to emphasize both helpfulness, i.e. that the model actually answers the question accurately, and simplicity, i.e. that the answer is simple enough for students to understand.

In [None]:
# To access GPT-4, we need an OpenAI access token.
os.environ['OPENAI_API_KEY'] = getpass('Enter OpenAI access token: ')
# The pre-1.0 openai client reads OPENAI_API_KEY only once, when the package is
# imported. `openai` is imported at the top of this notebook, before the variable
# is set, so the key must also be handed to the library explicitly.
openai.api_key = os.environ['OPENAI_API_KEY']

Installing collected packages: openai
Successfully installed openai-0.28.0


In [70]:
def find_rating(string):
    """
    Find the rating value from a string.

    The rating is the run of digits that follows a '[[' marker, so multi-digit
    ratings such as '[[10]]' are returned in full. Whitespace between the
    marker and the digits is ignored. If several '[[' markers are present, the
    first one that is followed by digits wins.

    Args:
        string (str): Input string containing the rating.

    Returns:
        str: The extracted rating. If no '[[' marker is followed by digits, the
            single character after the first marker is returned.

    Raises:
        ValueError: If the string contains no '[[' marker.
        IndexError: If no marker is followed by digits and nothing follows the first marker.

    Example:
        >>> find_rating("Rating: [[5]]")
        '5'
        >>> find_rating("Rating: [[10]]")
        '10'
    """
    first = string.index('[[')  # ValueError when there is no marker at all

    idx = first
    while idx != -1:
        digits = ''
        for ch in string[idx + 2:].lstrip():
            # ASCII digits only, so the result is always accepted by int().
            if ch not in '0123456789':
                break
            digits += ch
        if digits:
            return digits
        idx = string.find('[[', idx + 1)

    # No numeric rating found: fall back to the character after the first marker.
    return string[first + 2]

def get_GPT_judgement(QA_pair,
                      model='gpt-4',
                      temperature=0,
                      max_tokens=2048,
                      system_message='You are a helpful assistant.',
                      API_MAX_RETRY=16,
                      API_RETRY_SLEEP=10,
                      API_ERROR_OUTPUT="$ERROR$"):
    """
    Ask an OpenAI chat model to judge one question/answer pair.

    The text between the first and second '### Human:' markers is split at
    '### Assistant:'; the first piece is the question and the second piece the
    answer. Both are inserted into the judging prompt.

    Args:
        QA_pair (str): Text formatted as '### Human: ... ### Assistant: ...'.
        model (str, optional): OpenAI chat model name. Defaults to 'gpt-4'.
        temperature (int, optional): Sampling temperature of the judge. Defaults to 0.
        max_tokens (int, optional): Maximum number of tokens in the judgement. Defaults to 2048.
        system_message (str, optional): System message of the conversation. Defaults to 'You are a helpful assistant.'.
        API_MAX_RETRY (int, optional): Maximum number of API attempts. Defaults to 16.
        API_RETRY_SLEEP (int, optional): Seconds to sleep after a failed attempt. Defaults to 10.
        API_ERROR_OUTPUT (str, optional): Value returned when every attempt failed. Defaults to "$ERROR$".

    Returns:
        str: The judge's reply, or API_ERROR_OUTPUT if no attempt succeeded.

    Raises:
        IndexError: If QA_pair lacks the '### Human:' or '### Assistant:' marker.

    Example:
        >>> get_GPT_judgement("### Human: What is the capital of France? ### Assistant: Paris.",
        ...                   model='gpt-4', temperature=0.6)
        'A good answer, clear and helpful. [Rating: [[8]]]'
    """
    after_human = QA_pair.split('### Human:')[1]
    turns = [turn.strip() for turn in after_human.split('### Assistant:')]
    question, answer = turns[0], turns[1]

    prompt = f"""[Instruction]\nPlease act as an impartial judge and
    evaluate the quality of the response provided by an AI assistant
    to the user question displayed below. Your evaluation should consider
    factors such as the following:\n\n- **Simplicity**: Is the response
    clear and straightforward enough for middle or high school students to
    understand?\n- **Helpfulness**: Does the response effectively address
    the query?\n- **Relevance**: Does the response directly pertain to the
    question?\n- **Accuracy**: Is the response factually correct?\n-
    **Depth & Creativity**: Assess richness without favoring excessive
    detail.\n\nAim for a holistic assessment. Begin your evaluation by
    providing a short explanation. Be as objective as possible. After
    providing your explanation, you must rate the response on a scale
    of 1 to 10 by strictly following this format: \'[[rating]]\',
    for example: \'Rating: [[5]]\'.\n\n[Question]\n{question}\n\n
    [The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's
    Answer]"""

    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]

    # Try up to API_MAX_RETRY times; the first successful reply is returned.
    for _attempt in range(API_MAX_RETRY):
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=conversation,
                n=1,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            return response["choices"][0]["message"]["content"]
        except openai.error.OpenAIError as err:
            # Report the failure and wait before the next attempt.
            print(type(err), err)
            time.sleep(API_RETRY_SLEEP)

    # Every attempt failed (or none was allowed).
    return API_ERROR_OUTPUT

def GPT_judgement_on_df(df_predictions, output_file, model_engine='gpt-4'):
    """
    Generate GPT-based judgments on a DataFrame of predictions.

    For every model in `df_predictions`, each prediction on the 'full' dataset
    is sent to `get_GPT_judgement`. Judgements already present in `output_file`
    are reused, and the file is rewritten after every model that produced new
    judgements so that an interrupted run can be resumed.

    Args:
        df_predictions (pd.DataFrame): Predictions indexed by (model_name, dataset),
            one column per sample.
        output_file (str): Path of the CSV file storing the judgements
            (rows: sample position, columns: model name).
        model_engine (str, optional): OpenAI GPT model name. Defaults to 'gpt-4'.

    Returns:
        None

    Example:
        >>> GPT_judgement_on_df(df_predictions, 'output_judgements.csv', model_engine='gpt-4')
    """
    # judgements[model_name][sample position] -> judgement text.
    # A dict of dicts works whether or not the output file exists; the previous
    # defaultdict(list) failed on item assignment and had no .to_csv().
    judgements = defaultdict(dict)
    if os.path.exists(output_file):
        saved = pd.read_csv(output_file, index_col='Unnamed: 0')
        for model_name in saved.columns:
            # Cells that were never filled are NaN in the CSV; drop them so
            # those samples are judged now.
            judgements[model_name] = saved[model_name].dropna().to_dict()

    def save():
        pd.DataFrame(dict(judgements)).sort_index().to_csv(output_file)

    # Get unique model names from the predictions
    model_names = sorted({idx[0] for idx in df_predictions.index})

    for model_name in model_names:
        print(f'working on model {model_name}')
        model_predictions = df_predictions.loc[(model_name, 'full')]
        added = False

        for j in tqdm(range(df_predictions.shape[1])):
            # Skip samples that already have a judgement.
            if j in judgements[model_name]:
                continue
            QA_pair = model_predictions.iloc[j]
            judgements[model_name][j] = get_GPT_judgement(QA_pair, model=model_engine)
            added = True

        # Checkpoint so that API calls already made survive a later failure.
        if added:
            save()

    # Save judgments to output file
    save()


In [71]:
# Have GPT-4 judge the 'full'-dataset predictions of the 7B and 13B models.
# Judgements are stored in one CSV file per model size.
GPT_judgement_on_df(df_predictions_7B,
                        './llama-2-inference-512/GPT4_7B_judgements.csv')
GPT_judgement_on_df(df_predictions_13B,
                        './llama-2-inference-512/GPT4_13B_judgements.csv')

working on model Llama-2-7b-hf


100%|██████████| 100/100 [00:00<00:00, 4361.62it/s]


working on model Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged


100%|██████████| 100/100 [00:00<00:00, 4945.01it/s]


working on model Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged


100%|██████████| 100/100 [00:00<00:00, 5081.29it/s]


working on model llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged


100%|██████████| 100/100 [00:00<00:00, 4030.27it/s]


working on model llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged


100%|██████████| 100/100 [00:00<00:00, 4085.59it/s]


working on model llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged


100%|██████████| 100/100 [00:00<00:00, 4382.21it/s]


working on model Llama-2-13b-hf


100%|██████████| 100/100 [00:00<00:00, 4526.75it/s]


working on model Llama-2-13b-hf-ds_eli5_1024_r_64_alpha_16_merged


100%|██████████| 100/100 [00:00<00:00, 3881.14it/s]


working on model Llama-2-13b-hf-ds_wiki_1024_full_r_64_alpha_16_merged


100%|██████████| 100/100 [00:00<00:00, 2495.15it/s]


working on model Llama-2-13b-hf-eli5-cleaned-1024_qlora_merged


100%|██████████| 100/100 [00:00<00:00, 1918.42it/s]


working on model Llama-2-13b-hf-eli5-cleaned-wiki65k-1024_qlora_merged


100%|██████████| 100/100 [00:00<00:00, 2182.00it/s]


working on model Llama-2-13b-hf-eli5-wiki-1024_r_64_alpha_16_merged


100%|██████████| 100/100 [00:00<00:00, 1923.41it/s]


In [72]:
# Reload the saved GPT-4 judgements (one column per model).
df_GPT_4_judgements_7B = pd.read_csv('./llama-2-inference-512/GPT4_7B_judgements.csv',
                                  index_col='Unnamed: 0')
df_GPT_4_judgements_13B = pd.read_csv('./llama-2-inference-512/GPT4_13B_judgements.csv',
                                  index_col='Unnamed: 0')

In [73]:
# At 7B we see that the best performing model is the one trained on the combined ELI5-cleaned + Simple Wikipedia dataset.

# Extract the rating from every judgement, convert it to int and average per model.
scores_GPT4_7B = df_GPT_4_judgements_7B.T.applymap(find_rating).applymap(int).mean(axis=1)
# Show the models from best to worst mean rating.
sorted(list(scores_GPT4_7B.items()),key = lambda x:x[1],reverse=True)

[('Llama-2-7b-hf-eli5-cleaned-wiki65k-1024_qlora_merged', 3.89),
 ('llama-7b-SFT_ds_wiki65k_1024_r_64_alpha_16_merged', 3.58),
 ('llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16_merged', 3.45),
 ('Llama-2-7b-hf', 2.92),
 ('Llama-2-7b-hf-eli5-cleaned-1024_qlora_merged', 2.58),
 ('llama-7b-SFT_ds_eli5_1024_r_64_alpha_16_merged', 2.43)]

In [74]:
# At 13B we see that the best performing model is the one trained on the original ELI5 + Simple Wikipedia dataset.
# The model trained on ELI5-cleaned + Simple Wikipedia is not far behind though.

# Extract the rating from every judgement, convert it to int and average per model.
scores_GPT4_13B = df_GPT_4_judgements_13B.T.applymap(find_rating).applymap(int).mean(axis=1)
# Show the models from best to worst mean rating.
sorted(list(scores_GPT4_13B.items()),key = lambda x:x[1],reverse=True)

[('Llama-2-13b-hf-eli5-wiki-1024_r_64_alpha_16_merged', 4.91),
 ('Llama-2-13b-hf-eli5-cleaned-wiki65k-1024_qlora_merged', 4.77),
 ('Llama-2-13b-hf-ds_wiki_1024_full_r_64_alpha_16_merged', 4.53),
 ('Llama-2-13b-hf', 3.43),
 ('Llama-2-13b-hf-eli5-cleaned-1024_qlora_merged', 3.27),
 ('Llama-2-13b-hf-ds_eli5_1024_r_64_alpha_16_merged', 3.17)]