<a href="https://colab.research.google.com/github/fghaffar/AssistAF/blob/main/GPT_4o_mini_Fine_Tune_Legal_Bench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Packages and Imports

**NOTE** This Colab Notebook assumes your OpenAI Key is stored in your Colab Secrets. You can implement this any way you like, but if you want to live dangerously and hardcode the key, make a duplicate of this Notebook first.

In [None]:
!pip install -q huggingface_hub openai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/360.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.7/360.7 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/75.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from huggingface_hub import hf_hub_download
import shutil
import zipfile
import os
import json
import pandas as pd
from openai import OpenAI
from google.colab import userdata # This assumes your OpenAI API Key is stored in your Colab Secrets

In [None]:
## Assign your key here
# Pass the *name* of the Colab secret that holds your OpenAI API key (not the
# key itself); the value is looked up from the notebook's Colab Secrets.

openai_key = userdata.get('your key here')

## 🤗 Hugging Face, 🍥 OpenAI and 🐼 Pandas code we will use

This cell defines the Hugging Face, OpenAI and pandas helper functions we will use later. The main function just grabs dataset files from HF, and there is also an unzip utility which we aren't using here, but which is often needed depending on the files retrieved.

In [None]:
# Helper Functions for fetching huggingface repos and unzipping.

def fetch_files(repo_id, repo_type, file_dict, dest_dir='/content'):
    """
    Fetch specified files from a Hugging Face Hub repository and move them into a local directory.

    Each file is downloaded with ``hf_hub_download``, its real path is resolved, and the
    actual file is moved into ``dest_dir`` under its new name, replacing any existing
    file of that name. Failures are reported per file and do not stop the remaining files.

    :param repo_id: Repository ID on Hugging Face Hub (e.g., "username/repo_name").
    :param repo_type: Repository type forwarded to ``hf_hub_download`` (e.g., "dataset").
    :param file_dict: A dictionary where keys are filenames in the repository and values are desired new filenames.
    :param dest_dir: Directory the files are moved into. Defaults to "/content".
    :return: None. Progress and errors are printed.
    """
    for original_filename, new_filename in file_dict.items():
        try:
            # Download the file and ensure no symlink is created in the process
            file_path = hf_hub_download(
                repo_id=repo_id,
                filename=original_filename,
                repo_type=repo_type,
                local_dir_use_symlinks=False,
            )

            # Resolve the absolute path if file_path is a symlink
            resolved_file_path = os.path.realpath(file_path)

            # Make sure the destination directory exists before moving into it
            os.makedirs(dest_dir, exist_ok=True)
            dest_path = os.path.join(dest_dir, new_filename)

            # If the destination file already exists, remove it
            if os.path.exists(dest_path):
                os.remove(dest_path)

            # Move the actual file to the destination
            shutil.move(resolved_file_path, dest_path)
            print(f'Moved: {original_filename} to {dest_path}')
        except Exception as e:
            # Deliberately best-effort: report the failure and continue with the next file.
            print(f"Error downloading or moving {original_filename}: {e}")

def unzip_file(zip_path):
    """
    Extract a zip archive into the directory that contains it.

    Prints a message and returns without extracting when the path does not exist.

    :param zip_path: The full path to the zip file.
    """
    if os.path.exists(zip_path):
        # The archive's parent directory doubles as the extraction target
        target_dir, _ = os.path.split(zip_path)
        with zipfile.ZipFile(zip_path, 'r') as archive:
            archive.extractall(target_dir)
            print(f"Extracted {zip_path} to {target_dir}")
    else:
        print(f"The file {zip_path} does not exist.")

# Example usage:
# Let's assume the zip file is located in /content/new_data.zip
# unzip_file('/content/new_data.zip')

## Code for Creating Training and Test sets from HuggingFace files

def create_training_data(df, yes_count, no_count, random_state=None):
    """
    Build chat-format fine-tuning examples from a random, label-balanced sample of ``df``.

    :param df: DataFrame with 'answer', 'text' and 'task' columns. It is not modified.
    :param yes_count: Number of rows whose stripped answer is "Yes" to sample.
    :param no_count: Number of rows whose stripped answer is "No" to sample.
    :param random_state: Optional seed forwarded to ``DataFrame.sample`` so the
        selection can be reproduced. Defaults to None (a fresh sample on every call).
    :return: List of ``{"messages": [...]}`` dicts holding a system message (task),
        a user message (text) and an assistant message (answer); the "Yes"
        examples come first, followed by the "No" examples.
    :raises ValueError: If ``df`` has fewer "Yes" or "No" rows than requested.
    """
    # Strip whitespace on a copy so the caller's DataFrame is left untouched
    cleaned = df.assign(answer=df['answer'].str.strip())

    # Separate "Yes" and "No" examples based on the answer column
    yes_examples = cleaned[cleaned['answer'] == 'Yes']
    no_examples = cleaned[cleaned['answer'] == 'No']

    # Ensure we have enough examples to sample from
    if len(yes_examples) < yes_count or len(no_examples) < no_count:
        raise ValueError("Not enough examples to meet the requested counts")

    # Randomly sample the required number of "Yes" and "No" examples
    combined_examples = pd.concat([
        yes_examples.sample(n=yes_count, random_state=random_state),
        no_examples.sample(n=no_count, random_state=random_state),
    ])

    # Create the training data format
    return [
        {
            "messages": [
                {"role": "system", "content": task},
                {"role": "user", "content": text},
                {"role": "assistant", "content": answer},
            ]
        }
        for task, text, answer in zip(
            combined_examples['task'],
            combined_examples['text'],
            combined_examples['answer'],
        )
    ]

def create_test_data(df, yes_count, no_count, random_state=None):
    """
    Draw a random, label-balanced evaluation sample from ``df``.

    NOTE: rows are sampled independently of ``create_training_data``; if both are
    called on the same DataFrame the two samples can overlap unless the caller
    removes the training rows first.

    :param df: DataFrame with 'answer' and 'text' columns. It is not modified.
    :param yes_count: Number of rows whose stripped answer is "Yes" to sample.
    :param no_count: Number of rows whose stripped answer is "No" to sample.
    :param random_state: Optional seed forwarded to ``DataFrame.sample`` so the
        selection can be reproduced. Defaults to None (a fresh sample on every call).
    :return: New DataFrame with the sampled "Yes" rows followed by the sampled "No"
        rows, 'answer' stripped of whitespace and 'text' prefixed with "Clause: ".
    :raises ValueError: If ``df`` has fewer "Yes" or "No" rows than requested.
    """
    # Strip whitespace on a copy so the caller's DataFrame is left untouched
    cleaned = df.assign(answer=df['answer'].str.strip())

    # Separate "Yes" and "No" examples based on the answer column
    yes_examples = cleaned[cleaned['answer'] == 'Yes']
    no_examples = cleaned[cleaned['answer'] == 'No']

    # Ensure we have enough examples to sample from
    if len(yes_examples) < yes_count or len(no_examples) < no_count:
        raise ValueError("Not enough examples to meet the requested counts")

    # Randomly sample the required number of "Yes" and "No" examples and combine them
    combined_examples = pd.concat([
        yes_examples.sample(n=yes_count, random_state=random_state),
        no_examples.sample(n=no_count, random_state=random_state),
    ])

    # Prepend "Clause: " to each text
    combined_examples['text'] = "Clause: " + combined_examples['text']

    return combined_examples

# LLM API Code

def gpt_messages(system_message, user_message):
    """Return a two-entry chat history: the system message followed by the user message."""
    system_entry = dict(role="system", content=system_message)
    user_entry = dict(role="user", content=user_message)
    return [system_entry, user_entry]

### Model names: gpt-4o-mini, gpt-4o
def call_gpt(chat_history, response_format=None, model='gpt-4o-mini', temperature=0.0):
    """
    Send a chat history to the OpenAI chat completions API, retrying on failure.

    :param chat_history: List of message dicts passed as ``messages``.
    :param response_format: ``response_format`` argument for the API. Defaults to
        ``{'type': 'text'}`` when omitted or None.
    :param model: Model name to call.
    :param temperature: Sampling temperature.
    :return: The completion object, or None if all 5 attempts raised.
    """
    import time  # local import: only needed for the retry back-off

    # Build the default per call instead of sharing one mutable dict across calls
    if response_format is None:
        response_format = {'type': 'text'}

    client = OpenAI(api_key=openai_key)
    max_attempts = 5

    for attempt in range(1, max_attempts + 1):
        try:
            return client.chat.completions.create(
                model=model,
                messages=chat_history,
                temperature=temperature,
                response_format=response_format,
            )
        except Exception as e:
            # Best-effort: any API failure is reported and retried
            print(f"Encountered an API error: {e}")
            if attempt < max_attempts:
                # Back off (1s, 2s, 4s, 8s) so retries are not spent instantly,
                # e.g. while many concurrent workers are being rate limited
                time.sleep(min(2 ** (attempt - 1), 8))

    print("Maximum retry attempts reached. Unable to complete the request.")
    return None

# User-message template: the contract text, then the question followed by a
# Yes/No instruction, ending on "Answer:" for the model to complete.
cc_qa_prompt_template = """{contract}

Question: {question} Answer Yes or No.
Answer:"""

# Async Test Running Code

import nest_asyncio
nest_asyncio.apply()

import asyncio
from concurrent.futures import ThreadPoolExecutor

def make_llm_calls(contract, question, system_message, model='gpt-4o-mini'):
    """
    Ask the model one question about one contract and return its text answer.

    The contract and question are inserted into ``cc_qa_prompt_template`` to form
    the user message. Returns the string "Error" when ``call_gpt`` yields no
    completion.
    """
    prompt = cc_qa_prompt_template.format(contract=contract, question=question)
    completion = call_gpt(
        gpt_messages(system_message=system_message, user_message=prompt),
        model=model,
    )
    if not completion:
        return "Error"
    return completion.choices[0].message.content

def process_single_row(row, source_column, question_column, few_shot_prompt, model='gpt-4o-mini'):
    """
    Answer a single row: read the contract and the question from the named
    columns and pass them, with the prompt as system message, to ``make_llm_calls``.
    """
    return make_llm_calls(
        row[source_column],
        row[question_column],
        few_shot_prompt,
        model=model,
    )

async def process_row_async(loop, executor, row, source_column, question_column, few_shot_prompt, model='gpt-4o-mini'):
    """Run ``process_single_row`` for one row on ``executor`` and await its answer."""
    pending = loop.run_in_executor(
        executor,
        process_single_row,
        row,
        source_column,
        question_column,
        few_shot_prompt,
        model,
    )
    return await pending

async def run_test_set_async(test_df, source_column, question_column, shot_prompt, few_shot, model='gpt-4o-mini', start_index=0, end_index=10, max_concurrency=5):
    """
    Query the model for a positional slice of ``test_df`` and store the answers in place.

    Answers are written to a column named ``f'{model}_few_shot_answer'`` or
    ``f'{model}_zero_shot_answer'`` (created with None values if missing).

    :param test_df: DataFrame to evaluate; it is modified in place and returned.
    :param source_column: Column holding the contract/clause text.
    :param question_column: Column holding the question for each row.
    :param shot_prompt: System message sent with every request.
    :param few_shot: Selects the answer-column suffix (few-shot vs zero-shot).
    :param model: Model name forwarded to the API call.
    :param start_index: First row position (inclusive) to process.
    :param end_index: Last row position (exclusive); clamped to ``len(test_df)``.
    :param max_concurrency: Number of worker threads issuing requests.
    :return: ``test_df`` with the answer column filled for the processed rows.
    """
    # Determine the answer column name based on the few_shot parameter
    answer_column_suffix = '_few_shot_answer' if few_shot else '_zero_shot_answer'
    answer_column = f'{model}{answer_column_suffix}'

    # Ensure the column for the model's answers exists
    if answer_column not in test_df.columns:
        test_df[answer_column] = None

    # We are inside a coroutine, so the running loop is the one to schedule on
    loop = asyncio.get_running_loop()

    # Row positions to process; never run past the end of the DataFrame
    positions = range(start_index, min(end_index, len(test_df)))

    # The context manager shuts the worker threads down once all answers are in
    with ThreadPoolExecutor(max_workers=max_concurrency) as executor:
        tasks = [
            asyncio.ensure_future(
                process_row_async(loop, executor, test_df.iloc[pos], source_column, question_column, shot_prompt, model)
            )
            for pos in positions
        ]
        results = await asyncio.gather(*tasks)

    # Rows were read by position (iloc), so write the answers back by position
    # as well; label-based writes would hit the wrong rows on a non-default index.
    answer_col_pos = test_df.columns.get_loc(answer_column)
    for pos, result in zip(positions, results):
        test_df.iloc[pos, answer_col_pos] = result

    return test_df

def score_test_results(df, answer_column='answer', model_answer_column='gpt-4o-mini_answer'):
    """
    Print and return the percentage of rows where the model answer matches the label.

    Both answers are compared as stripped, lower-cased strings. Rows with a missing
    model answer count as incorrect. The model answer column of ``df`` is
    overwritten in place with its string form (missing values become '').

    NOTE: the default ``model_answer_column`` does not match the column names
    written by ``run_test_set_async`` (those end in ``_zero_shot_answer`` or
    ``_few_shot_answer``), so callers normally pass the column name explicitly.

    :param df: DataFrame holding both the label and the model answer columns.
    :param answer_column: Column with the expected answers.
    :param model_answer_column: Column with the model's answers.
    :return: Score from 0 to 100; 0.0 when ``df`` has no rows.
    """
    total_answers = len(df)

    # Fill missing answers *before* casting to str: casting first turns them into
    # the literal text "nan"/"None" and leaves nothing for fillna to replace.
    df[model_answer_column] = df[model_answer_column].fillna('').astype(str)

    # Normalize both sides the same way and compare them row by row
    expected = df[answer_column].astype(str).str.strip().str.lower()
    predicted = df[model_answer_column].str.strip().str.lower()
    correct_answers = int((expected.to_numpy() == predicted.to_numpy()).sum())

    # Calculate the normalized score out of 100 (guarding against an empty frame)
    score = (correct_answers / total_answers) * 100 if total_answers else 0.0
    print("Answers for", model_answer_column)

    print(f"Correct Answers: {correct_answers}")
    print(f"Total Answers: {total_answers}")
    print(f"Normalized Score: {score:.2f}")

    return score

## OpenAI's Provided formatting checker for Fine-tuning data format validation

import json
from collections import defaultdict

def check_finetune_dataset(data_path):
    """
    Validate a chat fine-tuning JSONL file and print a summary of format errors.

    Blank lines in the file are ignored. For every example the checker counts:
    non-dict examples, missing/empty ``messages``, messages lacking ``role`` or
    ``content``, unrecognized message keys or roles, missing or non-string
    content, and examples without any assistant message.

    :param data_path: Path to the JSONL file (read as UTF-8).
    :return: Dict mapping error name to count; empty when no errors were found.
    """
    # Load the dataset, skipping blank lines (e.g. a stray trailing newline)
    with open(data_path, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f if line.strip()]

    # Initial dataset stats
    print("Num examples:", len(dataset))
    if dataset:
        print("First example:")
        first_example = dataset[0]
        # Only preview well-formed examples; malformed ones are counted below
        if isinstance(first_example, dict):
            for message in first_example.get("messages") or []:
                print(message)

    # Format error checks
    format_errors = defaultdict(int)

    for ex in dataset:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue

        for message in messages:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1

            if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in ("system", "user", "assistant", "function"):
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            function_call = message.get("function_call", None)

            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1

        if not any(message.get("role", None) == "assistant" for message in messages):
            format_errors["example_missing_assistant_message"] += 1

    # Report errors if any
    if format_errors:
        print("Found errors:")
        for k, v in format_errors.items():
            print(f"{k}: {v}")
    else:
        print("No errors found")

    return dict(format_errors)

### Create a Fine Tune Dataset

For fine-tuning, we are downloading several of the CUAD task datasets from NGuha's LegalBench dataset. This is an adaptation of the Atticus Project's Contract Understanding Atticus Dataset (https://www.atticusprojectai.org/cuad). CUAD is an expert-labeled dataset of real contract clauses, each paired with a binary label indicating whether or not the clause contains a particular property.

An example entry from the CUAD Dataset looks like this:

> *Text: FMI hereby grants to Roche a non-exclusive, royalty-free, worldwide, perpetual, and sublicensable license to any intellectual property arising from the Immunotherapy Testing Platform Program Controlled by FMI for internal research purposes and to the extent necessary for Roche to research, develop, make, have made, use, offer for sale, sell, import and commercialize Roche products other than diagnostic products.*

> *Answer: Yes*

> *Task: Classify if the clause contains a license granted by one party to its counterparty.  Answer "Yes" or "No" and no other text.*

CUAD does not come with tasks, but it was adapted by the LegalBench team to include a labeling task for LLM testing. (https://arxiv.org/pdf/2308.11462)

We are randomly sampling 60 CUAD labeled task pairs from LegalBench for the training set, ten 'Yes' pairs and five 'No' pairs from each of four CUAD datasets. In the LegalBench HF repo, the CUAD labeled pairs are not stored with their Tasks. We are doing some hacky string additions to the dataset to combine the task instructions with the raw dataset files.

You are supposed to fine-tune on examples where the model has known performance problems, or to model the behavior of a stronger model (e.g. synthetic data from GPT-4), so this random selection training is far from ideal.




In [None]:
# Globals for downloading the LegalBench CUAD datasets
# file_dict maps each file path inside the Hugging Face repo to the local file name it is saved as.

hf_repo = 'nguha/legalbench'
hf_repo_type = 'dataset'
file_dict = {
    "data/cuad_cap_on_liability/test.tsv": "cuad_cap_on_liability_test.tsv",
    "data/cuad_ip_ownership_assignment/test.tsv": "cuad_ip_ownership_assignment_test.tsv",
    "data/cuad_irrevocable_or_perpetual_license/test.tsv": "cuad_irrevocable_or_perpetual_license_test.tsv",
    "data/cuad_license_grant/test.tsv": "cuad_license_grant_test.tsv",
}

In [None]:
# Download the four CUAD TSV files listed in file_dict and move them into /content/.
fetch_files(hf_repo, hf_repo_type, file_dict)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


data/cuad_cap_on_liability/test.tsv:   0%|          | 0.00/573k [00:00<?, ?B/s]

Moved: data/cuad_cap_on_liability/test.tsv to /content/cuad_cap_on_liability_test.tsv


(…)ta/cuad_ip_ownership_assignment/test.tsv:   0%|          | 0.00/288k [00:00<?, ?B/s]

Moved: data/cuad_ip_ownership_assignment/test.tsv to /content/cuad_ip_ownership_assignment_test.tsv


(…)rrevocable_or_perpetual_license/test.tsv:   0%|          | 0.00/157k [00:00<?, ?B/s]

Moved: data/cuad_irrevocable_or_perpetual_license/test.tsv to /content/cuad_irrevocable_or_perpetual_license_test.tsv


data/cuad_license_grant/test.tsv:   0%|          | 0.00/694k [00:00<?, ?B/s]

Moved: data/cuad_license_grant/test.tsv to /content/cuad_license_grant_test.tsv


In [None]:
# Building DFs of Labeled Clauses and Tasks
# For each of the four CUAD datasets: the task instruction string (used later as
# the prompt), the path of the downloaded TSV, and the DataFrame loaded from it.

# The trailing backslash continues the string literal onto the next line without adding a newline.
cuad_cap_task = 'Classify if the clause specifies a cap on liability upon the breach of a party\u2019s \
obligation? This includes time limitation for the counterparty to bring claims or maximum amount for recovery. Answer "Yes" or "No" and no other text.'
cuad_cap_on_liability = '/content/cuad_cap_on_liability_test.tsv'
cuad_cap_on_liability_df = pd.read_csv(cuad_cap_on_liability, sep='\t')

cuad_ip_own_task = 'Classify if the clause specifies that intellectual property created by one party become the property of the counterparty, either per the terms of the contract or upon the occurrence of certain events.  Answer "Yes" or "No" and no other text.'
cuad_ip_ownership_assignment_test = '/content/cuad_ip_ownership_assignment_test.tsv'
cuad_ip_ownership_assignment_df = pd.read_csv(cuad_ip_ownership_assignment_test, sep='\t')

cuad_irrevocable_task = 'Classify if the clause specifies a license grant that is irrevocable or perpetual.  Answer "Yes" or "No" and no other text.'
cuad_irrevocable_or_perpetual_license_test = '/content/cuad_irrevocable_or_perpetual_license_test.tsv'
cuad_irrevocable_or_perpetual_license_df = pd.read_csv(cuad_irrevocable_or_perpetual_license_test, sep='\t')

cuad_license_task = 'Classify if the clause contains a license granted by one party to its counterparty.  Answer "Yes" or "No" and no other text.'
cuad_license_grant_test = '/content/cuad_license_grant_test.tsv'
cuad_license_grant_df = pd.read_csv(cuad_license_grant_test, sep='\t')

In [None]:
# Attach each dataset's task instruction as a constant 'task' column, so every
# row carries the prompt that belongs to it.
cuad_cap_on_liability_df['task'] = cuad_cap_task
cuad_ip_ownership_assignment_df['task'] = cuad_ip_own_task
cuad_irrevocable_or_perpetual_license_df['task'] = cuad_irrevocable_task
cuad_license_grant_df['task'] = cuad_license_task

In [None]:
df_list = [cuad_cap_on_liability_df, cuad_ip_ownership_assignment_df, cuad_irrevocable_or_perpetual_license_df, cuad_license_grant_df]

# Flat list of chat-format examples across all four datasets
training_data = []

# 10 "Yes" + 5 "No" examples per dataset -> 15 x 4 = 60 training examples
for df in df_list:
    training_data.extend(create_training_data(df, 10, 5))

#### Save the Training Data to JSONL, Test then Upload to OpenAI

We are following the detailed instructions here: https://platform.openai.com/docs/guides/fine-tuning

I recommend reading this article as well, or keeping it open as you go through this section.

There is also a linked Cookbook article on validating and checking your finetune dataset with additional checks for estimating costs and fixing errors. (https://cookbook.openai.com/examples/chat_finetuning_data_prep)

Token counting is less critical today because gpt-4o mini supports 65k length training examples, but if you encounter other errors, check here.

**NOTE** I've wrapped the fine tuning job request into a function for convenience, but it is really bare-bones and there are more interesting things you can do with hyper-parameters if you know how.

In [None]:
# Functions to initiate a fine tune job

def upload_file_to_openai(api_key, data_path):
    """
    Upload a training file to OpenAI for fine-tuning.

    :param api_key: OpenAI API key used to create the client.
    :param data_path: Path of the file to upload.
    :return: Tuple ``(file_id, file_object)``: the id of the uploaded file and
        the full response object it came from.
    """
    # Initialize the OpenAI client
    client = OpenAI(api_key=api_key)

    # Upload inside a context manager so the file handle is closed once the
    # upload finishes (or fails) instead of being left open
    with open(data_path, "rb") as training_file:
        response = client.files.create(
            file=training_file,
            purpose="fine-tune"
        )

    # Print the FileObject id value from the response
    print("FileObject id:", response.id)

    # Return the id together with the FileObject
    return response.id, response

def create_fine_tuning_job(api_key, training_file_id, model, suffix):
    """
    Start a fine-tuning job and return the API response.

    :param api_key: OpenAI API key used to create the client.
    :param training_file_id: Id of the previously uploaded training file.
    :param model: Name of the base model to fine-tune.
    :param suffix: Suffix string for the fine-tuned model's name.
    :return: The response object returned by the job creation call.
    """
    job_request = {
        "training_file": training_file_id,
        "model": model,
        "suffix": suffix,
    }
    response = OpenAI(api_key=api_key).fine_tuning.jobs.create(**job_request)

    # Show the whole response
    print("Fine-tuning response:", response)
    return response

In [None]:
## Name your file here. I called mine 'cuad_training_data.jsonl'
# This is a relative path, so the file is written to the current working directory.

training_data_file_name = 'cuad_training_data.jsonl'

def save_to_jsonl(data, filename):
    """Write every entry of ``data`` to ``filename`` as one JSON document per line."""
    with open(filename, 'w') as out_file:
        out_file.writelines(json.dumps(record) + '\n' for record in data)

# NOTE: save_to_jsonl has no return value, so my_data is always None; the call
# is made for its side effect of writing the file.
my_data = save_to_jsonl(training_data, training_data_file_name)

In [None]:
## This runs OpenAI's recommended validators
# Derive the path from the file name used when saving, so the two cannot drift
# apart (in Colab the working directory is /content, giving the same path as before).
data_path = os.path.abspath(training_data_file_name)

check_finetune_dataset(data_path)

In [None]:
# Upload your File. This should take seconds to complete.
# Only the returned file id is kept; the second element of the returned tuple is discarded.

training_file_id, _ = upload_file_to_openai(openai_key, data_path)

In [None]:
# Fine Tune your model. This may take several minutes or possibly an hour or two.
# Example timing from one run: job started at 4:24, finished at 4:39.

model = 'gpt-4o-mini-2024-07-18'
suffix = 'cuad_finetune_2' # add a suffix parameter string to the name of your model

# Create the fine-tuning job
fine_tuning_job = create_fine_tuning_job(openai_key, training_file_id, model, suffix)

## Testing our Fine Tune Model

We are going to test our model Fine-Tuned on CUAD Labeled Pairs.

We will generate a Test dataset from the CUAD Files, and then run Vanilla GPT-4o mini against our Fine-Tuned Model.

### Check your email for the Model Name.

You will receive an email from OpenAI when your Fine Tune job is completed, which contains your model name. For example purposes, my model is:

`ft_model = 'ft:gpt-4o-mini-2024-07-18:leo-s-laboratory:cuad-finetune:9p4146Ce'`

### Modify the Test Set Size

You can modify the size of the Test Set from these parameters:

>     test_data.append(create_test_data(df, 20, 10))

The two integers represent the number of 'Yes' labeled pairs and 'No' labeled pairs to include from the CUAD Parent Datasets. The Full LegalBench CUAD datasets contain hundreds or thousands of labeled pairs. We are sampling 30 pairs from each of four datasets, or 120 pairs.

In [None]:
# Create Test Data

# List of DataFrames
df_list = [
    cuad_cap_on_liability_df,
    cuad_ip_ownership_assignment_df,
    cuad_irrevocable_or_perpetual_license_df,
    cuad_license_grant_df
]

# Clause texts already used for fine-tuning (the user message of each training
# example). They are excluded below so the fine-tuned model is not evaluated
# on examples it was trained on.
training_texts = {example["messages"][1]["content"] for example in training_data}

# Initialize an empty list to hold the combined DataFrames
test_data = []

# Apply the function to the held-out rows of each DataFrame and append the results
for df in df_list:
    held_out_df = df[~df['text'].isin(training_texts)].copy()
    test_data.append(create_test_data(held_out_df, 20, 10))

# Concatenate all the DataFrames in the list into a single DataFrame
combined_test_data = pd.concat(test_data, ignore_index=True)
len(combined_test_data)

In [None]:
#### If using a few shot prompt, set few_shot to True. Otherwise False.
zero_shot_prompt = """Answer "Yes" or "No" and no other text."""

# Define the parameters (keyword arguments for run_test_set_async)
cuad_test_params = {
    'test_df': combined_test_data,
    'source_column': 'text',
    'question_column': 'task',
    'shot_prompt': zero_shot_prompt,
    'few_shot': False,
    'model': 'gpt-4o-mini',
    'start_index': 0,
    'end_index': 200,  # may exceed the number of rows; processing stops at the end of the DataFrame
    'max_concurrency': 40,
}

In [None]:
# Run our test using the base model
# (top-level await: this line must run in a notebook cell or another async context)

cuad_test_df = await run_test_set_async(**cuad_test_params)

In [None]:
# replace the ft_model value with your model's name.

ft_model = 'ft:gpt-4o-mini-2024-07-18:leo-s-laboratory:cuad-finetune-2:9pOp1jOF'
zero_shot_prompt = """Answer "Yes" or "No" and no other text."""

# Define the parameters (same test DataFrame and prompt as the base-model run; only the model differs)
ft_cuad_test_params = {
    'test_df': combined_test_data,
    'source_column': 'text',
    'question_column': 'task',
    'shot_prompt': zero_shot_prompt,
    'few_shot': False,
    'model': ft_model,
    'start_index': 0,
    'end_index': 150,  # may exceed the number of rows; processing stops at the end of the DataFrame
    'max_concurrency': 25,
}

In [None]:
# Run our test using the fine tune model
# Answers are written into the same combined_test_data DataFrame under a column
# named after the model, so cuad_test_df ends up holding both models' answers.

cuad_test_df = await run_test_set_async(**ft_cuad_test_params)

In [None]:
# Every answer column is named after its model, so selecting on 'gpt-' picks up
# both the base-model and the fine-tuned-model columns.
gpt_columns = [column for column in cuad_test_df.columns if 'gpt-' in column]

# Loop through the filtered columns
for column in gpt_columns:
    # Drop periods so an answer such as "Yes." compares equal to "Yes"
    cuad_test_df[column] = cuad_test_df[column].str.replace('.', '', regex=False)
    score = score_test_results(cuad_test_df, answer_column='answer', model_answer_column=column)
    print("\n-------------\n")