## Jupyter Notebook to Add Noise to Clauses, Generating More

This notebook uses the OpenAI API to fine-tune a model and generate synthetic clauses.

While inexpensive, this is not free, so users should rely on the pre-generated clauses. The notebook is included only to demonstrate the process.

In [None]:
from tclp.clause_recommender import utils
import os
from openai import OpenAI
import json
import re
import random

## Loading Clauses

In [None]:
# Directory holding the cleaned clause files, relative to the notebook's working directory.
folder_path = "../../data/cleaned_clauses"
# load_clauses returns two values, unpacked here as clause texts and clause names.
# NOTE(review): presumably parallel sequences (later cells index both by the same position) — confirm in utils.
clauses, clause_names = utils.load_clauses(folder_path)

In [None]:
# Pair every clause name with the text at the same position, then serialize
# the resulting records as an indented JSON string.
clauses_list = [
    {"name": clause_name, "content": clauses[position]}
    for position, clause_name in enumerate(clause_names)
]
clauses_json = json.dumps(clauses_list, indent=4)

In [None]:
def view_clause_json(clause_list, index_to_view=1):
    """Pretty-print one entry of ``clause_list`` as indented JSON.

    Args:
        clause_list: Sequence of JSON-serializable entries.
        index_to_view: Position of the entry to show (defaults to 1). Only
            non-negative, in-range positions are accepted.

    Prints ``"Invalid index"`` when the position is out of range.
    """
    out_of_range = index_to_view < 0 or index_to_view >= len(clause_list)
    if out_of_range:
        print("Invalid index")
        return
    rendered = json.dumps(clause_list[index_to_view], indent=4)
    print(rendered)

In [None]:
view_clause_json(clauses_list)

In [None]:
# change keys in JSON to input and output


def change_keys(clause_list):
    """Re-key every entry from ``name``/``content`` to ``input``/``output``.

    Each position of ``clause_list`` is replaced, in place, by a new dict that
    holds only the two renamed keys; any other keys are dropped.

    Args:
        clause_list: List of dicts with ``"name"`` and ``"content"`` keys.

    Returns:
        The same list object that was passed in.
    """
    for position, entry in enumerate(clause_list):
        renamed = {"input": entry["name"], "output": entry["content"]}
        clause_list[position] = renamed
    return clause_list

In [None]:
# change_keys rewrites the list in place and returns it, so clause_list and
# clauses_list are the same list object, now keyed by "input"/"output".
clause_list = change_keys(clauses_list)

In [None]:
view_clause_json(clause_list)

In [None]:
# Shared OpenAI client used by every API call in the cells below.
# NOTE(review): presumably reads credentials from the environment (e.g. OPENAI_API_KEY) — confirm.
client = OpenAI()

In [None]:
def generate_summary(clause_list):
    """Attach a short model-written summary to every clause.

    The ``"output"`` text of each entry is sent to the module-level ``client``
    through the completions endpoint, and the stripped reply is stored under
    ``"summary"``. If the request (or reading its result) raises, the error is
    printed and the placeholder ``"Error generating summary"`` is stored
    instead, so one failure does not stop the rest of the run.

    Args:
        clause_list: List of dicts, each with an ``"output"`` key.

    Returns:
        The same list, with its entries updated in place.
    """
    for position, entry in enumerate(clause_list):
        instruction = "Summarize the following clause in three words:\n\n"
        clause_text = f"{entry['output']}\n\n"
        cue = "Summary (three words):"
        request_text = instruction + clause_text + cue
        try:
            response = client.completions.create(
                model="gpt-3.5-turbo-instruct",  # completion-style model
                prompt=request_text,
                max_tokens=10,  # keep the reply to a few words
                temperature=0.7,
            )
            print(response)
            entry["summary"] = response.choices[0].text.strip()
        except Exception as e:
            print(f"Error generating summary for clause {position}: {e}")
            entry["summary"] = "Error generating summary"
    return clause_list

In [None]:
# Makes one API request per clause; entries gain a "summary" key in place, so
# summarized_clauses is the same list as clause_list.
summarized_clauses = generate_summary(clause_list)

In [None]:
print(json.dumps(summarized_clauses, indent=4))

In [None]:
def process_summarized_clauses(summarized_clauses):
    """Turn each clause's summary into a generation prompt and collect its words.

    For every entry, ``"input"`` is overwritten with a request to write a legal
    clause about the entry's summary. Entries without a summary, or whose
    summary is the ``"Error generating summary"`` placeholder, get the fixed
    "No summary available" prompt and contribute no words. The ``"summary"``
    key is removed from every entry afterwards.

    Args:
        summarized_clauses: List of dicts, modified in place.

    Returns:
        A ``(clauses, words)`` tuple: the same list that was passed in, and the
        sorted list of unique whitespace-separated words seen in usable
        summaries.
    """
    missing = object()
    vocabulary = set()
    for entry in summarized_clauses:
        summary = entry.get("summary", missing)
        usable = summary is not missing and summary != "Error generating summary"
        if not usable:
            entry["input"] = (
                "Please write me a legal clause that could be integrated into a contract for No summary available."
            )
        else:
            entry["input"] = (
                f"Please write me a legal clause that could be integrated into a contract for {summary}."
            )
            vocabulary.update(summary.split())

        # The summary has served its purpose once the prompt is built.
        entry.pop("summary", None)

    return summarized_clauses, sorted(vocabulary)

In [None]:
# Replaces each entry's "input" with a generation prompt built from its summary
# and collects the sorted unique words that appeared in the summaries.
updated_summarized_clauses, summary_words = process_summarized_clauses(
    summarized_clauses
)

In [None]:
len(updated_summarized_clauses)

In [None]:
view_clause_json(updated_summarized_clauses, 11)

In [None]:
def _normalize_clause_text(text):
    """Flatten whitespace and strip extraction artifacts from one text field."""
    # Real and literal (escaped) newlines / non-breaking spaces become spaces.
    text = re.sub(r"\\n|\\u00a0|\n|\u00a0", " ", text)
    # Typographic apostrophe -> ASCII apostrophe.
    text = text.replace("\u2019", "'")
    # Drop "[●]" placeholders.
    text = re.sub(r"\[\u25cf\]", "", text)
    # Collapse any run of whitespace left behind and trim the ends.
    return re.sub(r"\s+", " ", text).strip()


def clean_prompts(clauses):
    """Build cleaned prompt/completion records for fine-tuning.

    Both the ``"input"`` and the ``"output"`` of each clause go through the
    same text normalization. The prompt is then forced to end in exactly one
    period, and the completion is given a leading space and a trailing
    ``" [END]"`` marker (added only when not already present).

    Args:
        clauses: Iterable of dicts with string ``"input"`` and ``"output"`` keys.

    Returns:
        A new list of ``{"prompt": ..., "completion": ...}`` dicts, one per
        clause, in the same order.
    """
    fine_tuning_data_list = []
    for clause in clauses:
        prompt = _normalize_clause_text(clause["input"]).rstrip(".") + "."

        completion = _normalize_clause_text(clause["output"])
        if not completion.endswith(" [END]"):
            completion += " [END]"

        fine_tuning_data_list.append(
            {"prompt": prompt, "completion": " " + completion}
        )

    return fine_tuning_data_list

In [None]:
# Produce the cleaned {"prompt", "completion"} records used for fine-tuning.
final_clauses = clean_prompts(updated_summarized_clauses)

In [None]:
len(final_clauses)

In [None]:
view_clause_json(final_clauses, 11)

In [None]:
# The unique summary words form the vocabulary from which new three-word
# keyword sets are drawn later in the notebook.
print("Unique words in summaries:", summary_words)
len(summary_words)

In [None]:
output_file = "../fine_tuning.jsonl"
# Write one JSON object per line; non-ASCII characters are kept as-is rather
# than escaped.
with open(output_file, "w", encoding="utf-8") as jsonl_out:
    for record in final_clauses:
        jsonl_out.write(json.dumps(record, ensure_ascii=False) + "\n")

In [None]:
def convert_to_chat_format(input_file, output_file):
    """Convert a prompt/completion JSONL file into chat-format JSONL.

    Every non-blank input line must be a JSON object with ``"prompt"`` and
    ``"completion"`` keys. Each becomes one output line holding a ``messages``
    list: a fixed system message, the stripped prompt as the user turn, and the
    stripped completion as the assistant turn.

    Blank lines (for example a trailing empty line) are skipped instead of
    failing in ``json.loads``. The input is read completely before the output
    is opened.

    Args:
        input_file: Path of the prompt/completion JSONL file to read.
        output_file: Path of the chat-format JSONL file to write.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks ``"prompt"`` or ``"completion"``.
    """
    chat_data = []
    with open(input_file, "r", encoding="utf-8") as infile:
        for line in infile:
            if not line.strip():
                continue
            entry = json.loads(line)
            chat_data.append(
                {
                    "messages": [
                        {
                            "role": "system",
                            "content": "You are a helpful assistant that writes legal clauses.",
                        },
                        {"role": "user", "content": entry["prompt"].strip()},
                        {
                            "role": "assistant",
                            "content": entry["completion"].strip(),
                        },
                    ]
                }
            )

    with open(output_file, "w", encoding="utf-8") as outfile:
        for chat_entry in chat_data:
            outfile.write(json.dumps(chat_entry, ensure_ascii=False) + "\n")

    print(f"Converted data saved to {output_file}")

In [None]:
# Re-read the prompt/completion file written earlier and save a chat-format copy.
# Note: this rebinds output_file to the chat-format path.
input_file = "../fine_tuning.jsonl"
output_file = "../chat_fine_tuning.jsonl"
convert_to_chat_format(input_file, output_file)

## Fine tuning

In [None]:
# Upload the chat-format training file; the ID of the uploaded file is kept in
# file_id so the fine-tuning job can refer to it.
with open("../chat_fine_tuning.jsonl", "rb") as file:
    response = client.files.create(file=file, purpose="fine-tune")
    file_id = response.id
    print(f"Uploaded file ID: {file_id}")

In [None]:
# Start a fine-tuning job on the uploaded file (base model gpt-3.5-turbo,
# 4 epochs) and keep the job ID for the status checks that follow.
response = client.fine_tuning.jobs.create(
    training_file=file_id, model="gpt-3.5-turbo", hyperparameters={"n_epochs": 4}
)
fine_tune_job_id = response.id
print(f"Fine-tuning job created. ID: {fine_tune_job_id}")

In [None]:
# Print the events returned for the fine-tuning job at the time this cell runs.
events = client.fine_tuning.jobs.list_events(fine_tune_job_id)
for event in events:
    print(event)

In [None]:
import time

# Poll the job every 30 seconds until it reaches a terminal state.
while True:
    response = client.fine_tuning.jobs.retrieve(fine_tune_job_id)
    print(f"Fine-tuning status: {response.status}")

    # "cancelled" is terminal as well; without it a cancelled job would be
    # polled forever.
    if response.status in ("succeeded", "failed", "cancelled"):
        break

    time.sleep(30)

In [None]:
# Look up the job to read the name of the resulting model. This uses the ID of
# the job created in this notebook instead of a hard-coded ID from an earlier
# run. When resuming in a fresh kernel, set fine_tune_job_id to the "ftjob-..."
# ID that was printed when the job was created.
response = client.fine_tuning.jobs.retrieve(fine_tune_job_id)
fine_tuned_model = response.fine_tuned_model

print(f"Fine-tuned model name: {fine_tuned_model}")

In [None]:
def create_a_clause(client, fine_tuned_model, prompt):
    """Request one generated clause for the given keywords.

    Sends a chat completion request with two system messages (general writing
    instructions and the sustainability constraint) followed by a user message
    that embeds ``prompt`` as the key words.

    Args:
        client: Object exposing ``chat.completions.create``.
        fine_tuned_model: Model name passed through as ``model``.
        prompt: Key words inserted into the user message.

    Returns:
        Whatever ``client.chat.completions.create`` returns, unmodified.
    """
    writing_instructions = {
        "role": "system",
        "content": "You are a helpful assistant that generates legal clauses. Make sure you are combining the keywords, not just writing separate components for each. Pay attention to starting a clause in a realistic way.",
    }
    topic_constraint = {
        "role": "system",
        "content": "Every clause MUST be related to sustainability.",
    }
    user_request = {
        "role": "user",
        "content": f"Please write me a legal clause that could be integrated into a contract for key words: {prompt}.",
    }
    return client.chat.completions.create(
        model=fine_tuned_model,
        messages=[writing_instructions, topic_constraint, user_request],
        max_tokens=1000,
        temperature=1,
    )

In [None]:
# Try the fine-tuned model once with three sample key words before generating in bulk.
clause_test = create_a_clause(
    client, fine_tuned_model, "confidentiality, offsetting, access"
)

In [None]:
clause_test.choices[0].message.content.strip()

In [None]:
# With 391 words, how many 3-word selections can be made?
# math.comb counts unordered combinations (word order ignored); counting
# ordered permutations would use math.perm(391, 3) instead.
import math

math.comb(391, 3)

In [None]:
# The notebook asks for 1800 keyword triples, one per clause to generate.
def create_permutations(summary_words, num_permutations):
    """Return ``num_permutations`` distinct ordered three-word selections.

    Each selection is a list of three different words drawn at random from
    ``summary_words``. No selection is repeated in the result; the same words
    in a different order count as a different selection.

    Unlike the previous version, this honors ``num_permutations`` (it was
    ignored in favor of a hard-coded 1800), actually rejects duplicates, and
    leaves the caller's list untouched instead of shuffling it in place.

    Args:
        summary_words: Iterable of hashable words; repeated words are ignored.
        num_permutations: Number of selections to produce.

    Returns:
        A list of ``num_permutations`` three-element lists. Empty when
        ``num_permutations`` is zero or negative.

    Raises:
        ValueError: If more selections are requested than can exist.
    """
    # De-duplicated copy, preserving first-seen order; never mutate the input.
    pool = list(dict.fromkeys(summary_words))
    size = len(pool)
    max_unique = size * (size - 1) * (size - 2) if size >= 3 else 0

    if num_permutations <= 0:
        return []
    if num_permutations > max_unique:
        raise ValueError(
            f"Cannot create {num_permutations} unique permutations of 3 words "
            f"from {size} distinct words (maximum is {max_unique})."
        )

    # When most of the possible selections are wanted, rejection sampling
    # would waste draws; enumerate them all (at most 2x the request) and sample.
    if 2 * num_permutations > max_unique:
        import itertools

        every_triple = list(itertools.permutations(pool, 3))
        return [list(triple) for triple in random.sample(every_triple, num_permutations)]

    seen = set()
    permutations_list = []
    while len(permutations_list) < num_permutations:
        candidate = tuple(random.sample(pool, 3))
        if candidate in seen:
            continue
        seen.add(candidate)
        permutations_list.append(list(candidate))

    return permutations_list

In [None]:
# Draw the three-word keyword sets that will seed the generated clauses.
permutations = create_permutations(summary_words, 1800)

In [None]:
len(permutations)

In [None]:
permutations[4]

In [None]:
# Collects the generated clause texts. Re-running this cell discards anything gathered so far.
new_clauses = []

In [None]:
# Generate one clause per keyword set. Starting at len(new_clauses) keeps
# new_clauses[k] aligned with permutations[k] and makes the cell resumable:
# a fresh run covers every keyword set, and a re-run after an interruption
# (rate limit, network error) continues where it stopped rather than from a
# hard-coded offset.
start = len(new_clauses)
for count, i in enumerate(range(start, len(permutations)), start=1):
    response = create_a_clause(client, fine_tuned_model, ", ".join(permutations[i]))
    print(count, permutations[i])
    new_clauses.append(response.choices[0].message.content.strip())

In [None]:
len(new_clauses)

In [None]:
def save_new_clauses(new_clauses, folder_path):
    """Write every clause to its own UTF-8 text file.

    The folder is created if needed (including missing parents). Files are
    named ``new_clause_<index>.txt`` using each clause's position in
    ``new_clauses``; existing files with the same name are overwritten.

    Args:
        new_clauses: Sequence of clause strings.
        folder_path: Directory that receives the files.
    """
    os.makedirs(folder_path, exist_ok=True)

    for number, text in enumerate(new_clauses):
        destination = os.path.join(folder_path, f"new_clause_{number}.txt")
        with open(destination, "w", encoding="utf-8") as handle:
            handle.write(text)

In [None]:
# Output directory for the generated clauses. This rebinds folder_path, which
# earlier pointed at the source clause directory.
folder_path = "../../data/cleaned_gen_clauses"

In [None]:
# Write each generated clause to its own new_clause_<index>.txt file in folder_path.
save_new_clauses(new_clauses, folder_path)