## Jupyter Notebook to Add Noise to Clauses, Generating More

In [None]:
import utils
import os
from openai import OpenAI
import json
import re
import random

## Loading Clauses

In [35]:
folder_path = '../tclp_content/England:Wales'
clauses, clause_names = utils.load_clauses(folder_path)

In [36]:
#converting to JSON
clauses_list = []
for i in range(len(clause_names)):
    clauses_list.append({
        "name": clause_names[i],
        "content": clauses[i]
    })
clauses_json = json.dumps(clauses_list, indent=4)

In [37]:
def view_clause_json (clause_list, index_to_view = 1): 
    if 0 <= index_to_view < len(clause_list):
        print(json.dumps(clause_list[index_to_view], indent=4))
    else:
        print("Invalid index")

In [38]:
view_clause_json(clauses_list)

{
    "name": "Management_Equity_Ratchet_Terms.txt",
    "content": "1. Conversion Rights\n1.1 (see Definitions)\n1.2 The purpose of this Article [\u25cf] is to adjust the share capital of the Company so that the A Shareholder Proceeds and the B Shareholder Proceeds shall be the proportions of the Capitalisation Value calculated in accordance with this Article [\u25cf].2:\n1.2.1 firstly, the Capitalisation Value shall be split between the A Shareholder Proceeds and B Shareholder Proceeds in the ratio of [75:25] until the Target Amount shall have been received by the Investor[s]; and\n1.2.2 secondly, provided a Ratchet Trigger has occurred, the balance (if any) of the Capitalisation Value (after deducting the amount allocated under Article [\u25cf].2.1) shall be split between the A Shareholder Proceeds and the B Shareholder Proceeds in the ratio [65:35].\n1.3 On the Conversion Date, conditionally upon the occurrence of the relevant Conversion Event, such number of A Ordinary Shares shal

In [39]:
#change keys in JSON to input and output 

def change_keys (clause_list):
    for i in range(len(clause_list)):
        clause_list[i] = {
            "input": clause_list[i]["name"],
            "output": clause_list[i]["content"]
        }
    return clause_list


In [40]:
clause_list = change_keys(clauses_list)

In [41]:
view_clause_json(clause_list)

{
    "input": "Management_Equity_Ratchet_Terms.txt",
    "output": "1. Conversion Rights\n1.1 (see Definitions)\n1.2 The purpose of this Article [\u25cf] is to adjust the share capital of the Company so that the A Shareholder Proceeds and the B Shareholder Proceeds shall be the proportions of the Capitalisation Value calculated in accordance with this Article [\u25cf].2:\n1.2.1 firstly, the Capitalisation Value shall be split between the A Shareholder Proceeds and B Shareholder Proceeds in the ratio of [75:25] until the Target Amount shall have been received by the Investor[s]; and\n1.2.2 secondly, provided a Ratchet Trigger has occurred, the balance (if any) of the Capitalisation Value (after deducting the amount allocated under Article [\u25cf].2.1) shall be split between the A Shareholder Proceeds and the B Shareholder Proceeds in the ratio [65:35].\n1.3 On the Conversion Date, conditionally upon the occurrence of the relevant Conversion Event, such number of A Ordinary Shares shal

In [50]:
client = OpenAI()

In [75]:
def generate_summary(clause_list):
    for i in range(len(clause_list)):
        prompt = (
            f"Summarize the following clause in three words:\n\n"
            f"{clause_list[i]['output']}\n\n"
            "Summary (three words):"
        )
        try:
            # Use the OpenAI Completion API
            response = client.completions.create(
                model="gpt-3.5-turbo-instruct",  # Use a completion-based model
                prompt=prompt,           # Pass the prompt
                max_tokens=10,           # Limit response length
                temperature=0.7,         # Adjust creativity level
            )
            print(response)
            # Extract the generated summary from the response
            summary = response.choices[0].text.strip()
            clause_list[i]["summary"] = summary
        except Exception as e:
            print(f"Error generating summary for clause {i}: {e}")
            clause_list[i]["summary"] = "Error generating summary"
    return clause_list

In [78]:
summarized_clauses = generate_summary(clause_list)

Completion(id='cmpl-AUyO6LmfmBGkC1SykUQJn6IXk9jMO', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' Climate impact analysis')], created=1731945334, model='gpt-3.5-turbo-instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=3, prompt_tokens=2065, total_tokens=2068, completion_tokens_details=None, prompt_tokens_details=None))
Completion(id='cmpl-AUyO7L4gSEWdxDoL0Gn4Lt5ta30bF', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' Share conversion rights')], created=1731945335, model='gpt-3.5-turbo-instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=3, prompt_tokens=946, total_tokens=949, completion_tokens_details=None, prompt_tokens_details=None))
Completion(id='cmpl-AUyO8kwtk8DP5uYLDQYgqvR2qLSu1', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' Environmental sustainability measures.')], created=1731945336

In [79]:
print(json.dumps(summarized_clauses, indent=4))

[
    {
        "input": "Template_Board_Paper_for_Significant_Contracts__Tr.txt",
        "output": "1. Strategy implications\nAlignment\u00a0\n1.1 The proposal to enter into the [significant contract OR transaction] is [fully OR partially OR not] aligned to our agreed [decarbonisation OR net-zero] target*, interim emissions-reductions targets, Paris-aligned business strategy and operational plan on climate change, and in particular the delivery of near- and medium-term milestones that are a prerequisite to reaching the longer-term [decarbonisation OR net-zero] target, for the following reasons. [\nExplain and insert reasons. If the contract is aligned in one aspect but not another, please explain.\n]\n* \n\n1.2 The [significant contract OR transaction] will help us achieve our net-zero strategy by enabling us to deliver on the\u00a0interim industrial or mergers and acquisitions (M&A) goals that are consistent with our near- and medium-term emissions-reductions targets for the followi

In [90]:
def process_summarized_clauses(summarized_clauses):
    summary_words_set = set() 
    for clause in summarized_clauses:
        if "summary" in clause and clause["summary"] != "Error generating summary":
            clause["input"] = f"Please write me a legal clause that could be integrated into a contract for {clause['summary']}."
            summary_words_set.update(clause["summary"].split())
        else:
            clause["summary"] = "No summary available"
            clause["input"] = "Please write me a legal clause that could be integrated into a contract for No summary available."

        clause.pop("summary", None)
    summary_words_list = sorted(summary_words_set)

    return summarized_clauses, summary_words_list

In [91]:
updated_summarized_clauses, summary_words = process_summarized_clauses(summarized_clauses)


In [92]:
len(updated_summarized_clauses)

207

In [98]:
view_clause_json(updated_summarized_clauses, 11)

{
    "input": "Please write me a legal clause that could be integrated into a contract for Remaining greenhouse gases..",
    "output": "2\n\nResidual Emissions \nmeans [a party\u2019s] \nGHG Emissions\n [from all operations including its value chain] that are emitted after all reasonable efforts have been made to reduce them.\n\nResidual Emissions\u00a0\nare any\u00a0\nGHG Emissions\n\u00a0which remain after a project or organisation has implemented all technically and economically feasible opportunities, as determined by a\u00a0\nClimate Professional\u00a0\nas part of a\u00a0\nCarbon Footprint\u00a0\nassessment, to reduce emissions in all scopes and from all sources."
}


In [132]:
def clean_prompts(clauses): 
    fine_tuning_data_list = []
    for clause in clauses:
        # Clean prompt (formerly 'input')
        prompt = clause["input"]
        prompt = re.sub(r'\\n|\\u00a0|\n|\u00a0', ' ', prompt)  
        prompt = prompt.replace('\u2019', "'")
        prompt = re.sub(r'\[\u25cf\]', '', prompt)
        prompt = re.sub(r'\s+', ' ', prompt).strip()  
        prompt = prompt.rstrip('.') + '.'

        # Clean completion
        completion = clause["output"]
        completion = re.sub(r'\\n|\\u00a0|\n|\u00a0', ' ', completion)  
        completion = completion.replace('\u2019', "'")
        completion = re.sub(r'\[\u25cf\]', '', completion)
        completion = re.sub(r'\s+', ' ', completion).strip() 
        
        if not completion.endswith(" [END]"):
            completion += " [END]"
            
        # Prepare the fine-tuning JSON object
        fine_tuning_data = {
            "prompt": prompt,
            "completion": " " + completion  
        }
        fine_tuning_data_list.append(fine_tuning_data)
    
    
    return fine_tuning_data_list


In [133]:
final_clauses = clean_prompts(updated_summarized_clauses)

In [134]:
len(final_clauses)

207

In [135]:
view_clause_json(final_clauses, 11)

{
    "prompt": "Please write me a legal clause that could be integrated into a contract for Remaining greenhouse gases.",
    "completion": " 2 Residual Emissions means [a party's] GHG Emissions [from all operations including its value chain] that are emitted after all reasonable efforts have been made to reduce them. Residual Emissions are any GHG Emissions which remain after a project or organisation has implemented all technically and economically feasible opportunities, as determined by a Climate Professional as part of a Carbon Footprint assessment, to reduce emissions in all scopes and from all sources. [END]"
}


In [136]:
#this can be used to make new unique pairs of three words 
print("Unique words in summaries:", summary_words)
len(summary_words)

Unique words in summaries: ['"Offsetting', 'Access', 'Achieve', 'Achieved', 'Achieving', 'Adoption', 'Agreement', 'Air', 'Alignment', 'Alternative', 'Appointment,', 'Approval', 'Assessment', 'Assurance', 'Atmospheric', 'Balanced', 'Benchmark', 'Biodiversity', 'Budget', 'CCS/CCUS', 'Calculation.', 'Carbon', 'Certified', 'Change', 'Circular', 'Clean,', 'Climate', 'Climate,', 'Climate-aligned', 'Climate-conscious', 'Climate-friendly', 'Collaboration,', 'Collaborative', 'Commercial', 'Commit,', "Company's", 'Completion', 'Compliance', 'Conditions', 'Conditions,', 'Construction', 'Contractual', 'Conversion', 'Covenant', 'Credit', 'Customer', 'Data', 'Definition', 'Development', 'Disclosure', 'Dwellings', 'ESG', 'ESG-inclusive', 'Eco-friendly', 'Effects', 'Efficient,', 'Electronic', 'Eligibility,', 'Emission', 'Emissions', 'Energy', 'Environmental', 'Events', 'Excess', 'Food', 'Footprint', 'Future-focused', 'GHG', 'Garden', 'Gas', 'Gases', 'Governance,', 'Grant,', 'Green', 'Green,', 'Greenho

391

In [139]:
output_file = "fine_tuning.jsonl"
with open(output_file, "w", encoding="utf-8") as f:
    for clause in final_clauses:
        json.dump(clause, f, ensure_ascii=False)  
        f.write("\n") 

In [None]:
def convert_to_chat_format(input_file, output_file):
    chat_data = []
    with open(input_file, "r", encoding="utf-8") as infile:
        for line in infile:
            entry = json.loads(line)
            chat_entry = {
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant that writes legal clauses."},
                    {"role": "user", "content": entry["prompt"].strip()},
                    {"role": "assistant", "content": entry["completion"].strip()}
                ]
            }
            chat_data.append(chat_entry)
    with open(output_file, "w", encoding="utf-8") as outfile:
        for chat_entry in chat_data:
            json.dump(chat_entry, outfile, ensure_ascii=False)
            outfile.write("\n")

    print(f"Converted data saved to {output_file}")

In [194]:
input_file = "fine_tuning.jsonl" 
output_file = "chat_fine_tuning.jsonl"  
convert_to_chat_format(input_file, output_file)

Converted data saved to chat_fine_tuning.jsonl


## Fine tuning

In [195]:
with open("chat_fine_tuning.jsonl", "rb") as file:
    response = client.files.create(file=file, purpose="fine-tune")
    file_id = response.id
    print(f"Uploaded file ID: {file_id}")

Uploaded file ID: file-VQ6yxGheSrTk5o9Le6RwJtIi


In [196]:
response = client.fine_tuning.jobs.create(
    training_file=file_id,
    model="gpt-3.5-turbo",
      hyperparameters={
    "n_epochs":4
  }
)
fine_tune_job_id = response.id
print(f"Fine-tuning job created. ID: {fine_tune_job_id}")

Fine-tuning job created. ID: ftjob-ewzRvC7cN4lfgv06G2Pd0YMq


In [200]:
events = client.fine_tuning.jobs.list_events(fine_tune_job_id)
for event in events:
    print(event)

FineTuningJobEvent(id='ftevent-oQ5flB6iUicLdOP9SkqkgguR', created_at=1731948589, level='info', message='Fine-tuning job started', object='fine_tuning.job.event', data=None, type='message')
FineTuningJobEvent(id='ftevent-03Ml9TS5yXWc2s0k4bXKvkpZ', created_at=1731948587, level='info', message='Files validated, moving job to queued state', object='fine_tuning.job.event', data={}, type='message')
FineTuningJobEvent(id='ftevent-6MlKBA2t5Jm9goDzlPmqdqaQ', created_at=1731948554, level='info', message='Validating training file: file-VQ6yxGheSrTk5o9Le6RwJtIi', object='fine_tuning.job.event', data={}, type='message')
FineTuningJobEvent(id='ftevent-09XahdGovzUtq7MXumVZpqpE', created_at=1731948554, level='info', message='Created fine-tuning job: ftjob-ewzRvC7cN4lfgv06G2Pd0YMq', object='fine_tuning.job.event', data={}, type='message')


In [202]:
import time

while True:
    response = client.fine_tuning.jobs.retrieve(fine_tune_job_id)
    print(f"Fine-tuning status: {response.status}")

    # Exit the loop if the job is complete
    if response.status in ['succeeded', 'failed']:
        break

    time.sleep(30) 

Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: running
Fine-tuning status: 

In [204]:
response = client.fine_tuning.jobs.retrieve("ftjob-ewzRvC7cN4lfgv06G2Pd0YMq")
fine_tuned_model = response.fine_tuned_model

print(f"Fine-tuned model name: {fine_tuned_model}")

Fine-tuned model name: ft:gpt-3.5-turbo-0125:personal::AUzZxWPY


In [269]:
def create_a_clause(client, fine_tuned_model, prompt):
    response = client.chat.completions.create(
        model=fine_tuned_model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that generates legal clauses. Make sure you are combining the keywords, not just writing separate components for each. Pay attention to starting a clause in a realistic way."},
            {"role": "system", "content": "Every clause MUST be related to sustainability."},
            {"role": "user", "content": f"Please write me a legal clause that could be integrated into a contract for key words: {prompt}."}
        ],
        max_tokens=1000,
        temperature=1
    )
    return response

In [262]:
clause_test = create_a_clause(client, fine_tuned_model, "confidentiality, offsetting, access")

In [218]:
clause_test.choices[0].message.content.strip()

'(A) The Company recognises the value of the [Employee] accessing the Confidential Information to [undertake a Net Zero Activity][complete the Net Zero Project]. (B) The [Employee] agrees to keep the Confidential Information confidential and not use or disclose it for [their OR his OR her OR its] own use. (C) The Company agrees that the [Employee] may use the Confidential Information solely to [undertake a Net Zero Activity][complete the Net Zero Project] in connection with [their OR his OR her OR its] employment with the Company. (D) The Company and the [Employee] agree that the [Employee] may use the Confidential Information for the purposes of offsetting any Greenhouse Gas Emissions of the Company. [END]'

In [219]:
#with 391 words, how many permutations of 3 words can be made?
import math
math.comb(391, 3)

9886435

In [250]:
#I want to make 1800 new clauses, so I need to generate 1800 permutations
def create_permutations(summary_words, num_permutations):
    permutations_list = []
    while len(permutations_list) < 1800:
        random.shuffle(summary_words)
        #select 3 words from the list of unique words
        to_add = []
        while to_add not in permutations_list:
            to_add = summary_words[:3]
            permutations_list.append(to_add)
    
    return permutations_list

In [251]:
permutations = create_permutations(summary_words, 1800)

In [252]:
len(permutations)

1800

In [258]:
permutations[4]

['Climate,', 'Water', 'Access']

In [270]:
new_clauses = []

In [277]:
#create 1800 new clauses 
count = 1
for i in range(1660, len(permutations)):
    response = create_a_clause(client, fine_tuned_model, ", ".join(permutations[i]))
    print(count, permutations[i])
    count += 1
    new_clauses.append(response.choices[0].message.content.strip())

1 ['Assurance', 'fuel', 'reporting,']
2 ['rights', 'Benchmark', 'Partnership']
3 ['Assessment', 'goals,', 'Definition']
4 ['storage', 'purchase.', 'goals']
5 ['Modifications,', 'Remedies', 'development']
6 ['Modifications,', 'assets.', 'Emissions']
7 ['Gas', 'carbon', 'summary']
8 ['Assurance', 'measures.', 'target.']
9 ['Power', 'tree', 'footprint']
10 ['progress.', 'Sources', 'guide']
11 ['Definition', 'Footprint', 'Reductions,']
12 ['Excess', '"Offsetting', 'Renewable']
13 ['Agreement', 'Rights,', 'Strategic']
14 ['Access', 'land', 'Efficient,']
15 ['Green', 'management,', 'climate']
16 ['rights', 'process.', 'appointment']
17 ['Achieving', 'responsibilities', 'Investments']
18 ['Excess', 'consultant.', 'Payment']
19 ['Reduction', 'compliance,', "Company's"]
20 ['Alternative', 'Collaborative', 'implementation,']
21 ['targets,', 'encompass', 'obligations.']
22 ['appointment', 'Procurement', 'Multi-event']
23 ['and', 'options.', 'clauses']
24 ['conversion', 'checklist.', 'analysis']
2

In [278]:
len(new_clauses)

1807

In [282]:
def save_new_clauses(new_clauses, folder_path):
    os.makedirs(folder_path, exist_ok=True)
    
    for i in range(len(new_clauses)):
        file_path = os.path.join(folder_path, f"new_clause_{i}.txt")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(new_clauses[i])

In [283]:
folder_path = '../tclp_content/1800_clauses'

In [284]:
save_new_clauses(new_clauses, folder_path)