In [None]:
from dotenv import load_dotenv
load_dotenv('thesis.env')

from get_completion import get_completion_ollama, get_completion_claude3, get_completion_gpt, get_completion_llama_replicate
from metrics import cleanup_result_first, cleanup_result_last, metrics_mine, metrics_mine_dict, plot_confusion_matrix
from costs import get_completion_gpt_with_cost, get_completion_claude3_with_cost, get_completion_llama_replicate_with_cost
from helper_functions import load_text_files, get_embedding, choose_in_context_examples, choose_in_context_examples_2,\
prepare_fewshot_system_content, prepare_fewshot_prompt, append_context, append_context_and_source, appending_contract_to_prompt,\
sample_in_context_examples

import os
from openai import OpenAI
import openai
import anthropic
import voyageai
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.model_selection import train_test_split
import random
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import itertools
import ast
import re
import pickle
import time
from tqdm import tqdm
import replicate

# Datasets

In [None]:
# Evaluation set. The cells below read its 'clause', 'ground_truth_label',
# 'contract_type', 'representing' and 'source' columns.
df_test = pd.read_csv('test_dataset.csv')


In [None]:
# Pool of candidate in-context examples. The evaluation code passes this frame to
# choose_in_context_examples / choose_in_context_examples_2 for the
# 'few_shot_embed_system' and 'few_shot_embed_prompt' settings.
df_ic_examples = pd.read_csv('ic_examples_dataset.csv')

In [None]:
# Complete dataset.
# NOTE(review): not referenced anywhere else in the visible part of this notebook — confirm it is still needed.
df_whole = pd.read_csv('whole_dataset.csv')

In [None]:
# Materialise the evaluation columns as plain Python lists so that the
# experiment loop can index them by row position.
(
    test_clauses,
    test_risks,
    test_contract_types,
    test_representing,
    test_sources,
) = (
    df_test[column_name].tolist()
    for column_name in (
        'clause',
        'ground_truth_label',
        'contract_type',
        'representing',
        'source',
    )
)

In [None]:
# Number of clauses in the test set.
# Note: result_dataframe does not read this global; it uses its own sample count.
N = len(test_clauses)

# Testing

In [None]:
# System message for the 'step_by_step' (chain-of-thought) setting.
# It deliberately contains no "answer with only ..." instruction, because the
# model is expected to produce its reasoning before the final label.
system_content_step_by_step = (
    "You are a lawyer. Can you assess the risk of the following contract clause? \n"
    " The options are: potential issue or red flag. "
    "Potential issue means there's potentially a legal risk and red flag signifies high legal risk."
)

In [None]:
def _model_backend(model):
    """Return the backend family ('gpt', 'claude', 'replicate' or 'ollama') that serves ``model``.

    Raises:
        ValueError: if ``model`` is not in any of the supported lists.
    """
    if model in ['gpt-4', 'gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo', 'gpt-3.5-turbo-instruct']:
        return 'gpt'
    if model in ['claude-3-opus-20240229', 'claude-3-sonnet-20240229', 'claude-3-haiku-20240307', 'claude-3-5-sonnet-20240620']:
        return 'claude'
    if model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
        return 'replicate'
    # Local models (no cost tracking). The two *_1_epoch tags are run by the
    # zero-shot cell of this notebook and therefore have to be accepted here.
    if model in ['unsloth_model:latest', 'unsloth_model_3_epochs:latest', 'unsloth_model_1_epoch:latest',
                 'unsloth_model_1_epoch_B:latest', 'llama3:8b']:
        return 'ollama'
    raise ValueError("Unsupported model")


def _first_pass_completion(backend, prompt, system_content, model):
    """Run the reasoning-only call of the 'step_by_step' setting on the matching backend."""
    if backend == 'gpt':
        return get_completion_gpt(prompt=prompt, system_content=system_content, model=model)
    if backend == 'claude':
        return get_completion_claude3(prompt=prompt, system_content=system_content, model=model)
    if backend == 'replicate':
        return get_completion_llama_replicate(prompt=prompt, system_message=system_content, model=model)
    # Local models must go through the Ollama client, not through Replicate.
    return get_completion_ollama(prompt, system_content, model)


def _complete_with_usage(backend, prompt, system_content, model):
    """Run the final completion and return ``(model_output, usage_columns)``.

    ``usage_columns`` holds the token counts and cost reported by the
    ``*_with_cost`` helpers; it is empty for the 'ollama' backend, which
    reports no cost.
    """
    if backend == 'ollama':
        return get_completion_ollama(prompt, system_content, model), {}

    if backend == 'gpt':
        completion = get_completion_gpt_with_cost(prompt, system_content, model)
    elif backend == 'claude':
        completion = get_completion_claude3_with_cost(prompt, system_content, model)
    else:
        completion = get_completion_llama_replicate_with_cost(prompt, system_content, model)

    model_output, input_tokens, output_tokens, total_cost = completion
    usage_columns = {
        'number of input tokens': input_tokens,
        'number of output tokens': output_tokens,
        'cost': total_cost,
    }
    return model_output, usage_columns


def _select_embedded_examples(clause, which_ic_embed_type, n_fewshot_embed):
    """Pick in-context examples for ``clause`` from ``df_ic_examples``.

    Raises:
        ValueError: if ``which_ic_embed_type`` is neither 'per_class' nor 'not_per_class'.
    """
    if which_ic_embed_type == 'not_per_class':
        return choose_in_context_examples(clause, df_ic_examples, n_fewshot_embed)
    if which_ic_embed_type == 'per_class':
        return choose_in_context_examples_2(clause, df_ic_examples, n_fewshot_embed)
    raise ValueError('Choose a valid way of picking embedding ic examples: "per_class" or "not_per_class"')


def _build_messages(setting, i, backend, model, init_system_content, init_prompt, which_ic_embed_type, n_fewshot_embed):
    """Build the ``(system_content, prompt)`` pair for test row ``i`` under ``setting``.

    Raises:
        ValueError: if ``setting`` is not one of the supported settings.
    """
    clause = test_clauses[i]
    contract_type = test_contract_types[i]
    representing = test_representing[i]

    info = f"Information: The type of this contract is {contract_type}. Representing side is the {representing}. The governing law is England and Wales. "
    # Two historical spellings of the query are in use; the whitespace differs
    # and is kept exactly as it was so that prompts stay comparable across runs.
    query_spaced = f"{info}\n Clause: {clause} \n Answer:"
    query = f"{info}\nClause: {clause} \nAnswer:"

    if setting == 'normal':
        return append_context(init_system_content, clause, contract_type, representing), clause

    if setting == 'few_shot_system':  # in-context examples already in the system message
        return init_system_content, query_spaced

    if setting == 'few_shot_prompt':  # in-context examples in the prompt (init_prompt)
        return init_system_content, f"{init_prompt}\n{query} "

    if setting == 'step_by_step':
        # First call elicits the reasoning, which is then fed back so that the
        # second (recorded) call only has to state the final label.
        cot_prompt = f"Clause: {clause}  \nAnswer: Let's think step by step."
        cot_step_reasoning = _first_pass_completion(backend, cot_prompt, init_system_content, model)
        final_prompt = f"{cot_prompt} {cot_step_reasoning} \nTherefore, the answer (only one of potential issue or red flag) is"
        return init_system_content, final_prompt

    if setting == 'annollm':  # examples in the prompt, prepared in advance as init_prompt
        return init_system_content, f"{init_prompt} \n{query}"

    if setting == 'annollm_system':  # examples in the system message, prepared in advance
        return init_system_content, query

    if setting == 'append_contract':  # contract source appended to the system message
        system_content = append_context_and_source(init_system_content, contract_type, representing, test_sources[i])
        return system_content, clause

    if setting == 'append_contract_in_prompt':
        system_content = append_context(init_system_content, clause, contract_type, representing)
        return system_content, appending_contract_to_prompt(clause, test_sources[i])

    if setting == 'few_shot_embed_system':  # selected examples go into the system message
        relevant_clauses, relevant_risks, relevant_representing, relevant_contract_type = _select_embedded_examples(
            clause, which_ic_embed_type, n_fewshot_embed)
        system_content = prepare_fewshot_system_content(
            init_system_content, relevant_clauses, relevant_risks, relevant_representing, relevant_contract_type)
        return system_content, query_spaced

    if setting == 'few_shot_embed_prompt':  # selected examples go into the prompt
        relevant_clauses, relevant_risks, relevant_representing, relevant_contract_type = _select_embedded_examples(
            clause, which_ic_embed_type, n_fewshot_embed)
        examples_prompt = prepare_fewshot_prompt(
            relevant_clauses, relevant_risks, relevant_representing, relevant_contract_type)
        return init_system_content, examples_prompt + query

    raise ValueError('Unsupported setting')


def result_dataframe(experiment_name, init_system_content, model, setting, cleanining_method = 'basic', save_dir = 'results', init_prompt = None, which_ic_embed_type = None, n_fewshot_embed = None, n_samples = 200):
    """
    Run one experiment over the test set and return a dataframe with one row per clause.

    The test data is read from the notebook-level lists ``test_clauses``,
    ``test_risks``, ``test_contract_types``, ``test_representing`` and
    ``test_sources``. The resulting dataframe is also pickled into ``save_dir``.

    Parameters
    ----------
    experiment_name : str
        Label stored in every row and used in the pickle file name.
    init_system_content : str
        Base system message; some settings extend it per clause.
    model : str
        Model identifier; it selects the GPT, Claude, Replicate or Ollama client.
    setting : str
        One of 'normal', 'few_shot_system', 'few_shot_prompt', 'step_by_step',
        'annollm', 'annollm_system', 'append_contract',
        'append_contract_in_prompt', 'few_shot_embed_system',
        'few_shot_embed_prompt'.
    cleanining_method : str
        'basic' (use the raw output), 'extract_last' or 'extract_first'.
        (The misspelt name is kept for backward compatibility.)
    save_dir : str
        Directory for the pickle; created if missing.
    init_prompt : str, optional
        Pre-built examples block for the 'few_shot_prompt' and 'annollm' settings.
    which_ic_embed_type : str, optional
        'per_class' or 'not_per_class'; required by the 'few_shot_embed_*' settings.
    n_fewshot_embed : int, optional
        Number of examples requested from the example selector.
    n_samples : int
        Maximum number of test clauses to evaluate (default 200, the previous
        fixed value). It is clamped to the size of the test set.

    Returns
    -------
    pandas.DataFrame
        Per-clause inputs, outputs and (for paid backends) token/cost columns,
        plus experiment-wide accuracy, precision, recall, F1 and runtime
        repeated on every row.

    Raises
    ------
    ValueError
        For an unsupported model, setting, cleaning method or example-selection
        type, or when there is no test clause to evaluate.
    """

    start_time = time.time()

    backend = _model_backend(model)

    # Validate before the loop so that a typo does not cost an API call first.
    if cleanining_method not in ('basic', 'extract_last', 'extract_first'):
        raise ValueError("Unsupported cleaning method")

    # Never index past the end of the test set.
    n_rows = max(0, min(n_samples, len(test_clauses)))
    if n_rows == 0:
        raise ValueError("No test clauses to evaluate")

    results = []  # one dictionary per test data point
    list_for_metrics = []  # cleaned predictions, used for the performance metrics

    for i in tqdm(range(n_rows)):
        system_content, prompt = _build_messages(
            setting, i, backend, model, init_system_content, init_prompt, which_ic_embed_type, n_fewshot_embed)

        model_output, usage_columns = _complete_with_usage(backend, prompt, system_content, model)

        if cleanining_method == 'basic':
            cleaned_prediction = model_output
        elif cleanining_method == 'extract_last':
            cleaned_prediction = cleanup_result_last(model_output)
        else:
            cleaned_prediction = cleanup_result_first(model_output)

        list_for_metrics.append(cleaned_prediction)

        result = {
            'experiment_name': experiment_name,
            'model': model,
            'system_content': system_content,
            'prompt': prompt,
            'clause': test_clauses[i],
            'contract_type': test_contract_types[i],
            'representing': test_representing[i],
            'source': test_sources[i],
            'ground_truth_label': test_risks[i],
            'model_output': model_output,
            'cleaned_prediction': cleaned_prediction,
        }
        result.update(usage_columns)  # empty for local models
        results.append(result)

    results_df = pd.DataFrame(results)

    scores = metrics_mine_dict(test_risks[:n_rows], list_for_metrics)
    results_df['accuracy'] = scores['Accuracy']
    results_df['precision'] = scores['Precision']
    results_df['recall'] = scores['Recall']
    results_df['F1'] = scores['F1']

    if backend != 'ollama':
        results_df['total cost of this experiment'] = results_df['cost'].sum()

    elapsed_time = time.time() - start_time
    results_df['experiment ran for (seconds)'] = elapsed_time

    # Replicate identifiers contain '/', which would be read as a directory
    # separator in the file name, so they are saved under an alias.
    file_model_names = {
        'meta/meta-llama-3-8b-instruct': 'meta:llama3:8b',
        'meta/meta-llama-3-70b-instruct': 'meta:llama3:70b',
    }
    model_name = file_model_names.get(model, model)

    os.makedirs(save_dir, exist_ok=True)
    pickle_path = os.path.join(save_dir, f'{experiment_name} {model_name}.pkl')
    with open(pickle_path, 'wb') as f:
        pickle.dump(results_df, f)

    return results_df



## Basic Zero-Shot Prompting

In [None]:
# NOTE(review): the calls in the following cells pass the literal
# 'Basic zero-shot prompting' (no trailing period) as their first argument and do
# not reference this variable -- confirm whether it is still needed.
experiment_name = 'Basic zero-shot prompting.'
# System message for the plain zero-shot runs: the model must answer with the bare label only.
init_system_content_zeroshot_noinfo = 'You are a lawyer. Assess the risk of the following contract clause. Answer with one of these two options and nothing else: potential issue or red flag. Potential issue means there\'s potentially a legal risk and red flag signifies high legal risk.'

In [None]:
for model in ['gpt-4','gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo']:
    result_dataframe('Basic zero-shot prompting',init_system_content_zeroshot_noinfo, model, 'normal','basic','results')  

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('Basic zero-shot prompting',init_system_content_zeroshot_noinfo, model, 'normal','basic','results')

In [None]:
model = 'meta/meta-llama-3-8b-instruct'
result_dataframe('Basic zero-shot prompting',init_system_content_zeroshot_noinfo, model, 'normal','basic','results')

In [None]:
model = 'meta/meta-llama-3-70b-instruct'
result_dataframe('Basic zero-shot prompting',init_system_content_zeroshot_noinfo, model, 'normal','basic','results')

In [None]:
for model in ["unsloth_model_3_epochs:latest" ,'unsloth_model_1_epoch:latest','unsloth_model_1_epoch_B:latest']:
    result_dataframe('Basic zero-shot prompting',init_system_content_zeroshot_noinfo, model, 'normal','basic','results')

## Zero-shot CoT with legal reasoning template B

In [None]:
init_system_content_legal_template = 'You are a lawyer. Assess the risk of the following contract clause. \
There are two risk types: potential issue and red flag. \
Potential issue means there\'s potentially a legal risk and red flag signifies high legal risk. \
Use this legal reasoning approach: topic, rule, explanation, analysis, counterarguments, conclusion. \
Don\'t use more than 6 sentences and only give the risk type as conclusion.\
'
# With this template the risk type is given last, so the results must be cleaned with the 'extract_last' method (cleanup_result_last).

In [None]:
for model in ['gpt-4','gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo']:
    result_dataframe('zero-shot cot with legal template',init_system_content_legal_template, model, 'normal','extract_last')

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('zero-shot cot with legal template',init_system_content_legal_template, model, 'normal','extract_last')

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('zero-shot cot with legal template',init_system_content_legal_template, model, 'normal','extract_last')

In [None]:
for model in ["unsloth_model_3_epochs:latest" ,'unsloth_model_1_epoch:latest','unsloth_model_1_epoch_B:latest']:
    result_dataframe('zero-shot cot with legal template',init_system_content_legal_template, model, 'normal','extract_last')

## Few-shot Prompting
examples in the system message

In [None]:
# Hand-picked in-context (few-shot) examples, identified by their row label in
# df_ic_examples: three "potential issue" (pi) rows and three "red flag" (rf) rows.
PI_EXAMPLE_ROWS = (1417, 2874, 2586)
RF_EXAMPLE_ROWS = (3595, 333, 121)


def _ic_example_fields(row_label):
    """Return (clause, representing, contract_type, ground_truth_label) for one df_ic_examples row."""
    return tuple(
        df_ic_examples.at[row_label, column]
        for column in ('clause', 'representing', 'contract_type', 'ground_truth_label')
    )


_pi_examples = [_ic_example_fields(row_label) for row_label in PI_EXAMPLE_ROWS]
_rf_examples = [_ic_example_fields(row_label) for row_label in RF_EXAMPLE_ROWS]

# Only one label per class is kept (ic_risk_pi / ic_risk_rf) and it is reused for
# every example of that class, so all rows of a group must share the same label.
_pi_labels = {example[3] for example in _pi_examples}
_rf_labels = {example[3] for example in _rf_examples}
assert len(_pi_labels) == 1, f'PI_EXAMPLE_ROWS mix several labels: {_pi_labels}'
assert len(_rf_labels) == 1, f'RF_EXAMPLE_ROWS mix several labels: {_rf_labels}'
assert _pi_labels.isdisjoint(_rf_labels), 'pi and rf example rows carry the same label'

ic_clause_pi_1, ic_representing_pi_1, ic_contract_type_pi_1, ic_risk_pi = _pi_examples[0]
ic_clause_pi_2, ic_representing_pi_2, ic_contract_type_pi_2 = _pi_examples[1][:3]
ic_clause_pi_3, ic_representing_pi_3, ic_contract_type_pi_3 = _pi_examples[2][:3]

ic_clause_rf_1, ic_representing_rf_1, ic_contract_type_rf_1, ic_risk_rf = _rf_examples[0]
ic_clause_rf_2, ic_representing_rf_2, ic_contract_type_rf_2 = _rf_examples[1][:3]
ic_clause_rf_3, ic_representing_rf_3, ic_contract_type_rf_3 = _rf_examples[2][:3]

In [None]:
# Few-shot example sets with k = 1, 2 and 3 examples per class. Inside every set
# the "potential issue" examples come first, followed by the "red flag" examples.
_pi_clauses = [ic_clause_pi_1, ic_clause_pi_2, ic_clause_pi_3]
_rf_clauses = [ic_clause_rf_1, ic_clause_rf_2, ic_clause_rf_3]
_pi_representing = [ic_representing_pi_1, ic_representing_pi_2, ic_representing_pi_3]
_rf_representing = [ic_representing_rf_1, ic_representing_rf_2, ic_representing_rf_3]
_pi_contract_types = [ic_contract_type_pi_1, ic_contract_type_pi_2, ic_contract_type_pi_3]
_rf_contract_types = [ic_contract_type_rf_1, ic_contract_type_rf_2, ic_contract_type_rf_3]


def _first_k_per_class(pi_items, rf_items, k):
    """Return a new list holding the first k pi items followed by the first k rf items."""
    return pi_items[:k] + rf_items[:k]


# 1 example per class
ic_clauses_1 = _first_k_per_class(_pi_clauses, _rf_clauses, 1)
ic_risks_1 = [ic_risk_pi] * 1 + [ic_risk_rf] * 1
ic_representing_1 = _first_k_per_class(_pi_representing, _rf_representing, 1)
ic_contract_type_1 = _first_k_per_class(_pi_contract_types, _rf_contract_types, 1)

# 2 examples per class
ic_clauses_2 = _first_k_per_class(_pi_clauses, _rf_clauses, 2)
ic_risks_2 = [ic_risk_pi] * 2 + [ic_risk_rf] * 2
ic_representing_2 = _first_k_per_class(_pi_representing, _rf_representing, 2)
ic_contract_type_2 = _first_k_per_class(_pi_contract_types, _rf_contract_types, 2)

# 3 examples per class
ic_clauses_3 = _first_k_per_class(_pi_clauses, _rf_clauses, 3)
ic_risks_3 = [ic_risk_pi] * 3 + [ic_risk_rf] * 3
ic_representing_3 = _first_k_per_class(_pi_representing, _rf_representing, 3)
ic_contract_type_3 = _first_k_per_class(_pi_contract_types, _rf_contract_types, 3)

In [None]:
#### Putting the in-context examples in the system message ######

In [None]:
# Base system message; prepare_fewshot_system_content combines it with the in-context examples.
init_system_content_fewshot = f'You are a lawyer. Assess the risk of the following contract clause. \nAnswer with one of these two options and nothing else: potential issue or red flag. Potential issue means there\'s potentially a legal risk and red flag signifies high legal risk.'

system_content_fewshot_1 = prepare_fewshot_system_content(init_system_content_fewshot, ic_clauses_1, ic_risks_1, ic_representing_1, ic_contract_type_1) # 1 example per class (2 in-context examples in total)
system_content_fewshot_2 = prepare_fewshot_system_content(init_system_content_fewshot, ic_clauses_2, ic_risks_2, ic_representing_2, ic_contract_type_2) # 2 examples per class (4 in-context examples in total)
system_content_fewshot_3 = prepare_fewshot_system_content(init_system_content_fewshot, ic_clauses_3, ic_risks_3, ic_representing_3, ic_contract_type_3) # 3 examples per class (6 in-context examples in total)

In [None]:
for model in ['gpt-4','gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo']:
    result_dataframe('few-shot with 1 example per class',system_content_fewshot_1, model, 'few_shot_system','extract_last')    

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('few-shot with 1 example per class',system_content_fewshot_1, model, 'few_shot_system','extract_last')

In [None]:
for model in ['gpt-4','gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo']:
    result_dataframe('few-shot with 2 examples per class',system_content_fewshot_2, model, 'few_shot_system','extract_last')    

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('few-shot with 2 examples per class',system_content_fewshot_2, model, 'few_shot_system','extract_last')

In [None]:
for model in ['gpt-4','gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo']:
    result_dataframe('few-shot with 3 examples per class',system_content_fewshot_3, model, 'few_shot_system','extract_last')    

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('few-shot with 3 examples per class',system_content_fewshot_3, model, 'few_shot_system','extract_last')

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('few-shot with 1 example per class',system_content_fewshot_1, model, 'few_shot_system','extract_last')   

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('few-shot with 2 examples per class',system_content_fewshot_2, model, 'few_shot_system','extract_last')  

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('few-shot with 3 examples per class',system_content_fewshot_3, model, 'few_shot_system','extract_last') 

In [None]:
for model in ["unsloth_model_3_epochs:latest" ,'unsloth_model_1_epoch:latest','unsloth_model_1_epoch_B:latest']:
    result_dataframe('few-shot with 1 example per class',system_content_fewshot_1, model, 'few_shot_system','extract_last') 
    result_dataframe('few-shot with 2 examples per class',system_content_fewshot_2, model, 'few_shot_system','extract_last')    
    result_dataframe('few-shot with 3 examples per class',system_content_fewshot_3, model, 'few_shot_system','extract_last') 

## Few-Shot Prompting
examples in the user message

In [None]:
init_system_content_fewshot = f'You are a lawyer. Assess the risk of the following contract clause. \nAnswer with one of these two options and nothing else: potential issue or red flag. Potential issue means there\'s potentially a legal risk and red flag signifies high legal risk.'


In [None]:
prompt_fewshot_1 = prepare_fewshot_prompt(ic_clauses_1, ic_risks_1, ic_representing_1, ic_contract_type_1)
prompt_fewshot_2 = prepare_fewshot_prompt(ic_clauses_2, ic_risks_2, ic_representing_2, ic_contract_type_2)
prompt_fewshot_3 = prepare_fewshot_prompt(ic_clauses_3, ic_risks_3, ic_representing_3, ic_contract_type_3)

In [None]:
for model in ['gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo','gpt-4']:
    result_dataframe('few-shot with 1 example per class, examples in prompt',init_system_content_fewshot,model,setting = 'few_shot_prompt',init_prompt=prompt_fewshot_1)    

In [None]:
for model in ['gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo','gpt-4']:
    result_dataframe('few-shot with 2 examples per class, examples in prompt',init_system_content_fewshot,model,setting = 'few_shot_prompt',init_prompt=prompt_fewshot_2)

In [None]:
for model in ['gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo', 'gpt-4']:
    result_dataframe('few-shot with 3 examples per class, examples in prompt',init_system_content_fewshot,model,setting = 'few_shot_prompt',init_prompt=prompt_fewshot_3)

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('few-shot with 1 example per class, examples in prompt',init_system_content_fewshot,model,setting = 'few_shot_prompt',init_prompt=prompt_fewshot_1)    

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('few-shot with 2 examples per class, examples in prompt',init_system_content_fewshot,model,setting = 'few_shot_prompt',init_prompt=prompt_fewshot_2)

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('few-shot with 3 examples per class, examples in prompt',init_system_content_fewshot,model,setting = 'few_shot_prompt',init_prompt=prompt_fewshot_3)    

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('few-shot with 1 example per class, examples in prompt',init_system_content_fewshot,model,setting = 'few_shot_prompt',init_prompt=prompt_fewshot_1)  

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('few-shot with 2 examples per class, examples in prompt',init_system_content_fewshot,model,setting = 'few_shot_prompt',init_prompt=prompt_fewshot_2)

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('few-shot with 3 examples per class, examples in prompt',init_system_content_fewshot,model,setting = 'few_shot_prompt',init_prompt=prompt_fewshot_3)

In [None]:
for model in ["unsloth_model_3_epochs:latest" ,'unsloth_model_1_epoch:latest','unsloth_model_1_epoch_B:latest']:
    result_dataframe('few-shot with 1 example per class, examples in prompt',init_system_content_fewshot,model,setting = 'few_shot_prompt',init_prompt=prompt_fewshot_1) 
    result_dataframe('few-shot with 2 examples per class, examples in prompt',init_system_content_fewshot,model,setting = 'few_shot_prompt',init_prompt=prompt_fewshot_2)
    result_dataframe('few-shot with 3 examples per class, examples in prompt',init_system_content_fewshot,model,setting = 'few_shot_prompt',init_prompt=prompt_fewshot_3)

## Zero-shot CoT with legal reasoning template A

In [None]:
# System message for "legal reasoning template A": asks for a Clause / Thought /
# Result / Justification layout in the answer.
# NOTE(review): "asses" below is a typo for "assess". It is part of the prompt sent
# to the models, so correcting it changes the experiment input and requires a re-run.
# NOTE(review): the "{}" after "Clause:" is passed on unformatted by the calls in
# the following cells -- confirm whether result_dataframe fills it in or whether
# the models receive the literal braces.
init_system_content_alex_template = """\
You are a lawyer. Your task is to asses the risk of the given contract clause.
Follow this format:
Clause: {}. 
Thought: Your thought process for assessing the clause. 
Result: 'potential issue' if the clause has potential legal risk or 'red flag' if there is high legal risk. 
Justification: An explanation for the result you've reached.
"""

In [None]:
for model in ['gpt-4','gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo']:
    result_dataframe('zero-shot cot with alex template',init_system_content_alex_template,model, 'normal','extract_last')

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('zero-shot cot with alex template',init_system_content_alex_template,model, 'normal','extract_last')

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('zero-shot cot with alex template',init_system_content_alex_template,model, 'normal','extract_last')

In [None]:
for model in ["unsloth_model_3_epochs:latest" ,'unsloth_model_1_epoch:latest','unsloth_model_1_epoch_B:latest']:
    result_dataframe('zero-shot cot with alex template',init_system_content_alex_template,model, 'normal','extract_last')

## Zero-Shot CoT, let's think step by step 

In this setting each model is prompted twice: first with "Let's think step by step" to generate a chain of thought (CoT),
and then with that CoT appended to the input, followed by 'Therefore, the answer (potential issue or red flag) is'.



In [None]:
system_content_step_by_step = f'You are a lawyer. Assess the risk of the following contract clause. \n The options are: potential issue or red flag. Potential issue means there\'s potentially a legal risk and red flag signifies high legal risk.'
# This system message deliberately omits an "answer with one of these options only" instruction, because the model has to produce its chain of thought (CoT) first.

In [None]:
for model in ['gpt-4','gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo']:
    result_dataframe('lets think step by step',system_content_step_by_step,model,'step_by_step','extract_first')
    

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('lets think step by step',system_content_step_by_step,model,'step_by_step','extract_first')

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('lets think step by step',system_content_step_by_step,model,'step_by_step','extract_first')

In [None]:
for model in ["unsloth_model_3_epochs:latest" ,'unsloth_model_1_epoch:latest','unsloth_model_1_epoch_B:latest']:
    result_dataframe('lets think step by step',system_content_step_by_step,model,'step_by_step','extract_first')

## AnnoLLM
explanations generated with gpt4o 


In [None]:
system_content_annollm = 'You are a lawyer. Your expertise is to classify the legal risk of contractual clauses. There are two possible classifications: "potential issue" or "red flag". Potential issue means there\'s potentially a legal risk and red flag signifies high legal risk.'

In [None]:
def generate_llm_explanations(ic_clauses, ic_risks, ic_representing, ic_contract_type, system_content = system_content_annollm, model = 'gpt-4o'):
    """Ask an LLM to justify the ground-truth label of every in-context example.

    One request is sent through get_completion_gpt per entry of ic_risks; the
    other three sequences are indexed in parallel with it.

    Args:
        ic_clauses: Clause text of each in-context example.
        ic_risks: Ground-truth classification of each example.
        ic_representing: Represented side of each example.
        ic_contract_type: Contract type of each example.
        system_content: System message sent with every request.
        model: Model name forwarded to get_completion_gpt.

    Returns:
        A list with one get_completion_gpt result per example, in input order.
    """
    def _explanation_request(idx):
        # Prompt that states the correct label and asks for a short (<= 100 words) justification.
        return f'Information: The type of this contract is {ic_contract_type[idx]}. Representing side is the {ic_representing[idx]}. The governing law is England and Wales. Given the clause: " {ic_clauses[idx]} ", explain briefly why the correct classification is " {ic_risks[idx]} ", with a response length not exceeding 100 words.'

    return [
        get_completion_gpt(prompt=_explanation_request(idx), system_content=system_content, model= model)
        for idx in range(len(ic_risks))
    ]

In [None]:
# examples in the user message

def prepare_fewshot_cot_prompt_annollm(ic_clauses, ic_risks, ic_representing, ic_contract_type, test_clause = None, explanations = None):
    """Build the AnnoLLM few-shot block that goes into the user message.

    Every in-context example is rendered as contract information, the clause, an
    explanation and the sentence "Therefore, the classification is <risk>.".
    The clause under test is not appended here.

    Args:
        ic_clauses: Clause text of each in-context example.
        ic_risks: Ground-truth classification of each example.
        ic_representing: Represented side of each example.
        ic_contract_type: Contract type of each example.
        test_clause: Unused; kept so existing callers keep working.
        explanations: Optional precomputed explanations, one per example. When
            omitted they are generated with generate_llm_explanations, which
            issues one LLM call per example. Supplying them lets callers reuse a
            single set of explanations across prompts and avoids repeated calls.

    Returns:
        The concatenated few-shot examples as one string (empty if there are none).

    Raises:
        ValueError: If explanations is given but its length differs from ic_risks.
    """
    if explanations is None:
        explanations = generate_llm_explanations(ic_clauses, ic_risks, ic_representing, ic_contract_type)
    elif len(explanations) != len(ic_risks):
        # zip() below would silently drop examples on a length mismatch.
        raise ValueError(f'Expected {len(ic_risks)} explanations, got {len(explanations)}')

    return "".join(
        f"Information: The type of this contract is {contract_type}. Representing side is the {representing}. The governing law is England and Wales. \nClause: {clause} \nAnswer: {explanation} Therefore, the classification is {risk}. \n\n"
        for clause, risk, explanation, representing, contract_type in zip(ic_clauses, ic_risks, explanations, ic_representing, ic_contract_type)
    )


In [None]:
init_prompt_annollm_1 = prepare_fewshot_cot_prompt_annollm(ic_clauses_1,ic_risks_1, ic_representing_1, ic_contract_type_1)
init_prompt_annollm_2 = prepare_fewshot_cot_prompt_annollm(ic_clauses_2,ic_risks_2, ic_representing_2, ic_contract_type_2)
init_prompt_annollm_3 = prepare_fewshot_cot_prompt_annollm(ic_clauses_3,ic_risks_3, ic_representing_3, ic_contract_type_3)

In [None]:
model = 'gpt-3.5-turbo-0125'
result_dataframe('annollm with 1 example per class, examples inside prompt',system_content_annollm,model,'annollm','extract_last',init_prompt=init_prompt_annollm_1)

In [None]:
for model in ['gpt-4', 'gpt-4o', 'gpt-4-turbo']:
    result_dataframe('annollm with 1 example per class, examples inside prompt',system_content_annollm,model,'annollm','extract_last',init_prompt=init_prompt_annollm_1)

In [None]:
for model in ['gpt-4','gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo']:
    result_dataframe('annollm with 2 examples per class, examples inside prompt',system_content_annollm,model,'annollm','extract_last',init_prompt=init_prompt_annollm_2)

In [None]:
for model in ['gpt-4','gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo']:
    result_dataframe('annollm with 3 examples per class, examples inside prompt',system_content_annollm,model,'annollm','extract_last',init_prompt=init_prompt_annollm_3)

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('annollm with 1 example per class, examples inside prompt',system_content_annollm,model,'annollm','extract_last',init_prompt=init_prompt_annollm_1)

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('annollm with 2 examples per class, examples inside prompt',system_content_annollm,model,'annollm','extract_last',init_prompt=init_prompt_annollm_2)

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('annollm with 3 examples per class, examples inside prompt',system_content_annollm,model,'annollm','extract_last',init_prompt=init_prompt_annollm_3)

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('annollm with 1 example per class, examples inside prompt',system_content_annollm,model,'annollm','extract_last',init_prompt=init_prompt_annollm_1)

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('annollm with 2 examples per class, examples inside prompt',system_content_annollm,model,'annollm','extract_last',init_prompt=init_prompt_annollm_2)

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('annollm with 3 examples per class, examples inside prompt',system_content_annollm,model,'annollm','extract_last',init_prompt=init_prompt_annollm_3)


In [None]:
for model in ["unsloth_model_3_epochs:latest" ,'unsloth_model_1_epoch:latest','unsloth_model_1_epoch_B:latest']:
    result_dataframe('annollm with 1 example per class, examples inside prompt',system_content_annollm,model,'annollm','extract_last',init_prompt=init_prompt_annollm_1)
    result_dataframe('annollm with 2 examples per class, examples inside prompt',system_content_annollm,model,'annollm','extract_last',init_prompt=init_prompt_annollm_2)
    result_dataframe('annollm with 3 examples per class, examples inside prompt',system_content_annollm,model,'annollm','extract_last',init_prompt=init_prompt_annollm_3)

In [None]:
# annollm but with examples inside the system message 

In [None]:
def prepare_fewshot_cot_system_content_annollm(init_system_message,ic_clauses, ic_risks, ic_representing, ic_contract_type, explanations = None):
    """Build an AnnoLLM system message with the few-shot examples embedded in it.

    The result is init_system_message, followed by an introductory line and one
    rendered example (contract information, clause, explanation, classification)
    per in-context example.

    Args:
        init_system_message: Base system message the examples are appended to.
        ic_clauses: Clause text of each in-context example.
        ic_risks: Ground-truth classification of each example.
        ic_representing: Represented side of each example.
        ic_contract_type: Contract type of each example.
        explanations: Optional precomputed explanations, one per example. When
            omitted they are generated with generate_llm_explanations, which
            issues one LLM call per example. Supplying them lets callers reuse a
            single set of explanations across prompts and avoids repeated calls.

    Returns:
        The extended system message as a new string.

    Raises:
        ValueError: If explanations is given but its length differs from ic_risks.
    """
    if explanations is None:
        explanations = generate_llm_explanations(ic_clauses, ic_risks, ic_representing, ic_contract_type)
    elif len(explanations) != len(ic_risks):
        # zip() below would silently drop examples on a length mismatch.
        raise ValueError(f'Expected {len(ic_risks)} explanations, got {len(explanations)}')

    rendered_examples = "".join(
        f"Information: The type of this contract is {contract_type}. Representing side is the {representing}. The governing law is England and Wales. \nClause: {clause} \nAnswer: {explanation} Therefore, the classification is {risk}. \n\n"
        for clause, risk, explanation, representing, contract_type in zip(ic_clauses, ic_risks, explanations, ic_representing, ic_contract_type)
    )
    return init_system_message + '\nHere are some examples to help you: \n ' + rendered_examples

In [None]:
system_content_annollm_1 = prepare_fewshot_cot_system_content_annollm(system_content_annollm, ic_clauses_1, ic_risks_1, ic_representing_1, ic_contract_type_1)
system_content_annollm_2 = prepare_fewshot_cot_system_content_annollm(system_content_annollm, ic_clauses_2, ic_risks_2, ic_representing_2, ic_contract_type_2)
system_content_annollm_3 = prepare_fewshot_cot_system_content_annollm(system_content_annollm, ic_clauses_3, ic_risks_3, ic_representing_3, ic_contract_type_3)


In [None]:
for model in ['gpt-4','gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo']:
    result_dataframe('annollm with 1 example per class, examples inside system message',system_content_annollm_1,model,'annollm_system','extract_last')

In [None]:
for model in ['gpt-4','gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo']:
    result_dataframe('annollm with 2 examples per class, examples inside system message',system_content_annollm_2,model,'annollm_system','extract_last')

In [None]:
for model in ['gpt-4','gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo']:
    result_dataframe('annollm with 3 examples per class, examples inside system message',system_content_annollm_3,model,'annollm_system','extract_last')

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('annollm with 1 example per class, examples inside system message',system_content_annollm_1,model,'annollm_system','extract_last')

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('annollm with 2 examples per class, examples inside system message',system_content_annollm_2,model,'annollm_system','extract_last')

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('annollm with 3 examples per class, examples inside system message',system_content_annollm_3,model,'annollm_system','extract_last')

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('annollm with 1 example per class, examples inside system message',system_content_annollm_1,model,'annollm_system','extract_last')

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('annollm with 2 examples per class, examples inside system message',system_content_annollm_2,model,'annollm_system','extract_last')

In [None]:
for model in ['meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct']:
    result_dataframe('annollm with 3 examples per class, examples inside system message',system_content_annollm_3,model,'annollm_system','extract_last')

In [None]:
for model in ["unsloth_model_3_epochs:latest" ,'unsloth_model_1_epoch:latest','unsloth_model_1_epoch_B:latest']:
    result_dataframe('annollm with 1 example per class, examples inside system message',system_content_annollm_1,model,'annollm_system','extract_last')
    result_dataframe('annollm with 2 examples per class, examples inside system message',system_content_annollm_2,model,'annollm_system','extract_last')
    result_dataframe('annollm with 3 examples per class, examples inside system message',system_content_annollm_3,model,'annollm_system','extract_last')

# Giving the whole document as context
This setting hits a context-length error with `gpt-4`, but not with `gpt-4o`.

In [None]:
init_system_content_context = 'You are a lawyer. Assess the risk of the following contract clause. Answer with one of these two options and nothing else: potential issue or red flag. Potential issue means there\'s potentially a legal risk and red flag signifies high legal risk.'

In [None]:
for model in ['gpt-4o', 'gpt-4-turbo']:
    result_dataframe('appending the whole contract in the system message',init_system_content_context,model,'append_contract','extract_first')

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
     result_dataframe('appending the whole contract in the system message',init_system_content_context,model,'append_contract','extract_first')

In [None]:
for model in ['gpt-4o', 'gpt-4-turbo']:
    result_dataframe('appending the whole contract to the prompt',init_system_content_context,model,'append_contract_in_prompt','extract_first')

In [None]:
for model in ['claude-3-opus-20240229','claude-3-5-sonnet-20240620']:
    result_dataframe('appending the whole contract to the prompt',init_system_content_context,model,'append_contract_in_prompt','extract_first')

------------------

# Few-Shot Prompting with Embeddings
Using Voyage AI's domain-specific model.



### In-context (IC) examples in the system message

In [None]:
# Preprocess the embeddings for ic examples
def _parse_embedding(raw_embedding):
    """Return one value of the 'embedding' column as a NumPy array.

    String values are parsed with ``ast.literal_eval`` (presumably the CSV
    stores each embedding as a list literal -- confirm against the file).
    Non-string values are passed through ``np.asarray``, so re-running this
    cell on an already-converted column is a no-op; without the guard,
    ``ast.literal_eval`` raises ``ValueError`` on an ndarray.
    """
    if isinstance(raw_embedding, str):
        return np.array(ast.literal_eval(raw_embedding))
    return np.asarray(raw_embedding)


df_ic_examples['embedding'] = df_ic_examples['embedding'].apply(_parse_embedding)

In [None]:
# System message for the few-shot runs that place the examples in the system
# message; it ends with a lead-in line for the examples.
init_system_content_fewshot = (
    "You are a lawyer. Assess the risk of the following contract clause. \n"
    "Answer with one of these two options and nothing else: potential issue or red flag. "
    "Potential issue means there's potentially a legal risk and red flag signifies high legal risk.\n"
    "Here are some examples to help you: \n "
)


In [None]:
# 'few_shot_embed_system' setting with 'not_per_class' selection:
# n_fewshot_embed = 2, 4, 6, each value run on every model in turn.
for run_description, total_examples in (
    ('few shot with embeddings not per class, two examples in total, in system message', 2),
    ('few shot with embeddings not per class, four examples in total, in system message', 4),
    ('few shot with embeddings not per class, six examples in total, in system message', 6),
):
    for model in ('gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo', 'gpt-4'):
        result_dataframe(
            run_description,
            init_system_content_fewshot,
            model,
            cleanining_method='extract_last',
            setting='few_shot_embed_system',
            which_ic_embed_type='not_per_class',
            n_fewshot_embed=total_examples,
        )

In [None]:
# 'few_shot_embed_system' setting with 'not_per_class' selection:
# n_fewshot_embed = 2, 4, 6, each value run on every model in turn.
for run_description, total_examples in (
    ('few shot with embeddings not per class, two examples in total, in system message', 2),
    ('few shot with embeddings not per class, four examples in total, in system message', 4),
    ('few shot with embeddings not per class, six examples in total, in system message', 6),
):
    for model in ('meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct'):
        result_dataframe(
            run_description,
            init_system_content_fewshot,
            model,
            cleanining_method='extract_last',
            setting='few_shot_embed_system',
            which_ic_embed_type='not_per_class',
            n_fewshot_embed=total_examples,
        )

In [None]:
# 'few_shot_embed_system' setting with 'not_per_class' selection:
# n_fewshot_embed = 2, 4, 6, each value run on every model in turn.
for run_description, total_examples in (
    ('few shot with embeddings not per class, two examples in total, in system message', 2),
    ('few shot with embeddings not per class, four examples in total, in system message', 4),
    ('few shot with embeddings not per class, six examples in total, in system message', 6),
):
    for model in ('claude-3-opus-20240229', 'claude-3-5-sonnet-20240620'):
        result_dataframe(
            run_description,
            init_system_content_fewshot,
            model,
            cleanining_method='extract_last',
            setting='few_shot_embed_system',
            which_ic_embed_type='not_per_class',
            n_fewshot_embed=total_examples,
        )

In [None]:
# 'few_shot_embed_system' setting with 'per_class' selection:
# n_fewshot_embed = 1, 2, 3, each value run on every model in turn.
for run_description, examples_per_class in (
    ('few shot with embeddings, per class, one example per class, in system message', 1),
    ('few shot with embeddings, per class, two examples per class, in system message', 2),
    ('few shot with embeddings, per class, three examples per class, in system message', 3),
):
    for model in ('gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo', 'gpt-4'):
        result_dataframe(
            run_description,
            init_system_content_fewshot,
            model,
            cleanining_method='extract_last',
            setting='few_shot_embed_system',
            which_ic_embed_type='per_class',
            n_fewshot_embed=examples_per_class,
        )

In [None]:
# 'few_shot_embed_system' setting with 'per_class' selection, n_fewshot_embed = 1, 2, 3.
# Each row carries its own model list because the runs do not share one.
for run_description, examples_per_class, models_for_run in (
    (
        'few shot with embeddings, per class, one example per class, in system message',
        1,
        ('meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct'),
    ),
    (
        # NOTE(review): only the 70b model is listed for this run -- confirm
        # that leaving out the 8b model is intentional.
        'few shot with embeddings, per class, two examples per class, in system message',
        2,
        ('meta/meta-llama-3-70b-instruct',),
    ),
    (
        'few shot with embeddings, per class, three examples per class, in system message',
        3,
        ('meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct'),
    ),
):
    for model in models_for_run:
        result_dataframe(
            run_description,
            init_system_content_fewshot,
            model,
            cleanining_method='extract_last',
            setting='few_shot_embed_system',
            which_ic_embed_type='per_class',
            n_fewshot_embed=examples_per_class,
        )

In [None]:
# 'few_shot_embed_system' setting with 'per_class' selection:
# n_fewshot_embed = 1, 2, 3, each value run on every model in turn.
for run_description, examples_per_class in (
    ('few shot with embeddings, per class, one example per class, in system message', 1),
    ('few shot with embeddings, per class, two examples per class, in system message', 2),
    ('few shot with embeddings, per class, three examples per class, in system message', 3),
):
    for model in ('claude-3-opus-20240229', 'claude-3-5-sonnet-20240620'):
        result_dataframe(
            run_description,
            init_system_content_fewshot,
            model,
            cleanining_method='extract_last',
            setting='few_shot_embed_system',
            which_ic_embed_type='per_class',
            n_fewshot_embed=examples_per_class,
        )

In [None]:
# 'few_shot_embed_system' setting with 'per_class' selection: for each model,
# run n_fewshot_embed = 1, 2, 3 back to back before moving to the next model.
for model in (
    'unsloth_model_3_epochs:latest',
    'unsloth_model_1_epoch:latest',
    'unsloth_model_1_epoch_B:latest',
):
    for run_description, examples_per_class in (
        ('few shot with embeddings, per class, one example per class, in system message', 1),
        ('few shot with embeddings, per class, two examples per class, in system message', 2),
        ('few shot with embeddings, per class, three examples per class, in system message', 3),
    ):
        result_dataframe(
            run_description,
            init_system_content_fewshot,
            model,
            cleanining_method='extract_last',
            setting='few_shot_embed_system',
            which_ic_embed_type='per_class',
            n_fewshot_embed=examples_per_class,
        )


### In-context (IC) examples in the user message

In [None]:
# NOTE: this rebinds init_system_content_fewshot, the name also used by the
# system-message few-shot cells earlier in the notebook. This version omits
# the trailing "Here are some examples to help you" lead-in.
init_system_content_fewshot = (
    "You are a lawyer. Assess the risk of the following contract clause. \n"
    "Answer with one of these two options and nothing else: potential issue or red flag. "
    "Potential issue means there's potentially a legal risk and red flag signifies high legal risk."
)

In [None]:
# 'few_shot_embed_prompt' setting, 'not_per_class' selection, n_fewshot_embed=2.
model = 'gpt-3.5-turbo-0125'
result_dataframe(
    'few shot with embeddings not per class, two examples in total, in prompt',
    init_system_content_fewshot,
    model,
    cleanining_method='extract_last',
    setting='few_shot_embed_prompt',
    which_ic_embed_type='not_per_class',
    n_fewshot_embed=2,
)

In [None]:
# 'few_shot_embed_prompt' setting, 'not_per_class' selection, n_fewshot_embed=2.
for model in ('gpt-4o', 'gpt-4-turbo', 'gpt-4'):
    result_dataframe(
        'few shot with embeddings not per class, two examples in total, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='not_per_class',
        n_fewshot_embed=2,
    )


In [None]:
# 'few_shot_embed_prompt' setting, 'not_per_class' selection, n_fewshot_embed=4.
for model in ('gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo', 'gpt-4'):
    result_dataframe(
        'few shot with embeddings not per class, four examples in total, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='not_per_class',
        n_fewshot_embed=4,
    )


In [None]:
# 'few_shot_embed_prompt' setting, 'not_per_class' selection, n_fewshot_embed=6.
for model in ('gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo', 'gpt-4'):
    result_dataframe(
        'few shot with embeddings not per class, six examples in total, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='not_per_class',
        n_fewshot_embed=6,
    )


In [None]:
# 'few_shot_embed_prompt' setting, 'not_per_class' selection, n_fewshot_embed=2.
for model in ('meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct'):
    result_dataframe(
        'few shot with embeddings not per class, two examples in total, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='not_per_class',
        n_fewshot_embed=2,
    )

In [None]:
# 'few_shot_embed_prompt' setting, 'not_per_class' selection, n_fewshot_embed=4.
for model in ('meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct'):
    result_dataframe(
        'few shot with embeddings not per class, four examples in total, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='not_per_class',
        n_fewshot_embed=4,
    )

In [None]:
# 'few_shot_embed_prompt' setting, 'not_per_class' selection, n_fewshot_embed=6.
for model in ('meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct'):
    result_dataframe(
        'few shot with embeddings not per class, six examples in total, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='not_per_class',
        n_fewshot_embed=6,
    )


In [None]:
# 'few_shot_embed_prompt' setting, 'not_per_class' selection, n_fewshot_embed=2.
for model in ('claude-3-opus-20240229', 'claude-3-5-sonnet-20240620'):
    result_dataframe(
        'few shot with embeddings not per class, two examples in total, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='not_per_class',
        n_fewshot_embed=2,
    )


In [None]:
# 'few_shot_embed_prompt' setting, 'not_per_class' selection, n_fewshot_embed=4.
for model in ('claude-3-opus-20240229', 'claude-3-5-sonnet-20240620'):
    result_dataframe(
        'few shot with embeddings not per class, four examples in total, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='not_per_class',
        n_fewshot_embed=4,
    )


In [None]:
# 'few_shot_embed_prompt' setting, 'not_per_class' selection, n_fewshot_embed=6.
for model in ('claude-3-opus-20240229', 'claude-3-5-sonnet-20240620'):
    result_dataframe(
        'few shot with embeddings not per class, six examples in total, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='not_per_class',
        n_fewshot_embed=6,
    )


In [None]:
# 'few_shot_embed_prompt' setting, 'per_class' selection, n_fewshot_embed=1.
model = 'gpt-3.5-turbo-0125'
result_dataframe(
    'few shot with embeddings, per class, one example per class, in prompt',
    init_system_content_fewshot,
    model,
    cleanining_method='extract_last',
    setting='few_shot_embed_prompt',
    which_ic_embed_type='per_class',
    n_fewshot_embed=1,
)

In [None]:
# 'few_shot_embed_prompt' setting, 'per_class' selection, n_fewshot_embed=1.
for model in ('gpt-4o', 'gpt-4-turbo', 'gpt-4'):
    result_dataframe(
        'few shot with embeddings, per class, one example per class, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='per_class',
        n_fewshot_embed=1,
    )

In [None]:
# 'few_shot_embed_prompt' setting, 'per_class' selection, n_fewshot_embed=2.
for model in ('gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo', 'gpt-4'):
    result_dataframe(
        'few shot with embeddings, per class, two examples per class, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='per_class',
        n_fewshot_embed=2,
    )

In [None]:
# 'few_shot_embed_prompt' setting, 'per_class' selection, n_fewshot_embed=3.
for model in ('gpt-3.5-turbo-0125', 'gpt-4o', 'gpt-4-turbo', 'gpt-4'):
    result_dataframe(
        'few shot with embeddings, per class, three examples per class, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='per_class',
        n_fewshot_embed=3,
    )

In [None]:
# 'few_shot_embed_prompt' setting, 'per_class' selection, n_fewshot_embed=1.
for model in ('meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct'):
    result_dataframe(
        'few shot with embeddings, per class, one example per class, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='per_class',
        n_fewshot_embed=1,
    )

In [None]:
# 'few_shot_embed_prompt' setting, 'per_class' selection, n_fewshot_embed=2.
# NOTE(review): only the 70b model is listed for this run -- confirm that
# leaving out the 8b model is intentional.
for model in ('meta/meta-llama-3-70b-instruct',):
    result_dataframe(
        'few shot with embeddings, per class, two examples per class, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='per_class',
        n_fewshot_embed=2,
    )

In [None]:
# 'few_shot_embed_prompt' setting, 'per_class' selection, n_fewshot_embed=3.
for model in ('meta/meta-llama-3-8b-instruct', 'meta/meta-llama-3-70b-instruct'):
    result_dataframe(
        'few shot with embeddings, per class, three examples per class, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='per_class',
        n_fewshot_embed=3,
    )

In [None]:
# 'few_shot_embed_prompt' setting, 'per_class' selection, n_fewshot_embed=1.
for model in ('claude-3-opus-20240229', 'claude-3-5-sonnet-20240620'):
    result_dataframe(
        'few shot with embeddings, per class, one example per class, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='per_class',
        n_fewshot_embed=1,
    )

In [None]:
# 'few_shot_embed_prompt' setting, 'per_class' selection, n_fewshot_embed=2.
for model in ('claude-3-opus-20240229', 'claude-3-5-sonnet-20240620'):
    result_dataframe(
        'few shot with embeddings, per class, two examples per class, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='per_class',
        n_fewshot_embed=2,
    )

In [None]:
# 'few_shot_embed_prompt' setting, 'per_class' selection, n_fewshot_embed=3.
for model in ('claude-3-opus-20240229', 'claude-3-5-sonnet-20240620'):
    result_dataframe(
        'few shot with embeddings, per class, three examples per class, in prompt',
        init_system_content_fewshot,
        model,
        cleanining_method='extract_last',
        setting='few_shot_embed_prompt',
        which_ic_embed_type='per_class',
        n_fewshot_embed=3,
    )

In [None]:
# 'few_shot_embed_prompt' setting with 'per_class' selection: for each model,
# run n_fewshot_embed = 1, 2, 3 back to back before moving to the next model.
for model in (
    'unsloth_model_3_epochs:latest',
    'unsloth_model_1_epoch:latest',
    'unsloth_model_1_epoch_B:latest',
):
    for run_description, examples_per_class in (
        ('few shot with embeddings, per class, one example per class, in prompt', 1),
        ('few shot with embeddings, per class, two examples per class, in prompt', 2),
        ('few shot with embeddings, per class, three examples per class, in prompt', 3),
    ):
        result_dataframe(
            run_description,
            init_system_content_fewshot,
            model,
            cleanining_method='extract_last',
            setting='few_shot_embed_prompt',
            which_ic_embed_type='per_class',
            n_fewshot_embed=examples_per_class,
        )

-------