In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np
import hashlib
from metrics import cleanup_result_first_new, cleanup_result_last_new, cleanup_result_after_result, metrics_mine_dict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
import random
import seaborn as sns
import ast

In [None]:
# Read the evaluation set and expose each column of interest as a plain list.
df_test = pd.read_csv('test_dataset.csv')

test_clauses, test_risks, test_contract_types, test_representing, test_sources = (
    df_test[column].tolist()
    for column in ('clause', 'ground_truth_label', 'contract_type', 'representing', 'source')
)

In [None]:
# Load every pickled experiment result from the results folder, in filename order.
folder_path = 'results'

pickle_files = sorted(name for name in os.listdir(folder_path) if name.endswith('.pkl'))

results = []
for filename in pickle_files:
    # NOTE(review): pickle.load executes arbitrary code from the file; only run this
    # on result files you produced yourself.
    with open(os.path.join(folder_path, filename), 'rb') as handle:
        results.append(pickle.load(handle))

In [None]:
def load_text_files(directory):
    """
    Build a dictionary of the source documents stored in ``directory``.

    Every entry of ``directory`` whose name ends in '.txt' is read as UTF-8.
    Other entries are ignored and sub-directories are not searched.

    Parameters
    ----------
    directory : str
        Path of the folder that holds the '.txt' files.

    Returns
    -------
    dict
        Maps each file name without its '.txt' extension (the source document
        identifier) to the full text of that file.
    """
    suffix = '.txt'
    text_files = {}
    for filename in os.listdir(directory):
        if filename.endswith(suffix):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                # Slice off exactly the extension. str.rstrip('.txt') treats its
                # argument as a set of characters and would also eat trailing
                # 't', 'x' and '.' characters of the name itself
                # (e.g. 'contract.txt' -> 'contrac').
                key_name = filename[:-len(suffix)]
                text_files[key_name] = file.read()
    return text_files

## updated results and summary dataframe

In [None]:
# Number of ground-truth labels compared against each experiment's predictions.
N = 200

# (new column name, key in the dict returned by metrics_mine_dict)
metric_columns = (
    ('accuracy_new', 'Accuracy'),
    ('precision_new', 'Precision'),
    ('recall_new', 'Recall'),
    ('F1_new', 'F1'),
)

updated_results = []
for raw_result in results:
    dataframe = pd.DataFrame(raw_result)
    experiment_name = dataframe['experiment_name'][0]

    # The experiment name decides which cleanup routine parses the raw model output.
    if 'step by step' in experiment_name or 'appending' in experiment_name:
        cleaner = cleanup_result_first_new
    elif 'alex' in experiment_name:
        cleaner = cleanup_result_after_result
    else:
        cleaner = cleanup_result_last_new
    dataframe['better_cleaned'] = dataframe['model_output'].apply(cleaner)

    # Re-score the re-cleaned predictions against the first N ground-truth labels.
    new_metrics = metrics_mine_dict(test_risks[:N], dataframe['better_cleaned'].tolist())

    # Each metric is stored on every row of the frame under its *_new column.
    for column, metric in metric_columns:
        dataframe[column] = new_metrics[metric]

    updated_results.append(dataframe)
    

In [None]:
# Collapse every experiment frame into one summary row.
# Each listed column is expected to hold a single value per experiment frame.
columns_to_consider = ['experiment_name', 'model', 'F1_new', 'experiment ran for (seconds)']
unique_values = {col: [] for col in columns_to_consider}

# The loop variable is deliberately not called `df`, so it cannot overwrite the
# notebook-level `df` that other cells read.
for result_df in updated_results:
    row = {}
    for col in columns_to_consider:
        unique_val = result_df[col].unique()
        if len(unique_val) != 1:
            # Skipping the value silently would leave the per-column lists with
            # different lengths (or misaligned rows), so fail loudly instead.
            raise ValueError(
                f"Column {col!r} has {len(unique_val)} distinct values in one experiment frame; expected exactly 1"
            )
        row[col] = unique_val[0]
    # Append only after the whole row validated, so all lists stay the same length.
    for col, value in row.items():
        unique_values[col].append(value)

# One row per experiment, one column per summarised field.
summary = pd.DataFrame(unique_values)

In [None]:
# Display the per-experiment summary table.
summary

# producing graphs
**TODO:** add the Ollama results for Llama-3 8B and update the model filter below.

In [None]:
# Drop the rows recorded under the 'meta/meta-llama-3-8b-instruct' model id.
is_replicate_llama_8b = summary['model'].eq('meta/meta-llama-3-8b-instruct')
summary_ = summary.loc[~is_replicate_llama_8b]


In [None]:
# Alias the filtered summary as `df`.
df = summary_

## reasoning

In [None]:
# Grouped bar chart: F1 per model for the zero-shot and reasoning prompt variants.
experiments_1 = [
    'Basic zero-shot prompting', 'lets think step by step',
    'zero-shot cot with alex template', 'zero-shot cot with legal template'
]

# Keep the selected experiments and reshape to experiments (rows) x models (columns).
filtered_df = df[df['experiment_name'].isin(experiments_1)]
pivot_df = filtered_df.pivot(index='experiment_name', columns='model', values='F1_new')

# Raw experiment name -> x tick label.
new_labels = {
    'Basic zero-shot prompting': 'basic zero-shot',
    'lets think step by step': "let's think step by step",
    'zero-shot cot with alex template': 'zero-shot with legal template A',
    'zero-shot cot with legal template': 'zero-shot with legal template B',
}

# Raw model id -> legend label. Dict order fixes the bar order inside each group.
model_name_map = {
    'claude-3-5-sonnet-20240620': 'Claude-3.5 Sonnet',
    'claude-3-opus-20240229': 'Claude-3 Opus',
    'gpt-3.5-turbo-0125': 'GPT-3.5 Turbo',
    'gpt-4-turbo': 'GPT-4 Turbo',
    'gpt-4': 'GPT-4',
    'gpt-4o': 'GPT-4o',
    'meta/meta-llama-3-70b-instruct': 'Llama-3 70B',
    'llama3:8b': 'Llama-3 8B'
}

fig, ax = plt.subplots(figsize=(15, 10))

bar_width = 0.09
index = np.arange(len(pivot_df))
models = list(model_name_map.keys())
colors = ['teal', 'steelblue', 'darkseagreen', 'olive', 'yellow', 'yellowgreen', 'mediumpurple', 'indigo']

# One bar series per model, each shifted right by one bar width.
# Every bar that has a score is annotated with its F1 value.
bars = []
for offset, (model, color) in enumerate(zip(models, colors)):
    scores = pivot_df[model]
    positions = index + offset * bar_width
    bars.append(ax.bar(positions, scores, bar_width, label=model_name_map[model], color=color))
    for x_pos, score in zip(positions, scores):
        if pd.notna(score):
            ax.text(x_pos, score + 0.001, f'{score:.2f}', ha='center', va='bottom')

ax.set_xlabel('Prompt Engineering Method')
ax.set_ylabel('F1 Score')
# Centre each tick under its group of bars.
ax.set_xticks(index + (len(models) / 2 - 0.5) * bar_width)
ax.set_xticklabels([new_labels[name] for name in pivot_df.index], rotation=10, ha='right')

# Legend sits to the right, outside the axes.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Make room for the rotated tick labels and the legend.
plt.tight_layout()

# Save with an explicit .pdf extension, like the other figures in this notebook.
plt.savefig('basic ones.pdf', format='pdf')

plt.show()

## annollm

In [None]:
# Grouped bar chart: AnnoLLM with in-context examples placed in the system message,
# compared against the basic zero-shot baseline.
experiments_2 = [
    'Basic zero-shot prompting',
    'annollm with 1 example per class, examples inside system message',
    'annollm with 2 examples per class, examples inside system message',
    'annollm with 3 examples per class, examples inside system message'
]

# Keep the selected experiments and reshape to experiments (rows) x models (columns).
filtered_df = df[df['experiment_name'].isin(experiments_2)]
pivot_df = filtered_df.pivot(index='experiment_name', columns='model', values='F1_new')

# Raw experiment name -> x tick label.
new_labels = {
    'Basic zero-shot prompting': 'basic zero-shot',
    'annollm with 1 example per class, examples inside system message': '1 example per class',
    'annollm with 2 examples per class, examples inside system message': '2 examples per class',
    'annollm with 3 examples per class, examples inside system message': '3 examples per class'
}

# Raw model id -> legend label. Dict order fixes the bar order inside each group.
model_name_map = {
    'claude-3-5-sonnet-20240620': 'Claude-3.5 Sonnet',
    'claude-3-opus-20240229': 'Claude-3 Opus',
    'gpt-3.5-turbo-0125': 'GPT-3.5 Turbo',
    'gpt-4-turbo': 'GPT-4 Turbo',
    'gpt-4': 'GPT-4',
    'gpt-4o': 'GPT-4o',
    'meta/meta-llama-3-70b-instruct': 'Llama-3 70B',
    'llama3:8b': 'Llama-3 8B'
}

fig, ax = plt.subplots(figsize=(15, 10))

bar_width = 0.09
index = np.arange(len(pivot_df))
models = list(model_name_map.keys())
colors = ['teal', 'steelblue', 'darkseagreen', 'olive', 'yellow', 'yellowgreen', 'mediumpurple', 'indigo']

# One bar series per model, each shifted right by one bar width.
# Every bar that has a score is annotated with its F1 value.
bars = []
for offset, (model, color) in enumerate(zip(models, colors)):
    scores = pivot_df[model]
    positions = index + offset * bar_width
    bars.append(ax.bar(positions, scores, bar_width, label=model_name_map[model], color=color))
    for x_pos, score in zip(positions, scores):
        if pd.notna(score):
            ax.text(x_pos, score + 0.001, f'{score:.2f}', ha='center', va='bottom')

ax.set_xlabel('Number of In-context Examples')
ax.set_ylabel('F1 Score')
ax.set_title('AnnoLLM, Examples in System Message')
# Centre each tick under its group of bars.
ax.set_xticks(index + (len(models) / 2 - 0.5) * bar_width)
ax.set_xticklabels([new_labels[name] for name in pivot_df.index], rotation=10, ha='right')
ax.set_ylim(0, 0.75)

# Legend sits to the right, outside the axes.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Make room for the rotated tick labels and the legend.
plt.tight_layout()
plt.savefig('annollm in system message.pdf', format='pdf')
plt.show()

In [None]:
# Grouped bar chart: AnnoLLM with in-context examples placed in the prompt
# (user message), compared against the basic zero-shot baseline.
experiments_2 = [
    'Basic zero-shot prompting',
    'annollm with 1 example per class, examples inside prompt',
    'annollm with 2 examples per class, examples inside prompt',
    'annollm with 3 examples per class, examples inside prompt',
]

# Keep the selected experiments and reshape to experiments (rows) x models (columns).
filtered_df = df[df['experiment_name'].isin(experiments_2)]
pivot_df = filtered_df.pivot(index='experiment_name', columns='model', values='F1_new')

# Raw experiment name -> x tick label.
new_labels = {
    'Basic zero-shot prompting': 'basic zero-shot',
    'annollm with 1 example per class, examples inside prompt': '1 example per class',
    'annollm with 2 examples per class, examples inside prompt': '2 examples per class',
    'annollm with 3 examples per class, examples inside prompt': '3 examples per class',
}

# Raw model id -> legend label. Dict order fixes the bar order inside each group.
model_name_map = {
    'claude-3-5-sonnet-20240620': 'Claude-3.5 Sonnet',
    'claude-3-opus-20240229': 'Claude-3 Opus',
    'gpt-3.5-turbo-0125': 'GPT-3.5 Turbo',
    'gpt-4-turbo': 'GPT-4 Turbo',
    'gpt-4': 'GPT-4',
    'gpt-4o': 'GPT-4o',
    'meta/meta-llama-3-70b-instruct': 'Llama-3 70B',
    'llama3:8b': 'Llama-3 8B'
}

fig, ax = plt.subplots(figsize=(15, 10))

bar_width = 0.09
index = np.arange(len(pivot_df))
models = list(model_name_map.keys())
colors = ['teal', 'steelblue', 'darkseagreen', 'olive', 'yellow', 'yellowgreen', 'mediumpurple', 'indigo']

# One bar series per model, each shifted right by one bar width.
# Every bar that has a score is annotated with its F1 value.
bars = []
for offset, (model, color) in enumerate(zip(models, colors)):
    scores = pivot_df[model]
    positions = index + offset * bar_width
    bars.append(ax.bar(positions, scores, bar_width, label=model_name_map[model], color=color))
    for x_pos, score in zip(positions, scores):
        if pd.notna(score):
            ax.text(x_pos, score + 0.001, f'{score:.2f}', ha='center', va='bottom')

ax.set_xlabel('Number of In-context Examples')
ax.set_ylabel('F1 Score')
ax.set_title('AnnoLLM, Examples in User Message')
# Centre each tick under its group of bars.
ax.set_xticks(index + (len(models) / 2 - 0.5) * bar_width)
ax.set_xticklabels([new_labels[name] for name in pivot_df.index], rotation=10, ha='right')

ax.set_ylim(0, 0.75)

# Legend sits to the right, outside the axes.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Make room for the rotated tick labels and the legend.
plt.tight_layout()
plt.savefig('annollm in user message.pdf', format='pdf')
plt.show()

## few-shot without embeddings

In [None]:
# Grouped bar chart: all simple few-shot variants (system message and prompt placement).
experiments_3 = [
    'few-shot with 1 example per class',
    'few-shot with 1 example per class, examples in prompt',
    'few-shot with 2 examples per class',
    'few-shot with 2 examples per class, examples in prompt',
    'few-shot with 3 examples per class',
    'few-shot with 3 examples per class, examples in prompt'
]

# Keep the selected experiments and reshape to experiments (rows) x models (columns).
filtered_df = df[df['experiment_name'].isin(experiments_3)]
pivot_df = filtered_df.pivot(index='experiment_name', columns='model', values='F1_new')

# Raw experiment name -> x tick label.
new_labels = {
    'few-shot with 1 example per class': '1 example per class in the system message',
    'few-shot with 1 example per class, examples in prompt': '1 example per class in the prompt',
    'few-shot with 2 examples per class': '2 examples per class in the system message',
    'few-shot with 2 examples per class, examples in prompt': '2 examples per class in the prompt',
    'few-shot with 3 examples per class': '3 examples per class in the system message',
    'few-shot with 3 examples per class, examples in prompt': '3 examples per class in the prompt'
}

# Raw model id -> legend label. Dict order fixes the bar order inside each group.
model_name_map = {
    'claude-3-5-sonnet-20240620': 'Claude-3.5 Sonnet',
    'claude-3-opus-20240229': 'Claude-3 Opus',
    'gpt-3.5-turbo-0125': 'GPT-3.5 Turbo',
    'gpt-4-turbo': 'GPT-4 Turbo',
    'gpt-4': 'GPT-4',
    'gpt-4o': 'GPT-4o',
    'meta/meta-llama-3-70b-instruct': 'Llama-3 70B',
    'llama3:8b': 'Llama-3 8B'
}

fig, ax = plt.subplots(figsize=(15, 10))

bar_width = 0.09
index = np.arange(len(pivot_df))
models = list(model_name_map.keys())
colors = ['teal', 'steelblue', 'darkseagreen', 'olive', 'yellow', 'yellowgreen', 'mediumpurple', 'indigo']

# One bar series per model, each shifted right by one bar width.
# Every bar that has a score is annotated with its F1 value.
bars = []
for offset, (model, color) in enumerate(zip(models, colors)):
    scores = pivot_df[model]
    positions = index + offset * bar_width
    bars.append(ax.bar(positions, scores, bar_width, label=model_name_map[model], color=color))
    for x_pos, score in zip(positions, scores):
        if pd.notna(score):
            ax.text(x_pos, score + 0.001, f'{score:.2f}', ha='center', va='bottom')

ax.set_xlabel('Prompt Engineering Method')
ax.set_ylabel('F1 Score')
ax.set_title('Few-Shot without Embeddings')
# Centre each tick under its group of bars.
ax.set_xticks(index + (len(models) / 2 - 0.5) * bar_width)
ax.set_xticklabels([new_labels[name] for name in pivot_df.index], rotation=10, ha='right')

ax.set_ylim(0, 0.75)

# Legend sits to the right, outside the axes.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Make room for the rotated tick labels and the legend.
plt.tight_layout()

plt.show()

In [None]:
# Grouped bar chart: simple few-shot prompting with the examples in the prompt
# (user message), compared against the basic zero-shot baseline.
experiments_3 = [
    'Basic zero-shot prompting',
    'few-shot with 1 example per class, examples in prompt',
    'few-shot with 2 examples per class, examples in prompt',
    'few-shot with 3 examples per class, examples in prompt'
]

# Keep the selected experiments and reshape to experiments (rows) x models (columns).
filtered_df = df[df['experiment_name'].isin(experiments_3)]
pivot_df = filtered_df.pivot(index='experiment_name', columns='model', values='F1_new')

# Raw experiment name -> x tick label.
new_labels = {
    'Basic zero-shot prompting': 'basic zero-shot',
    'few-shot with 1 example per class, examples in prompt': '1 example per class',
    'few-shot with 2 examples per class, examples in prompt': '2 examples per class',
    'few-shot with 3 examples per class, examples in prompt': '3 examples per class'
}

# Raw model id -> legend label. Dict order fixes the bar order inside each group.
model_name_map = {
    'claude-3-5-sonnet-20240620': 'Claude-3.5 Sonnet',
    'claude-3-opus-20240229': 'Claude-3 Opus',
    'gpt-3.5-turbo-0125': 'GPT-3.5 Turbo',
    'gpt-4-turbo': 'GPT-4 Turbo',
    'gpt-4': 'GPT-4',
    'gpt-4o': 'GPT-4o',
    'meta/meta-llama-3-70b-instruct': 'Llama-3 70B',
    'llama3:8b': 'Llama-3 8B'
}

fig, ax = plt.subplots(figsize=(15, 10))

bar_width = 0.09
index = np.arange(len(pivot_df))
models = list(model_name_map.keys())
colors = ['teal', 'steelblue', 'darkseagreen', 'olive', 'yellow', 'yellowgreen', 'mediumpurple', 'indigo']

# One bar series per model, each shifted right by one bar width.
# Every bar that has a score is annotated with its F1 value.
bars = []
for offset, (model, color) in enumerate(zip(models, colors)):
    scores = pivot_df[model]
    positions = index + offset * bar_width
    bars.append(ax.bar(positions, scores, bar_width, label=model_name_map[model], color=color))
    for x_pos, score in zip(positions, scores):
        if pd.notna(score):
            ax.text(x_pos, score + 0.001, f'{score:.2f}', ha='center', va='bottom')

ax.set_xlabel('Number of In-context Examples')
ax.set_ylabel('F1 Score')
ax.set_title('Simple Few-Shot Prompting, Examples in User Message')
# Centre each tick under its group of bars.
ax.set_xticks(index + (len(models) / 2 - 0.5) * bar_width)
ax.set_xticklabels([new_labels[name] for name in pivot_df.index], rotation=10, ha='right')

ax.set_ylim(0, 0.75)

# Legend sits to the right, outside the axes.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Make room for the rotated tick labels and the legend.
plt.tight_layout()

plt.savefig('simple fewshot in user message.pdf', format='pdf')

plt.show()

In [None]:
# Grouped bar chart: simple few-shot prompting with the examples in the system
# message, compared against the basic zero-shot baseline.
experiments_3 = [
    'Basic zero-shot prompting',
    'few-shot with 1 example per class',
    'few-shot with 2 examples per class',
    'few-shot with 3 examples per class',
]

# Keep the selected experiments and reshape to experiments (rows) x models (columns).
filtered_df = df[df['experiment_name'].isin(experiments_3)]
pivot_df = filtered_df.pivot(index='experiment_name', columns='model', values='F1_new')

# Raw experiment name -> x tick label.
new_labels = {
    'Basic zero-shot prompting': 'basic zero-shot',
    'few-shot with 1 example per class': '1 example per class',
    'few-shot with 2 examples per class': '2 examples per class',
    'few-shot with 3 examples per class': '3 examples per class',
}

# Raw model id -> legend label. Dict order fixes the bar order inside each group.
model_name_map = {
    'claude-3-5-sonnet-20240620': 'Claude-3.5 Sonnet',
    'claude-3-opus-20240229': 'Claude-3 Opus',
    'gpt-3.5-turbo-0125': 'GPT-3.5 Turbo',
    'gpt-4-turbo': 'GPT-4 Turbo',
    'gpt-4': 'GPT-4',
    'gpt-4o': 'GPT-4o',
    'meta/meta-llama-3-70b-instruct': 'Llama-3 70B',
    'llama3:8b': 'Llama-3 8B'
}

fig, ax = plt.subplots(figsize=(15, 10))

bar_width = 0.09
index = np.arange(len(pivot_df))
models = list(model_name_map.keys())
colors = ['teal', 'steelblue', 'darkseagreen', 'olive', 'yellow', 'yellowgreen', 'mediumpurple', 'indigo']

# One bar series per model, each shifted right by one bar width.
# Every bar that has a score is annotated with its F1 value.
bars = []
for offset, (model, color) in enumerate(zip(models, colors)):
    scores = pivot_df[model]
    positions = index + offset * bar_width
    bars.append(ax.bar(positions, scores, bar_width, label=model_name_map[model], color=color))
    for x_pos, score in zip(positions, scores):
        if pd.notna(score):
            ax.text(x_pos, score + 0.001, f'{score:.2f}', ha='center', va='bottom')

ax.set_xlabel('Number of In-context Examples')
ax.set_ylabel('F1 Score')
ax.set_title('Simple Few-Shot Prompting, Examples in System Message')
# Centre each tick under its group of bars.
ax.set_xticks(index + (len(models) / 2 - 0.5) * bar_width)
ax.set_xticklabels([new_labels[name] for name in pivot_df.index], rotation=10, ha='right')

ax.set_ylim(0, 0.75)

# Legend sits to the right, outside the axes.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Make room for the rotated tick labels and the legend.
plt.tight_layout()

plt.savefig('simple fewshot in system message.pdf', format='pdf')

plt.show()

## few-shot with embeddings

In [None]:
# Grouped bar chart: every embedding-based few-shot variant
# (pooled vs. per class, prompt vs. system message placement).
experiments_4 = [
    'few shot with embeddings not per class, two examples in total, in prompt',
    'few shot with embeddings not per class, two examples in total, in system message',
    'few shot with embeddings not per class, four examples in total, in prompt',
    'few shot with embeddings not per class, four examples in total, in system message',
    'few shot with embeddings not per class, six examples in total, in prompt',
    'few shot with embeddings not per class, six examples in total, in system message',
    'few shot with embeddings, per class, one example per class, in prompt',
    'few shot with embeddings, per class, one example per class, in system message',
    'few shot with embeddings, per class, two examples per class, in prompt',
    'few shot with embeddings, per class, two examples per class, in system message',
    'few shot with embeddings, per class, three examples per class, in prompt',
    'few shot with embeddings, per class, three examples per class, in system message'
]

# Keep the selected experiments and reshape to experiments (rows) x models (columns).
filtered_df = df[df['experiment_name'].isin(experiments_4)]
pivot_df = filtered_df.pivot(index='experiment_name', columns='model', values='F1_new')
# pivot() sorts the rows alphabetically ('four' < 'six' < 'two', 'one' < 'three' < 'two').
# Restore the order declared in experiments_4 so the x-axis reads 2/4/6 and 1/2/3.
pivot_df = pivot_df.reindex([name for name in experiments_4 if name in pivot_df.index])

# Raw experiment name -> x tick label.
new_labels = {
    'few shot with embeddings not per class, two examples in total, in prompt': '2 in total in prompt',
    'few shot with embeddings not per class, two examples in total, in system message': '2 in total in system message',
    'few shot with embeddings not per class, four examples in total, in prompt': '4 in total in prompt',
    'few shot with embeddings not per class, four examples in total, in system message': '4 in total in system message',
    'few shot with embeddings not per class, six examples in total, in prompt': '6 in total in prompt',
    'few shot with embeddings not per class, six examples in total, in system message': '6 in total in system message',
    'few shot with embeddings, per class, one example per class, in prompt': '1 per class in prompt',
    'few shot with embeddings, per class, one example per class, in system message': '1 per class in system message',
    'few shot with embeddings, per class, two examples per class, in prompt': '2 per class in prompt',
    'few shot with embeddings, per class, two examples per class, in system message': '2 per class in system message',
    'few shot with embeddings, per class, three examples per class, in prompt': '3 per class in prompt',
    'few shot with embeddings, per class, three examples per class, in system message': '3 per class in system message'
}

# Raw model id -> legend label. Dict order fixes the bar order inside each group.
# Llama-3 8B is keyed as 'llama3:8b', as in the other plotting cells: the
# 'meta/meta-llama-3-8b-instruct' rows are filtered out of `df` earlier in the
# notebook, so that key would raise a KeyError on the pivot.
model_name_map = {
    'claude-3-5-sonnet-20240620': 'Claude-3.5 Sonnet',
    'claude-3-opus-20240229': 'Claude-3 Opus',
    'gpt-3.5-turbo-0125': 'GPT-3.5 Turbo',
    'gpt-4-turbo': 'GPT-4 Turbo',
    'gpt-4': 'GPT-4',
    'gpt-4o': 'GPT-4o',
    'meta/meta-llama-3-70b-instruct': 'Llama-3 70B',
    'llama3:8b': 'Llama-3 8B'
}

fig, ax = plt.subplots(figsize=(15, 10))

bar_width = 0.09
index = np.arange(len(pivot_df))
models = list(model_name_map.keys())
colors = ['darkseagreen', 'gold', 'tab:olive', 'olivedrab', 'khaki', 'darkolivegreen', 'darkkhaki', 'olive']
hatches = ['-', '', '', '//', '', '/', '', '|']

# One bar series per model, each shifted right by one bar width.
# Every bar that has a score is annotated with its F1 value.
bars = []
for offset, (model, color, hatch) in enumerate(zip(models, colors, hatches)):
    scores = pivot_df[model]
    positions = index + offset * bar_width
    bars.append(ax.bar(positions, scores, bar_width, label=model_name_map[model], color=color, hatch=hatch))
    for x_pos, score in zip(positions, scores):
        if pd.notna(score):
            ax.text(x_pos, score + 0.001, f'{score:.2f}', ha='center', va='bottom')

ax.set_xlabel('Prompt Engineering Method')
ax.set_ylabel('F1 Score')
ax.set_title('Few-Shot with Embeddings')
# Centre each tick under its group of bars.
ax.set_xticks(index + (len(models) / 2 - 0.5) * bar_width)
ax.set_xticklabels([new_labels[name] for name in pivot_df.index], rotation=10, ha='right')

ax.set_ylim(0, 0.90)

# Legend sits to the right, outside the axes.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Make room for the rotated tick labels and the legend.
plt.tight_layout()

plt.show()


In [None]:
# Grouped bar chart: per-class embedding-based few-shot prompting with the examples
# in the prompt (user message), compared against the basic zero-shot baseline.
experiments_4 = [
    'Basic zero-shot prompting',
    'few shot with embeddings, per class, one example per class, in prompt',
    'few shot with embeddings, per class, two examples per class, in prompt',
    'few shot with embeddings, per class, three examples per class, in prompt',
]

filtered_df = df[df['experiment_name'].isin(experiments_4)].copy()
# Replace the column outright so its dtype really becomes an ordered categorical.
# Assigning through .loc[:, col] writes into the existing object column on
# pandas >= 2.0, which leaves the rows alphabetical (one, three, two).
filtered_df['experiment_name'] = pd.Categorical(filtered_df['experiment_name'], categories=experiments_4, ordered=True)
filtered_df = filtered_df.sort_values('experiment_name')
# Reshape to experiments (rows, in experiments_4 order) x models (columns).
pivot_df = filtered_df.pivot(index='experiment_name', columns='model', values='F1_new')

# Raw experiment name -> x tick label.
new_labels = {
    'Basic zero-shot prompting': 'basic zero-shot',
    'few shot with embeddings, per class, one example per class, in prompt': '1 per class',
    'few shot with embeddings, per class, two examples per class, in prompt': '2 per class',
    'few shot with embeddings, per class, three examples per class, in prompt': '3 per class'
}

# Raw model id -> legend label. Dict order fixes the bar order inside each group.
model_name_map = {
    'claude-3-5-sonnet-20240620': 'Claude-3.5 Sonnet',
    'claude-3-opus-20240229': 'Claude-3 Opus',
    'gpt-3.5-turbo-0125': 'GPT-3.5 Turbo',
    'gpt-4-turbo': 'GPT-4 Turbo',
    'gpt-4': 'GPT-4',
    'gpt-4o': 'GPT-4o',
    'meta/meta-llama-3-70b-instruct': 'Llama-3 70B',
    'llama3:8b': 'Llama-3 8B'
}

fig, ax = plt.subplots(figsize=(15, 10))

bar_width = 0.09
index = np.arange(len(pivot_df))
models = list(model_name_map.keys())
colors = ['teal', 'steelblue', 'darkseagreen', 'olive', 'yellow', 'yellowgreen', 'mediumpurple', 'indigo']

# One bar series per model, each shifted right by one bar width.
# Every bar that has a score is annotated with its F1 value.
bars = []
for offset, (model, color) in enumerate(zip(models, colors)):
    scores = pivot_df[model]
    positions = index + offset * bar_width
    bars.append(ax.bar(positions, scores, bar_width, label=model_name_map[model], color=color))
    for x_pos, score in zip(positions, scores):
        if pd.notna(score):
            ax.text(x_pos, score + 0.001, f'{score:.2f}', ha='center', va='bottom')

ax.set_xlabel('Number of In-context Examples')
ax.set_ylabel('F1 Score')
ax.set_title('Few-Shot Prompting with Embeddings, Examples in User Message')
# Centre each tick under its group of bars.
ax.set_xticks(index + (len(models) / 2 - 0.5) * bar_width)
ax.set_xticklabels([new_labels[name] for name in pivot_df.index], rotation=10, ha='right')

ax.set_ylim(0, 0.90)

# Legend sits to the right, outside the axes.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Make room for the rotated tick labels and the legend.
plt.tight_layout()

plt.savefig('few shot with embeddings in user message.pdf', format='pdf')
plt.show()


In [None]:
# Grouped bar chart: per-class embedding-based few-shot prompting with the examples
# in the system message, compared against the basic zero-shot baseline.
experiments_4 = [
    'Basic zero-shot prompting',
    'few shot with embeddings, per class, one example per class, in system message',
    'few shot with embeddings, per class, two examples per class, in system message',
    'few shot with embeddings, per class, three examples per class, in system message'
]

# Select the experiments, then turn the name column into an ordered categorical
# and sort by it, so rows follow the order declared in experiments_4.
filtered_df = (
    df[df['experiment_name'].isin(experiments_4)]
    .assign(
        experiment_name=lambda frame: pd.Categorical(
            frame['experiment_name'], categories=experiments_4, ordered=True
        )
    )
    .sort_values('experiment_name')
)
# Reshape to experiments (rows) x models (columns).
pivot_df = filtered_df.pivot(index='experiment_name', columns='model', values='F1_new')

# Raw experiment name -> x tick label.
new_labels = {
    'Basic zero-shot prompting': 'basic zero-shot',
    'few shot with embeddings, per class, one example per class, in system message': '1 per class',
    'few shot with embeddings, per class, two examples per class, in system message': '2 per class',
    'few shot with embeddings, per class, three examples per class, in system message': '3 per class'
}

# Raw model id -> legend label. Dict order fixes the bar order inside each group.
model_name_map = {
    'claude-3-5-sonnet-20240620': 'Claude-3.5 Sonnet',
    'claude-3-opus-20240229': 'Claude-3 Opus',
    'gpt-3.5-turbo-0125': 'GPT-3.5 Turbo',
    'gpt-4-turbo': 'GPT-4 Turbo',
    'gpt-4': 'GPT-4',
    'gpt-4o': 'GPT-4o',
    'meta/meta-llama-3-70b-instruct': 'Llama-3 70B',
    'llama3:8b': 'Llama-3 8B'
}

fig, ax = plt.subplots(figsize=(15, 10))

bar_width = 0.09
index = np.arange(len(pivot_df))
models = list(model_name_map.keys())
colors = ['teal', 'steelblue', 'darkseagreen', 'olive', 'yellow', 'yellowgreen', 'mediumpurple', 'indigo']

# One bar series per model, each shifted right by one bar width.
# Every bar that has a score is annotated with its F1 value.
bars = []
for offset, (model, color) in enumerate(zip(models, colors)):
    scores = pivot_df[model]
    positions = index + offset * bar_width
    bars.append(ax.bar(positions, scores, bar_width, label=model_name_map[model], color=color))
    for x_pos, score in zip(positions, scores):
        if pd.notna(score):
            ax.text(x_pos, score + 0.001, f'{score:.2f}', ha='center', va='bottom')

ax.set_xlabel('Number of In-context Examples')
ax.set_ylabel('F1 Score')
ax.set_title('Few-Shot Prompting with Embeddings, Examples in System Message')
# Centre each tick under its group of bars.
ax.set_xticks(index + (len(models) / 2 - 0.5) * bar_width)
ax.set_xticklabels([new_labels[name] for name in pivot_df.index], rotation=10, ha='right')

ax.set_ylim(0, 0.90)

# Legend sits to the right, outside the axes.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Make room for the rotated tick labels and the legend.
plt.tight_layout()

plt.savefig('few shot with embeddings in system message.pdf', format='pdf')

plt.show()


## appending the contract

In [None]:
# Grouped bar chart: effect of appending the whole source contract to the system
# message or to the prompt, compared against the basic zero-shot baseline.
experiments_4 = [
    'Basic zero-shot prompting',
    'appending the whole contract in the system message',
    'appending the whole contract to the prompt'
]

filtered_df = df[df['experiment_name'].isin(experiments_4)].copy()
# Replace the column outright so its dtype really becomes an ordered categorical.
# Assigning through .loc[:, col] writes into the existing object column on
# pandas >= 2.0 and the declared order is not applied.
filtered_df['experiment_name'] = pd.Categorical(filtered_df['experiment_name'], categories=experiments_4, ordered=True)
filtered_df = filtered_df.sort_values('experiment_name')
# Reshape to experiments (rows, in experiments_4 order) x models (columns).
pivot_df = filtered_df.pivot(index='experiment_name', columns='model', values='F1_new')

# Raw experiment name -> x tick label.
new_labels = {
    'Basic zero-shot prompting': 'basic zero-shot',
    'appending the whole contract in the system message': 'contract in the system message',
    'appending the whole contract to the prompt': 'contract in the user message'
}

# Raw model id -> legend label. Only these four models are plotted in this figure.
model_name_map = {
    'claude-3-5-sonnet-20240620': 'Claude-3.5 Sonnet',
    'claude-3-opus-20240229': 'Claude-3 Opus',
    'gpt-4-turbo': 'GPT-4 Turbo',
    'gpt-4o': 'GPT-4o',
}

fig, ax = plt.subplots(figsize=(15, 10))

bar_width = 0.09
index = np.arange(len(pivot_df))
models = list(model_name_map.keys())
# More colours than models; only the first len(models) entries are used.
colors = ['teal', 'steelblue', 'darkseagreen', 'olive', 'yellow', 'yellowgreen', 'mediumpurple', 'indigo']

# One bar series per model, each shifted right by one bar width.
# Every bar that has a score is annotated with its F1 value.
bars = []
for offset, (model, color) in enumerate(zip(models, colors)):
    scores = pivot_df[model]
    positions = index + offset * bar_width
    bars.append(ax.bar(positions, scores, bar_width, label=model_name_map[model], color=color))
    for x_pos, score in zip(positions, scores):
        if pd.notna(score):
            ax.text(x_pos, score + 0.001, f'{score:.2f}', ha='center', va='bottom')

ax.set_xlabel('Prompt Engineering Method')
ax.set_ylabel('F1 Score')
ax.set_title('Providing the Source Contract to the LLM')
# Centre each tick under its group of bars.
ax.set_xticks(index + (len(models) / 2 - 0.5) * bar_width)
ax.set_xticklabels([new_labels[name] for name in pivot_df.index], rotation=10, ha='right')

# Legend sits to the right, outside the axes.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Make room for the rotated tick labels and the legend.
plt.tight_layout()

plt.savefig('providing the source contract.pdf', format='pdf')
plt.show()


## cost graphs for GPT family

In [None]:
gpt_models = ['gpt-3.5-turbo-0125', 'gpt-4-turbo', 'gpt-4', 'gpt-4o']
columns_to_consider = ['experiment_name', 'model', 'F1_new','total cost of this experiment','experiment ran for (seconds)']
unique_values = {col: [] for col in columns_to_consider}

# Build one summary row per GPT result frame.
# The loop variable is deliberately not called `df`, so the summary frame of that name used by
# earlier cells is not overwritten.
for result_df in updated_results:
    if result_df['model'][0] not in gpt_models:
        continue
    df_gpt = result_df[result_df['model'].isin(gpt_models)]
    constants = {col: df_gpt[col].unique() for col in columns_to_consider}
    # Append a frame's values only when every column is constant within it. Appending column by
    # column would leave lists of different lengths (or rows shifted against each other) as soon
    # as a single column of a single frame is not constant.
    if all(len(values) == 1 for values in constants.values()):
        for col, values in constants.items():
            unique_values[col].append(values[0])

# Convert the dictionary to a DataFrame
summary_gpt_subset = pd.DataFrame(unique_values)

In [None]:
# Give every run of four consecutive summary rows the same group id (0, 1, 2, ...).
# NOTE(review): presumably four = one row per GPT model per experiment; this relies on the row order — confirm.
summary_gpt_subset = summary_gpt_subset.assign(group=summary_gpt_subset.index // 4)

In [None]:
# Mean F1 per group, returned as a two-column frame: 'group' and 'avg F1'.
df_grouped = (
    summary_gpt_subset
    .groupby('group')['F1_new']
    .mean()
    .reset_index()
    .rename(columns={'F1_new': 'avg F1'})
)

In [None]:
models = gpt_models

# Experiments drawn in the cost plot, one line each.
chosen_experiments = [
    'Basic zero-shot prompting',
    'zero-shot cot with legal template',
    'lets think step by step',
    'annollm with 2 examples per class, examples inside prompt',
    'few shot with embeddings, per class, three examples per class, in system message',
    'few-shot with 3 examples per class, examples in prompt',
]

# Rows of the chosen experiments, ordered by experiment and then by cost so that each
# experiment's line is drawn from the cheapest to the most expensive run.
is_chosen = summary_gpt_subset['experiment_name'].isin(chosen_experiments)
filtered_df = summary_gpt_subset[is_chosen].sort_values(by=['experiment_name', 'total cost of this experiment'])

# Marker colour per model
model_colors = {
    'gpt-3.5-turbo-0125': 'blue',
    'gpt-4o': 'orange',
    'gpt-4-turbo': 'red',
    'gpt-4': 'green',
}

# Line colour per experiment
experiment_colors = {
    'Basic zero-shot prompting': 'black',
    'zero-shot cot with legal template': 'crimson',
    'lets think step by step': 'darkorange',
    'annollm with 2 examples per class, examples inside prompt': 'darkolivegreen',
    'few shot with embeddings, per class, three examples per class, in system message': 'indigo',
    'few-shot with 3 examples per class, examples in prompt': 'blue',
}

plt.figure(figsize=(12, 6))
sns.set(style="whitegrid")

for experiment in chosen_experiments:
    subset = filtered_df[filtered_df['experiment_name'] == experiment]

    # Connecting line for this experiment
    plt.plot(
        subset['total cost of this experiment'],
        subset['F1_new'],
        linestyle='-',
        linewidth=2,
        color=experiment_colors[experiment],
        alpha=0.5,
    )

    # One marker per model on top of the line
    for model in models:
        model_subset = subset[subset['model'] == model]
        plt.scatter(
            model_subset['total cost of this experiment'],
            model_subset['F1_new'],
            color=model_colors[model],
            s=50,
            alpha=0.8,
        )

plt.xlabel('Cost of the Experiment (US Dollars)')
plt.ylabel('F1 Score')
plt.title('F1 Score vs. Cost of the Experiment')

# No legend here: the two legends are rendered as separate figures by the following cells.
plt.grid(False)
plt.savefig('cost analysis for gpt models.pdf', format = 'pdf')
plt.show()


In [None]:
# Stand-alone legend figure for the experiment line colours of the GPT cost plot.
# (matplotlib.pyplot is already imported as `plt` in the notebook's import cell.)
plt.figure(figsize=(4, 2))


# Display names shown in the legend for each raw experiment name
experiment_name_dict = {'Basic zero-shot prompting': 'Basic zero-shot prompting',          
    'zero-shot cot with legal template': 'Legal reasoning template B',            
    'lets think step by step': "Let's think step by step (Zero-shot-CoT)",
    'annollm with 2 examples per class, examples inside prompt':'AnnoLLM',
    'few shot with embeddings, per class, three examples per class, in system message': 'Few-shot prompting with embeddings',
    'few-shot with 3 examples per class, examples in prompt': 'Simple few-shot prompting' 
}

# Plot empty lines, one per experiment, so that only their legend entries are drawn
for experiment in chosen_experiments:
    plt.plot([], [], label=experiment_name_dict[experiment], linestyle='-', linewidth=2, color=experiment_colors[experiment], alpha = 0.5)

# Add legend
plt.legend(title='Experiment Type', loc='center')

# Remove axes
plt.axis('off')

# Save and show the experiment legend
plt.savefig('cost analysis for gpt models legend1.pdf', format = 'pdf')
plt.show()


In [None]:
# Stand-alone legend figure for the model marker colours of the GPT cost plot.
# (matplotlib.pyplot is already imported as `plt` in the notebook's import cell.)
model_names = {
    'gpt-3.5-turbo-0125': 'GPT-3.5 Turbo',  
    'gpt-4o': 'GPT-4o',         
    'gpt-4-turbo': 'GPT-4 Turbo',         
    'gpt-4': 'GPT-4'               
}

# Create an empty plot just for the legend
plt.figure(figsize=(3, 2))

# Plot empty marker series, one per model, so that only their legend entries are drawn
for model in models:
    plt.plot([], [], color=model_colors[model], marker='o', linestyle='', markersize=8, label=model_names[model], alpha= 0.8)

# Add legend
plt.legend(title='Models', loc='center')

# Remove axes
plt.axis('off')
plt.savefig('cost analysis for gpt models legend2.pdf', format = 'pdf')
# Show the model legend plot
plt.show()


## cost graphs for appending the contract for GPT-4o

In [None]:
# F1 score per prompting method.
# NOTE(review): hard-coded values, presumably copied from the GPT-4o summary rows — confirm against the results.
F1_scores = {
    'basic zero-shot': 0.48,
    "let's think step by step": 0.41,
    'legal reasoning template': 0.55,
    'simple few-shot': 0.64,
    'few-shot with embeddings': 0.80,
    'annoLLM': 0.70,
    'providing the source contract': 0.71,
}

# Cost per prompting method (plotted as US dollars below), same keys as `F1_scores`.
costs = {
    'basic zero-shot': 0.136860,
    "let's think step by step": 0.727845,
    'legal reasoning template': 0.705780,
    'simple few-shot': 0.850600,
    'few-shot with embeddings': 0.513380,
    'annoLLM': 1.366335,
    'providing the source contract': 9.818725,
}

In [None]:
# How many times more expensive providing the source contract is than few-shot with embeddings
costs['providing the source contract'] / costs['few-shot with embeddings']

In [None]:
plt.figure(figsize=(10, 6))

# One labelled marker per prompting method
for method, f1_value in F1_scores.items():
    plt.scatter(costs[method], f1_value, label=method)

# Grey line through the points, from the cheapest to the most expensive method
sorted_keys = [method for method, _ in sorted(costs.items(), key=lambda item: item[1])]
sorted_costs = list(map(costs.get, sorted_keys))
sorted_F1_scores = list(map(F1_scores.get, sorted_keys))

plt.plot(sorted_costs, sorted_F1_scores, linestyle='-', color='gray', alpha=0.5)

# Axis labels, title and legend
plt.xlabel('Cost of the Experiment (US Dollars)')
plt.ylabel('F1 Score')
plt.title('F1 Scores vs Costs for GPT-4o')
plt.legend(title='Prompting Methods')

plt.grid(False)
plt.savefig('appending the contract cost for gpt4o.pdf', format = 'pdf')
plt.show()

## Model Size-Performance graphs for the Llama-3 Family

In [None]:
llama_models = ['llama3:8b','meta/meta-llama-3-70b-instruct']
columns_to_consider = ['experiment_name', 'model', 'F1_new','experiment ran for (seconds)']
unique_values = {col: [] for col in columns_to_consider}

# Build one summary row per Llama-3 result frame.
# The loop variable is deliberately not called `df`, so the summary frame of that name used by
# earlier cells is not overwritten.
for result_df in updated_results:
    if result_df['model'][0] not in llama_models:
        continue
    df_llama = result_df[result_df['model'].isin(llama_models)]
    constants = {col: df_llama[col].unique() for col in columns_to_consider}
    # Append a frame's values only when every column is constant within it; appending column by
    # column would leave lists of different lengths (or rows shifted against each other) as soon
    # as a single column of a single frame is not constant.
    if all(len(values) == 1 for values in constants.values()):
        for col, values in constants.items():
            unique_values[col].append(values[0])

# Convert the dictionary to a DataFrame
summary_llama_subset = pd.DataFrame(unique_values)

In [None]:
# Give every pair of consecutive summary rows the same group id (0, 1, 2, ...).
# NOTE(review): presumably two = one row per Llama-3 model per experiment; this relies on the row order — confirm.
summary_llama_subset = summary_llama_subset.assign(group=summary_llama_subset.index // 2)

In [None]:
# Inspect the column names of the Llama-3 summary
summary_llama_subset.columns

In [None]:
# Mean F1 per group, returned as a two-column frame: 'group' and 'avg F1'.
df_grouped = (
    summary_llama_subset
    .groupby('group')['F1_new']
    .mean()
    .reset_index()
    .rename(columns={'F1_new': 'avg F1'})
)

In [None]:
# Models compared in the Llama-3 chart
models = ['meta/meta-llama-3-70b-instruct', 'llama3:8b']

# Experiments shown in the chart
chosen_experiments = [
    'Basic zero-shot prompting',
    'lets think step by step',
    'zero-shot cot with alex template',
    'few-shot with 3 examples per class',
    'few shot with embeddings, per class, three examples per class, in system message',
    'annollm with 2 examples per class, examples inside prompt',
]

# Raw experiment name -> display name
experiment_name_dict = {
    'Basic zero-shot prompting': 'Basic zero-shot prompting',
    'zero-shot cot with alex template': 'Legal reasoning template A',
    'lets think step by step': "Let's think step by step (Zero-shot-CoT)",
    'annollm with 2 examples per class, examples inside prompt': 'AnnoLLM',
    'few shot with embeddings, per class, three examples per class, in system message': 'Few-shot prompting with embeddings',
    'few-shot with 3 examples per class': 'Simple few-shot prompting',
}

# Raw model identifier -> display name
model_name_dict = {
    'meta/meta-llama-3-70b-instruct': 'Llama-3 70B',
    'llama3:8b': 'Llama-3 8B',
}


In [None]:
# Rows for the chosen models and experiments. `.copy()` makes this an independent frame, so the
# column assignments below neither trigger SettingWithCopyWarning nor depend on whether the
# selection is a view of `summary_llama_subset`.
filtered_df = summary_llama_subset[
    (summary_llama_subset['model'].isin(models)) &
    (summary_llama_subset['experiment_name'].isin(chosen_experiments))
].copy()

# Replace raw names with display names
filtered_df['experiment_name'] = filtered_df['experiment_name'].map(experiment_name_dict)
filtered_df['model'] = filtered_df['model'].map(model_name_dict)
# Ordered categorical in reversed `chosen_experiments` order, which fixes the row order of the pivot below
filtered_df['experiment_name'] = pd.Categorical(filtered_df['experiment_name'], categories=[experiment_name_dict[exp] for exp in chosen_experiments[::-1]], ordered=True)

# Pivot the dataframe to have models as columns, experiments as rows, and F1_new as values
pivot_df = filtered_df.pivot_table(index='experiment_name', columns='model', values='F1_new')
# Column order follows `models`
pivot_df = pivot_df[[model_name_dict[model] for model in models]]

colors = ['mediumpurple','indigo']

# Plot the horizontal bar chart
ax = pivot_df.plot(kind='barh', figsize=(12,8), color=colors)

# Write the F1 value at the end of every bar
for container in ax.containers:
    ax.bar_label(container, label_type='edge', fmt='%.2f', padding=1)


# Set labels and title
plt.xlabel('F1 Score')
plt.ylabel('Experiment')
plt.title('F1 Scores by Experiment and Model for Llama-3 Family')

# Display the plot
plt.grid(False)
plt.tight_layout()
plt.savefig('performance model size for llama3.pdf', format = 'pdf')
plt.show()

## qualitative analysis I

In [None]:
# Print position, experiment name and model of every entry in `updated_results`.
# The names are read from `updated_results` itself (not from the raw `results` list), because the
# positions printed here are the ones used to index `updated_results` in the cells below.
# The loop variable is not called `df`, so it does not overwrite the summary frame of that name.
for i, result_df in enumerate(updated_results):
    print(f"{i}, {result_df['experiment_name'][0]}, {result_df['model'][0]}")

In [None]:
# Pick three result frames by position in `updated_results`.
# NOTE(review): the positions depend on the sorted order of the pickle files — re-check them
# against the listing printed above whenever result files are added or removed.
llama70_A, llama70_B, llama70_step = (updated_results[position] for position in (195, 204, 186))

In [None]:
# Inspect which columns a result frame carries
llama70_A.columns

In [None]:
# First, merge llama70_A with llama70_B
# Pair the rows of the two experiments on model, clause and label; the remaining columns
# that occur in both frames get the suffix _A or _B.
merged_df = llama70_A.merge(
    llama70_B,
    on=['model', 'clause', 'ground_truth_label'],
    suffixes=('_A', '_B'),
)


In [None]:
# Keep only the clauses on which the two experiments give different cleaned predictions
predictions_differ = merged_df['better_cleaned_A'].ne(merged_df['better_cleaned_B'])
filtered_df = merged_df[predictions_differ]

In [None]:
# Columns kept for the side-by-side reading of experiment A and experiment B
llama70_columns = [
    'model', 'clause',
    'experiment_name_A', 'experiment_name_B',
    'system_content_A', 'system_content_B',
    'prompt_A', 'prompt_B',
    'ground_truth_label',
    'model_output_A', 'model_output_B',
    'better_cleaned_A', 'better_cleaned_B',
]
llama70_A_vs_B = filtered_df[llama70_columns]

In [None]:
# Number of differing rows whose cleaned prediction for experiment A is 'none'
count = llama70_A_vs_B['better_cleaned_A'].eq('none').sum()
print(count)

In [None]:
# Number of differing rows on which experiment B's cleaned prediction misses the label
count = llama70_A_vs_B['ground_truth_label'].ne(llama70_A_vs_B['better_cleaned_B']).sum()
print(count)


In [None]:
# Differing rows on which experiment A's cleaned prediction misses the label
a_is_wrong = llama70_A_vs_B['ground_truth_label'].ne(llama70_A_vs_B['better_cleaned_A'])
different_rows = llama70_A_vs_B[a_is_wrong]
different_rows

In [None]:
# Number of differing rows on which the stored 'cleaned_prediction' of experiment A misses the label.
# `llama70_A_vs_B` keeps only a hand-picked column subset that does not contain
# 'cleaned_prediction_A' (indexing it raises KeyError), so the comparison is made on `filtered_df`,
# the frame it was sliced from: same rows, all merged columns.
# NOTE(review): assumes the source result frames carry a 'cleaned_prediction' column — confirm.
count = (filtered_df['ground_truth_label'] != filtered_df['cleaned_prediction_A']).sum()
print(count)


In [None]:
# Display the rows on which experiments A and B disagree
llama70_A_vs_B

## qualitative analysis II

In [None]:
# Pick two result frames by position in `updated_results`.
# NOTE(review): the positions depend on the sorted order of the pickle files — re-check them
# against the printed listing whenever result files are added or removed.
llama8_B, llama8_S = (updated_results[position] for position in (203, 185))

In [None]:
# Pair the rows of the two experiments on model, clause and label; the remaining columns
# that occur in both frames get the suffix _B or _S.
merged_df = llama8_B.merge(
    llama8_S,
    on=['model', 'clause', 'ground_truth_label'],
    suffixes=('_B', '_S'),
)

In [None]:
# Keep only the clauses on which the two experiments give different cleaned predictions
predictions_differ = merged_df['better_cleaned_B'].ne(merged_df['better_cleaned_S'])
filtered_df = merged_df[predictions_differ]

In [None]:
# Columns kept for the side-by-side reading of experiment B and experiment S
llama8_columns = [
    'model', 'clause',
    'experiment_name_B', 'experiment_name_S',
    'system_content_B', 'system_content_S',
    'prompt_B', 'prompt_S',
    'ground_truth_label',
    'model_output_B', 'model_output_S',
    'better_cleaned_B', 'better_cleaned_S',
]
llama8_B_vs_S = filtered_df[llama8_columns]

In [None]:
# Display the rows on which experiments B and S disagree
llama8_B_vs_S

In [None]:
# Number of clauses that experiment B labels 'red flag' after cleaning
count = llama8_B['better_cleaned'].eq('red flag').sum()
print(count)


In [None]:
# Display the full result frame of experiment B
llama8_B

In [None]:
# Display the full result frame of experiment S
llama8_S

## qualitative analysis III

In [None]:
# Pick two result frames by position in `updated_results`.
# NOTE(review): the positions depend on the sorted order of the pickle files — re-check them
# against the printed listing whenever result files are added or removed.
gpt4turbo_A, gpt4turbo_B = (updated_results[position] for position in (191, 200))

In [None]:
# Pair the rows of the two experiments on model, clause and label; the remaining columns
# that occur in both frames get the suffix _A or _B.
merged_df = gpt4turbo_A.merge(
    gpt4turbo_B,
    on=['model', 'clause', 'ground_truth_label'],
    suffixes=('_A', '_B'),
)

In [None]:
# Keep only the clauses on which the two experiments give different cleaned predictions
predictions_differ = merged_df['better_cleaned_A'].ne(merged_df['better_cleaned_B'])
filtered_df = merged_df[predictions_differ]

In [None]:
# Columns kept for the side-by-side reading of experiment A and experiment B
gpt4turbo_columns = [
    'model', 'clause',
    'experiment_name_A', 'experiment_name_B',
    'system_content_A', 'system_content_B',
    'prompt_A', 'prompt_B',
    'ground_truth_label',
    'model_output_A', 'model_output_B',
    'better_cleaned_A', 'better_cleaned_B',
]
gpt4turbo_A_vs_B = filtered_df[gpt4turbo_columns]

In [None]:
# Display the rows on which experiments A and B disagree
gpt4turbo_A_vs_B

## analysis of continued pre-training improvements with respect to different contract types


In [None]:
folder_path = 'test with further pre-trained results'

# Load every pickled result of the folder in filename order.
# NOTE: pickle.load can execute arbitrary code — only point this at result files you produced yourself.
pickle_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.pkl'))
results_pretrain = []
for filename in pickle_files:
    with open(os.path.join(folder_path, filename), 'rb') as file:
        results_pretrain.append(pickle.load(file))


# Number of test labels the predictions are scored against
N = 200

# For every result: add the newly and the previously cleaned prediction, then attach the
# metrics of both as constant columns.
# NOTE(review): cleanup_result_first / cleanup_result_last are not among the names imported from
# `metrics` in the notebook's import cell — confirm they are defined before this cell runs.
updated_results_pretrain = []
for raw_result in results_pretrain:
    dataframe = pd.DataFrame(raw_result)
    experiment_name = dataframe['experiment_name'][0]

    # The experiment type decides which cleaning functions apply; each list is in the order
    # in which the columns are added.
    if 'step by step' in experiment_name or 'appending' in experiment_name:
        cleaners = [('better_cleaned', cleanup_result_first_new),
                    ('old_cleaned', cleanup_result_first)]
    elif 'alex' in experiment_name:
        cleaners = [('old_cleaned', cleanup_result_first),
                    ('better_cleaned', cleanup_result_after_result)]
    else:
        cleaners = [('better_cleaned', cleanup_result_last_new),
                    ('old_cleaned', cleanup_result_last)]
    for cleaned_column, cleaner in cleaners:
        dataframe[cleaned_column] = dataframe['model_output'].apply(cleaner)

    new_cleaned_results = dataframe['better_cleaned'].tolist()
    old_cleaned_results = dataframe['old_cleaned'].tolist()

    # Score both cleanings against the first N ground-truth labels
    new_metrics = metrics_mine_dict(test_risks[:N], new_cleaned_results)
    old_metrics = metrics_mine_dict(test_risks[:N], old_cleaned_results)

    metric_columns = [
        ('accuracy_new', new_metrics, 'Accuracy'),
        ('precision_new', new_metrics, 'Precision'),
        ('recall_new', new_metrics, 'Recall'),
        ('F1_new', new_metrics, 'F1'),
        ('accuracy_old', old_metrics, 'Accuracy'),
        ('precision_old', old_metrics, 'Precision'),
        ('recall_old', old_metrics, 'Recall'),
        ('F1_old', old_metrics, 'F1'),
    ]
    for metric_column, metric_values, metric_key in metric_columns:
        dataframe[metric_column] = metric_values[metric_key]

    updated_results_pretrain.append(dataframe)
    

In [None]:
# Keep only the results of the base model and of the model pre-trained for one epoch
updated_results_pretrain2 = [
    frame
    for frame in updated_results_pretrain
    if frame['model'][0] in ['llama3:8b', 'unsloth_model_1_epoch:latest']
]

In [None]:
# Print the experiment name of every kept result
for frame in updated_results_pretrain2:
    print(frame['experiment_name'][0])
    

In [None]:
# Index the kept results by (experiment name, model); a later frame with the same key replaces an earlier one
grouped = {
    (frame['experiment_name'].iloc[0], frame['model'].iloc[0]): frame
    for frame in updated_results_pretrain2
}


In [None]:
# Pie-slice colour per contract type
contract_type_colors = dict([
    ('implementation agreement', 'powderblue'),
    ('terms and conditions', 'lightcoral'),
    ('vendor agreement', 'lightblue'),
    ('non-disclosure agreement', 'lightgoldenrodyellow'),
    ('SaaS (software as a service)', 'thistle'),
])

In [None]:
# Raw experiment name -> title used for the pie charts
experiment_name_dict = dict([
    ('few-shot with 2 examples per class', 'Simple Few-shot Prompting'),
    ('zero-shot cot with legal template', 'Zero-shot with Legal Reasoning Template B'),
    ('few shot with embeddings, per class, two examples per class, in system message', 'Few-shot Prompting with Embeddings'),
    ('annollm with 3 examples per class, examples inside system message', 'AnnoLLM'),
])

In [None]:
def final_exp_name(experiment_name):
    """
    function to map a raw experiment name to its display name; a name without an
    entry in `experiment_name_dict` is returned unchanged
    """
    # dict.get replaces the linear search through list(keys) and the local variable
    # that shadowed the function's own name
    return experiment_name_dict.get(experiment_name, experiment_name)

In [None]:
# For every experiment in which the continued-pre-trained model reaches a higher F1 than the base
# model, draw a pie chart of the contract types of the clauses that only the pre-trained model
# labels correctly.
base_model = 'llama3:8b'
pretrained_model = 'unsloth_model_1_epoch:latest'

# sorted() makes the order of the charts reproducible; the iteration order of a set of strings
# can change from one kernel session to the next.
for experiment_name in sorted({key[0] for key in grouped}):
    # An experiment that was not run with both models cannot be compared: report and skip it
    # instead of stopping the whole loop with a KeyError.
    if (experiment_name, base_model) not in grouped or (experiment_name, pretrained_model) not in grouped:
        print(f"skipping {experiment_name!r}: results are not available for both models")
        continue

    df_A = grouped[(experiment_name, base_model)] # base model
    df_B = grouped[(experiment_name, pretrained_model)] # continued pre-trained model

    if df_B['F1_new'].iloc[0] > df_A['F1_new'].iloc[0]:
        print(df_B['experiment_name'][0])
        # Filter rows where Model B is correct and Model A is wrong.
        # The two masks are combined by index label, so both frames must list the clauses in the same order.
        correct_B_wrong_A = df_B[(df_B['better_cleaned'] == df_B['ground_truth_label']) & 
                                 (df_A['better_cleaned'] != df_A['ground_truth_label'])]
        
        # Count contract types for the filtered rows
        contract_type_counts = correct_B_wrong_A['contract_type'].value_counts()
        
        # A contract type without a configured colour gets a neutral grey instead of raising KeyError
        colors = [contract_type_colors.get(contract_type, 'lightgray') for contract_type in contract_type_counts.index]


        # Create a pie chart
        plt.figure(figsize=(8, 6))
        plt.pie(contract_type_counts, labels=contract_type_counts.index, autopct='%1.1f%%', startangle=140, colors = colors, textprops={'fontsize': 7})
        plt.title(f"{final_exp_name(experiment_name)}")
        # Only experiments that have a display name are saved to disk
        if experiment_name in list(experiment_name_dict.keys()):
            plt.savefig(f"pie chart for dists {final_exp_name(experiment_name)}.pdf", format = 'pdf')
        plt.show()


## analysis of similarity scores for few-shot with embeddings

In [None]:
# Load the pool of in-context examples; the similarity functions below read its
# 'ground_truth_label' and 'embedding' columns
df_ic_examples = pd.read_csv('ic_examples_dataset.csv')

In [None]:
def choose_in_context_examples_2_score(clause_to_be_tested, df, n_samples):
    """
    function to compute, for one test clause, the highest similarity scores against the
    in-context examples of each risk class

    clause_to_be_tested: text of the test clause; its embedding is looked up in the
        global `df_test` (first row whose 'clause' equals this text)
    df: in-context examples with the columns 'ground_truth_label' and 'embedding'
        (embeddings stored as the string form of a list of numbers)
    n_samples: number of top scores to return per class

    returns (top_similarities_potential_issue, top_similarities_red_flag), two float32 arrays
    of length n_samples sorted from highest to lowest score

    raises IndexError if the clause is not in `df_test`, ValueError if n_samples is negative
    or larger than the number of examples of a class
    """
    def parse_embedding(embedding_str):
        return np.array(ast.literal_eval(embedding_str), dtype=np.float32)

    def top_scores(label):
        # embeddings of every in-context example of this class, one per row
        class_rows = df[df['ground_truth_label'] == label]
        class_embeddings = np.vstack(class_rows['embedding'].apply(parse_embedding).values)
        # the score is the plain dot product; it equals the cosine similarity only when the
        # embeddings are unit-normalised, which is not checked here
        scores = np.dot(class_embeddings, clause_embed)
        if n_samples > scores.shape[0]:
            raise ValueError(
                f"n_samples={n_samples} exceeds the {scores.shape[0]} available '{label}' examples"
            )
        # sort descending and cut; unlike a `[-n_samples:]` slice this yields an empty
        # array (not every score) for n_samples == 0
        return np.sort(scores)[::-1][:n_samples]

    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")

    # get the embedding of the clause to be tested
    clause_embed_ = df_test.loc[df_test['clause'] == clause_to_be_tested, 'embedding']
    if clause_embed_.empty:
        # same exception type as the former `.values[0]` lookup, with an explanatory message
        raise IndexError("clause_to_be_tested was not found in df_test['clause']")
    clause_embed = parse_embedding(clause_embed_.values[0])  # Convert to numpy array

    top_similarities_red_flag = top_scores('red flag')
    top_similarities_potential_issue = top_scores('potential issue')

    return top_similarities_potential_issue, top_similarities_red_flag


In [None]:
def create_similarity_dataframe(test_clauses, df, n_samples, n_random,
                                output_path='similarity_dataframe.csv', random_state=None):
    """
    function to sample test clauses at random and tabulate, for each of them, the top
    similarity scores against the in-context examples of both risk classes

    test_clauses: list of test clause texts
    df: in-context examples, passed on to choose_in_context_examples_2_score
    n_samples: number of top scores kept per class
    n_random: number of test clauses drawn without replacement
    output_path: CSV file the table is written to; None skips writing
    random_state: seed for a reproducible draw; None (default) uses numpy's global
        random state, as before

    returns a DataFrame with one row per sampled clause: 'test_datum_index' (position in
    test_clauses), then n_samples red-flag scores, then n_samples potential-issue scores
    """
    # Randomly select n_random clause positions
    if random_state is None:
        random_indices = np.random.choice(len(test_clauses), n_random, replace=False)
    else:
        random_indices = np.random.default_rng(random_state).choice(len(test_clauses), n_random, replace=False)

    # One row per sampled clause: its position followed by the scores of both classes
    rows = []
    for test_index in random_indices:
        top_potential_issue, top_red_flag = choose_in_context_examples_2_score(test_clauses[test_index], df, n_samples)
        rows.append([test_index] + top_red_flag.tolist() + top_potential_issue.tolist())

    # Column names in the same order as the row layout above
    col_names = ['test_datum_index'] + \
                [f'ic_example_red_flag_{i+1}' for i in range(n_samples)] + \
                [f'ic_example_potential_issue_{i+1}' for i in range(n_samples)]

    similarity_df = pd.DataFrame(rows, columns=col_names)

    if output_path is not None:
        similarity_df.to_csv(output_path, index=False)

    return similarity_df

In [None]:
# Top-3 similarity scores per class for 20 randomly drawn test clauses.
# The draw is not seeded, so the selected clauses change from run to run;
# the table is also written to 'similarity_dataframe.csv'.
create_similarity_dataframe(test_clauses, df_ic_examples, 3, 20)

## qualitative analysis IV

In [None]:
# Pick two result frames by position in `updated_results`.
# NOTE(review): the positions depend on the sorted order of the pickle files — re-check them
# against the printed listing whenever result files are added or removed.
template_a_8b, template_a_70b = (updated_results[position] for position in (194, 195))

In [None]:
# Inspect which columns the result frame carries
template_a_8b.columns

In [None]:
# Pair the rows of the two models on experiment, clause and label; the remaining columns
# that occur in both frames get the suffix _8 or _70.
merged_df_template_a = template_a_8b.merge(
    template_a_70b,
    on=['experiment_name', 'clause', 'ground_truth_label'],
    suffixes=('_8', '_70'),
)

In [None]:
# Keep only the clauses on which the two models give different cleaned predictions
template_a_differs = merged_df_template_a['better_cleaned_8'].ne(merged_df_template_a['better_cleaned_70'])
filtered_df_template_a = merged_df_template_a[template_a_differs]

In [None]:
# Columns kept for the side-by-side reading of the two models
template_a_columns = [
    'experiment_name',
    'model_8', 'model_70',
    'system_content_8', 'system_content_70',
    'prompt_8', 'prompt_70',
    'ground_truth_label',
    'better_cleaned_8', 'better_cleaned_70',
]
template_a_8_vs_70 = filtered_df_template_a[template_a_columns]

In [None]:
# Number of clauses on which the two models disagree
len(filtered_df_template_a)

In [None]:
# Number of clauses whose cleaned prediction in `template_a_70b` is 'none'
count = template_a_70b['better_cleaned'].eq('none').sum()
print(count)

In [None]:
# Number of differing rows on which the '_70' cleaned prediction misses the label
count = template_a_8_vs_70['ground_truth_label'].ne(template_a_8_vs_70['better_cleaned_70']).sum()
print(count)

In [None]:
##################

In [None]:
# Pick two result frames by position in `updated_results`.
# NOTE(review): the positions depend on the sorted order of the pickle files — re-check them
# against the printed listing whenever result files are added or removed.
lets_think_8b, lets_think_70b = (updated_results[position] for position in (185, 186))

In [None]:
# Pair the rows of the two models on experiment, clause and label; the remaining columns
# that occur in both frames get the suffix _8 or _70.
merged_df_lets_think = lets_think_8b.merge(
    lets_think_70b,
    on=['experiment_name', 'clause', 'ground_truth_label'],
    suffixes=('_8', '_70'),
)

In [None]:
# Keep only the clauses on which the two models give different cleaned predictions
lets_think_differs = merged_df_lets_think['better_cleaned_8'].ne(merged_df_lets_think['better_cleaned_70'])
filtered_df_lets_think = merged_df_lets_think[lets_think_differs]

In [None]:
# Number of clauses on which the two models disagree
len(filtered_df_lets_think)

In [None]:
# Columns kept for the side-by-side reading of the two models.
# 'better_cleaned_8' and 'better_cleaned_70' are included because a later cell compares
# 'better_cleaned_70' of this frame with the label; without them that lookup raises KeyError.
lets_think_8_vs_70 = filtered_df_lets_think[['experiment_name', 'clause', 'system_content_8', 'system_content_70', 
                              'prompt_8', 'prompt_70', 'ground_truth_label', 'model_output_8', 'model_output_70',
                              'better_cleaned_8', 'better_cleaned_70']]

In [None]:
# Display the rows on which the two models disagree
lets_think_8_vs_70

In [None]:
# Number of clauses whose cleaned prediction in `lets_think_70b` is 'none'
count = lets_think_70b['better_cleaned'].eq('none').sum()
print(count)

In [None]:
# Number of differing rows on which the '_70' cleaned prediction misses the label
count = lets_think_8_vs_70['ground_truth_label'].ne(lets_think_8_vs_70['better_cleaned_70']).sum()
print(count)

In [None]:
# Number of clauses that `lets_think_70b` labels 'potential issue' after cleaning
count = lets_think_70b['better_cleaned'].eq('potential issue').sum()
print(count)

# analysis for continued pre-training

In [None]:
folder_path = 'test with further pre-trained results'

# Load every pickled result of the folder in filename order.
# This rebinds `results`, which the start of the notebook filled from the 'results' folder:
# cells above that index `results` / `updated_results` by position need a fresh run from the top
# after this section has been executed.
# NOTE: pickle.load can execute arbitrary code — only point this at result files you produced yourself.
pickle_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.pkl'))
results = []
for filename in pickle_files:
    with open(os.path.join(folder_path, filename), 'rb') as file:
        results.append(pickle.load(file))

In [None]:
# Number of test labels the predictions are scored against
N = 200

# For every result: add the cleaned prediction and attach its metrics as constant columns
updated_results = []
for raw_result in results:
    dataframe = pd.DataFrame(raw_result)
    experiment_name = dataframe['experiment_name'][0]

    # The experiment type decides which cleaning function applies
    if 'step by step' in experiment_name or 'appending' in experiment_name:
        cleaner = cleanup_result_first_new
    elif 'alex' in experiment_name:
        cleaner = cleanup_result_after_result
    else:
        cleaner = cleanup_result_last_new
    dataframe['better_cleaned'] = dataframe['model_output'].apply(cleaner)

    new_cleaned_results = dataframe['better_cleaned'].tolist()

    # Score the cleaned predictions against the first N ground-truth labels
    new_metrics = metrics_mine_dict(test_risks[:N], new_cleaned_results)

    metric_columns = [
        ('accuracy_new', 'Accuracy'),
        ('precision_new', 'Precision'),
        ('recall_new', 'Recall'),
        ('F1_new', 'F1'),
    ]
    for metric_column, metric_key in metric_columns:
        dataframe[metric_column] = new_metrics[metric_key]

    updated_results.append(dataframe)
    

In [None]:
columns_to_consider = ['experiment_name', 'model', 'F1_new','experiment ran for (seconds)']
unique_values = {col: [] for col in columns_to_consider}

# Build one summary row per result frame.
# The loop variable is deliberately not called `df`, so a summary frame of that name is not overwritten.
for result_df in updated_results:
    constants = {col: result_df[col].unique() for col in columns_to_consider}
    # Append a frame's values only when every column is constant within it; appending column by
    # column would leave lists of different lengths (or rows shifted against each other) as soon
    # as a single column of a single frame is not constant.
    if all(len(values) == 1 for values in constants.values()):
        for col, values in constants.items():
            unique_values[col].append(values[0])

# Convert the dictionary to a DataFrame
summary_continued = pd.DataFrame(unique_values)

In [None]:
# Display the per-experiment summary of the continued pre-training runs
summary_continued

In [None]:
# Mean F1 over all experiments, one row per model
average_f1_per_model = summary_continued.groupby('model', as_index=False)['F1_new'].mean()
average_f1_per_model

## 1 epoch vs 3 epochs

In [None]:
# Restrict the summary to the two continued-pre-training checkpoints
models = [
    'unsloth_model_1_epoch:latest',
    'unsloth_model_3_epochs:latest',
]
is_epoch_model = summary_continued['model'].isin(models)
summary_epoch = summary_continued[is_epoch_model]

In [None]:
# One row per model, one column per experiment, cells hold the F1 score;
# 'model' is turned back into an ordinary column
pivot_table = (
    summary_epoch
    .pivot(index='model', columns='experiment_name', values='F1_new')
    .reset_index()
)

pivot_table