In [None]:
#from transformers import TrainingArguments
from unsloth import is_bfloat16_supported, UnslothTrainer, UnslothTrainingArguments, FastLanguageModel
from datasets import load_dataset, DatasetDict, Dataset
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import os

In [None]:
# Load the 4-bit Llama-3 8B Instruct checkpoint together with its tokenizer.
# NOTE(review): only `tokenizer` is referenced by the cells visible in this
# notebook; `model` appears unused here.
max_seq_length = 2048 # Choose any. Unsloth supports RoPE scaling internally.
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)


In [None]:
def get_token_number(contract):
    """
    Count the tokens the module-level `tokenizer` produces for one contract.

    contract: the contract text passed to `tokenizer.encode`
    returns: the length of the sequence returned by `tokenizer.encode(contract)`
    """
    return len(tokenizer.encode(contract))

In [None]:
# Directory that is scanned for the '.txt' contract files read below.
folder_path = 'source_documents'

In [None]:
# Read every '.txt' file in `folder_path` into a list of strings.
file_contents_list = []

# os.listdir returns entries in arbitrary order; sorting makes the list
# order reproducible between runs and machines.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(folder_path, filename)

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        file_contents_list.append(file.read())


In [None]:
# Token count of each loaded document, in the same order as file_contents_list.
token_counts = [get_token_number(document) for document in file_contents_list]

In [None]:
# Report the token count summed over all loaded documents.
print(f'total number of tokens in genie contracts data {sum(token_counts)}')

In [None]:
# Build a DataFrame with one row per '.txt' file: the file name in 'id' and
# the file's full text in 'contract'.
file_ids = []
file_contents = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# reproducible between runs and machines.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(folder_path, filename)

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        file_contents.append(file.read())

    # Appended after a successful read so both lists stay the same length.
    file_ids.append(filename)

df = pd.DataFrame({'id': file_ids, 'contract': file_contents})

In [None]:
# Tag every row with the same constant 'source' value.
df['source'] = "Genie"

In [None]:
# Take the leading run of word characters of each file name as a raw
# contract-type code. Note that \w also matches digits and underscores, so the
# run stops only at the first other character (space, '&', '-', '.', ...).
df['contract type'] = df['id'].str.extract(r'(^\w+)', expand=False)

In [None]:
# Maps the file-name prefix stored in df['contract type'] to a readable
# contract type name.
contract_type_mapping = {
'NDA': 'non-disclosure agreement',
'T': 'terms and conditions agreement',
'Vendor': 'vendor agreement',
'SaaS': 'software as a service agreement',
'IA': 'implementation agreement'}

In [None]:
# Derive the readable contract type from the file-name prefix in 'id' rather
# than from the current contents of 'contract type'. This keeps the cell
# idempotent: mapping an already-mapped column a second time would turn every
# value into NaN, because the readable names are not keys of the mapping.
df['contract type'] = df['id'].str.extract(r'(^\w+)', expand=False).map(contract_type_mapping)

In [None]:
# producing pie chart for genie data
# NOTE(review): this re-declares get_token_number, which is already defined in
# an earlier cell of this notebook; the later definition shadows the earlier one.
def get_token_number(contract):
    """
    Return how many tokens the module-level `tokenizer` yields for a contract.

    contract: the contract text passed to `tokenizer.encode`
    returns: number of items in the sequence returned by `tokenizer.encode`
    """
    encoded = tokenizer.encode(contract)
    return len(encoded)

In [None]:
import matplotlib.colors as mcolors

# Token count per contract, then the total per contract type as a plain dict.
df['token count'] = df['contract'].apply(get_token_number)
contract_type_sums = (
    df.groupby('contract type')['token count']
    .sum()
    .to_dict()
)

# Parallel lists (type name, total tokens) in the dict's iteration order.
labels = [*contract_type_sums]
sizes = [*contract_type_sums.values()]

In [None]:
# Display the contract type names that will label the pie slices.
labels

In [None]:
# Fixed colour per contract type so a type keeps its colour across charts.
contract_type_colors = {
    'implementation agreement': 'powderblue',
    'terms and conditions agreement': 'lightcoral',
    'vendor agreement': 'lightblue',
    'non-disclosure agreement': 'lightgoldenrodyellow',
    'software as a service agreement':'thistle'
}

# One colour per slice, in the same order as `labels`.
colors = [contract_type_colors[contract_type] for contract_type in labels]

fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=140,
    textprops={'fontsize': 15},
)
ax.axis('equal')  # Equal aspect ratio keeps the pie circular.
ax.set_title('Token Count by Contract Type for Genie Dataset')

# Save before showing the figure.
fig.savefig('pie_chart_genie.pdf', format='pdf')

plt.show()

In [None]:
# Display the total token count per contract type.
contract_type_sums

## More pie charts

In [None]:
# Hard-coded number of data points per risk class.
# NOTE(review): the source of these counts is not shown in this notebook.
risk_type_dict = {'potential issue': 3968,
            'red flag': 388}

labels = list(risk_type_dict.keys())
sizes = list(risk_type_dict.values())

# One explicit colour per risk class, in the same order as `labels`.
# (The tab10-based colour array that used to be computed here was overwritten
# by this list before it was ever used, so it has been removed.)
colors = ['darkseagreen', 'salmon']

fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140, textprops={'fontsize': 15})
ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

ax.set_title('Risk Classes in the Genie Dataset')

# Save before showing the figure.
fig.savefig('pie_chart_genie_risk.pdf', format='pdf')

plt.show()

In [None]:
# Hard-coded number of data points per contract type.
# NOTE(review): the source of these counts is not shown in this notebook.
no_of_data_by_type = {'non-disclosure agreement': 1055,
 'software as a service agreement': 1739,
 'terms and conditions agreement': 897,
 'vendor agreement': 130,
 'implementation agreement': 535}


In [None]:
# Slice names and sizes, in the dict's insertion order.
labels = [*no_of_data_by_type]
sizes = [*no_of_data_by_type.values()]

# Fixed colour per contract type so a type keeps its colour across charts.
contract_type_colors = {
    'implementation agreement': 'powderblue',
    'terms and conditions agreement': 'lightcoral',
    'vendor agreement': 'lightblue',
    'non-disclosure agreement': 'lightgoldenrodyellow',
    'software as a service agreement':'thistle'
}

# One colour per slice, in the same order as `labels`.
colors = [contract_type_colors[contract_type] for contract_type in labels]

fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=140,
    textprops={'fontsize': 13},
)
ax.axis('equal')  # Equal aspect ratio keeps the pie circular.
ax.set_title('Distribution of Data Points in the Genie Dataset')

# Save before showing the figure.
fig.savefig('pie_chart_genie_data_number.pdf', format='pdf')

plt.show()

In [None]:
# Hard-coded number of data points per represented side.
# NOTE(review): the source of these counts is not shown in this notebook.
representing_dict = {'customer': 1486,
            'supplier' : 1137,
            'recipient': 877,
            'company': 535,
            'discloser':178,
            'licensee' : 143}

In [None]:
labels = list(representing_dict.keys())
sizes = list(representing_dict.values())

# The slices are coloured straight from the Set3 palette. The tab10-based
# `colors` array that used to be computed here was never passed to the plot,
# so it has been removed.
fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(sizes, labels=labels, colors=plt.cm.Set3.colors, autopct='%1.1f%%', startangle=140, textprops={'fontsize': 14})
ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

ax.set_title('Distribution of Representing Sides in the Genie Dataset')

# Save before showing the figure.
fig.savefig('pie_chart_genie_representing_sides.pdf', format='pdf')

plt.show()

In [None]:
# Repeat the same charts for the test dataset

In [None]:
# Load the test dataset; the cells below read its 'contract_type',
# 'representing' and 'ground_truth_label' columns.
test_dataset = pd.read_csv("test_dataset.csv")

In [None]:
def plot_pie_chart(column, title):
    """
    Plot, save and show a pie chart of value counts for one test-dataset column.

    column: name of the column of the module-level `test_dataset` to count
    title: text shown above the chart, also used as the output file name.
        The figure is always written as '<title>.pdf'. A title that already
        ends in '.pdf' is used unchanged as the file name, and the suffix is
        dropped from the text shown on the chart.
    """
    pdf_suffix = '.pdf'
    if title.lower().endswith(pdf_suffix):
        # Caller passed a file name as the title: keep the file name, but do
        # not show the extension in the chart heading.
        output_path = title
        display_title = title[:-len(pdf_suffix)]
    else:
        # Without the extension the saved file would have no '.pdf' suffix.
        output_path = f"{title}{pdf_suffix}"
        display_title = title

    data = test_dataset[column].value_counts()
    plt.figure(figsize=(10,7))
    plt.pie(data, labels=data.index,colors=plt.cm.Set2.colors, autopct='%1.1f%%', startangle=140,textprops={'fontsize': 15})
    plt.title(display_title)
    plt.savefig(output_path, format='pdf')
    plt.show()

In [None]:
column = 'contract_type'
title = 'Distribution of Contract Types in the Test Dataset'

# Fixed colour per contract type, keyed by the labels this chart looks up from
# the test dataset's 'contract_type' column.
contract_type_colors = {
    'implementation agreement': 'powderblue',
    'terms and conditions': 'lightcoral',
    'vendor agreement': 'lightblue',
    'non-disclosure agreement': 'lightgoldenrodyellow',
    'SaaS (software as a service)':'thistle'
}

# Number of rows per contract type, and one colour per slice in the same order.
data = test_dataset[column].value_counts()
colors = [contract_type_colors[contract_type] for contract_type in data.index]

fig, ax = plt.subplots(figsize=(10,7))
ax.pie(
    data,
    labels=data.index,
    colors=colors,
    autopct='%1.1f%%',
    startangle=140,
    textprops={'fontsize': 15},
)
ax.set_title(title)

# Save before showing the figure.
fig.savefig(f"{title}.pdf", format='pdf')
plt.show()

In [None]:
# Pie chart of the value counts of the test dataset's 'representing' column.
plot_pie_chart('representing', 'Distribution of Representing Sides in the Test Dataset.pdf')

In [None]:
# Pie chart of the value counts of the test dataset's 'ground_truth_label' column.
plot_pie_chart('ground_truth_label', 'Risk Classes in the Test Dataset')
