# Automating Contract Review With Transformer Models

##### Task 1: Import the Libraries

In [2]:
import os
import json 
import torch
import matplotlib.pyplot as plt 
import numpy as np 
import pandas as pd 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

##### Task 2: Generate Dataset Files

In [44]:
def _iter_annotation_rows(data):
    """Yield one {'text', 'hypothesis', 'labels'} record per annotated choice.

    Walks data['documents'] in order; for every annotation set of a document,
    each entry of its 'annotations' mapping that is a dict containing a
    'choice' key produces one record. The hypothesis sentence is looked up in
    data['labels'] under the same key as the annotation.
    """
    for doc in data['documents']:
        for annotation_set in doc.get('annotation_sets', []):
            annotations = annotation_set.get('annotations', {})
            for label_key in annotations:
                annotation = annotations[label_key]
                if isinstance(annotation, dict) and 'choice' in annotation:
                    yield {
                        'text': doc['text'],
                        'hypothesis': data['labels'][label_key]['hypothesis'],
                        'labels': annotation['choice'],
                    }


def convert_json_to_csv(json_dir, filename, count=None):
    """Extract (text, hypothesis, label) rows from a JSON file and save them as CSV.

    Args:
        json_dir: Path of the JSON file to read. It must contain a 'documents'
            list and a 'labels' mapping (label key -> {'hypothesis': ...}).
        filename: Path of the CSV file to write. The file always has the
            columns 'text', 'hypothesis', 'labels' and no index column.
        count: Maximum number of rows to extract. ``None`` (the default)
            extracts every annotated choice; zero or a negative value
            produces a header-only CSV.

    Raises:
        KeyError: If a document has no 'text', or an annotation key has no
            matching entry (with a 'hypothesis') in the 'labels' mapping.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(json_dir, 'r', encoding='utf-8') as f:
        data = json.load(f)

    rows = []
    if count is None or count > 0:
        for row in _iter_annotation_rows(data):
            rows.append(row)
            # Stop scanning the whole file as soon as the limit is reached.
            if count is not None and len(rows) >= count:
                break

    # Passing `columns` keeps the header (and column order) even with no rows.
    df = pd.DataFrame(rows, columns=['text', 'hypothesis', 'labels'])
    # Save the DataFrame to a CSV file
    df.to_csv(filename, index=False)

In [45]:
# Generate the CSV file for each split: 35 training, 10 validation and 5 testing rows.
convert_json_to_csv(
    json_dir='/usercode/Contract-nli Dataset/train.json',
    filename='/usercode/Contract-nli Dataset/train.csv',
    count=35,
)
convert_json_to_csv(
    json_dir='/usercode/Contract-nli Dataset/valid.json',
    filename='/usercode/Contract-nli Dataset/valid.csv',
    count=10,
)
convert_json_to_csv(
    json_dir='/usercode/Contract-nli Dataset/test.json',
    filename='/usercode/Contract-nli Dataset/test.csv',
    count=5,
)

##### Task 3: Calculate Dataset Statistics

In [49]:
# 1. Load the training, validation and testing splits from their CSV files
train_df, valid_df, test_df = map(
    pd.read_csv,
    (
        '/usercode/Contract-nli Dataset/train.csv',
        '/usercode/Contract-nli Dataset/valid.csv',
        '/usercode/Contract-nli Dataset/test.csv',
    ),
)



In [50]:
# 2. Preview the training dataset: its first 5 rows, then its last 5 rows
display(train_df.head(5), train_df.tail(5))


Unnamed: 0,text,hypothesis,labels
0,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall not reverse engineer any...,NotMentioned
1,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall destroy or return some C...,Entailment
2,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Agreement shall not grant Receiving Party any ...,Entailment
3,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall not disclose the fact th...,Entailment
4,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Confidential Information shall only include te...,NotMentioned


Unnamed: 0,text,hypothesis,labels
30,NON-DISCLOSURE AGREEMENT AND TERMS OF PARTICIP...,Receiving Party shall notify Disclosing Party ...,Entailment
31,NON-DISCLOSURE AGREEMENT AND TERMS OF PARTICIP...,Receiving Party may acquire information simila...,Entailment
32,NON-DISCLOSURE AGREEMENT AND TERMS OF PARTICIP...,Receiving Party may share some Confidential In...,Contradiction
33,NON-DISCLOSURE AGREEMENT AND TERMS OF PARTICIP...,Receiving Party shall not use any Confidential...,Entailment
34,Mutual Non-Disclosure and Use of Information A...,Receiving Party shall not reverse engineer any...,NotMentioned


In [51]:
# 3. Display the dimensions of the training data as a (rows, columns) tuple
train_df.shape

(35, 3)

In [52]:
# 4. Display a summary of the training DataFrame: index range, column names,
#    non-null counts, dtypes and memory usage
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        35 non-null     object
 1   hypothesis  35 non-null     object
 2   labels      35 non-null     object
dtypes: object(3)
memory usage: 968.0+ bytes


In [53]:
# 5. Show the statistical summary of each split: training, validation, then testing
display(
    train_df.describe(),
    valid_df.describe(),
    test_df.describe(),
)

Unnamed: 0,text,hypothesis,labels
count,35,35,35
unique,3,17,3
top,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall not reverse engineer any...,Entailment
freq,17,3,18


Unnamed: 0,text,hypothesis,labels
count,10,10,10
unique,1,10,2
top,OISAIR PROJECT\nTWO-WAY CONFIDENTIALITY AND NO...,Receiving Party shall not reverse engineer any...,Entailment
freq,10,1,9


Unnamed: 0,text,hypothesis,labels
count,5,5,5
unique,1,5,3
top,NON-DISCLOSURE AGREEMENT\nRequired under JEA's...,Receiving Party shall not reverse engineer any...,NotMentioned
freq,5,1,2


##### Task 4: Create Visualization Function for Features

In [5]:
# Write visualize_features() function


In [None]:
# Call visualize_features() function for training, validation and testing


##### Task 5: Create Visualization Function for Labels

In [6]:
# Write visualize_label() function


In [None]:
# Call visualize_label() function for training, validation and testing.


##### Task 6: Load the Tokenizer and Model

In [7]:
# ALBERT tokenizer and model


In [None]:
# DistilBERT tokenizer and model


##### Task 7: Encode the Features

In [None]:
# Write tokenize_data() function


In [None]:
# Call tokenize_data() function for training, validation and testing.
## ALBERT model

## DistilBERT model


##### Task 8: Encode the Label

##### Task 9: Prepare Dataset for the Model

In [None]:
# Write ContractNLIDataset() class. 


In [None]:
# Create the objects from ContractNLIDataset() class for training, validation and testing.
## ALBERT model

## DistilBERT model


##### Task 10: Fine-Tune the Selected Models

In [None]:
# Write compute_metrics() function. 


In [None]:
# Configure the training settings


In [None]:
# Write train_fn() function


In [None]:
# Call train_fn() function
## ALBERT model

## DistilBERT model


##### Task 11: Test the Selected Models

In [None]:
# Write predict_and_save_results() function


In [None]:
# Call predict_and_save_results() function.
## ALBERT model

## DistilBERT model


##### Task 12: Identify Incorrect Predictions

In [None]:
## ALBERT model

## DistilBERT model


##### Task 13: Categorize the Errors

In [None]:
## ALBERT model

## DistilBERT model


##### Task 14: Visualize Error Categories

In [None]:
## ALBERT model


In [None]:
## DistilBERT model