### Dataset inconsistencies

In [26]:
import json
# Load both annotation files so the same contracts can be compared across them.
# JSON is UTF-8 by specification; passing the encoding explicitly keeps the
# result independent of the platform's default locale encoding.
with open('../data/train_separate_questions_full.json', encoding='utf-8') as f:
    train_dict = json.load(f)
with open('../data/CUADv1.json', encoding='utf-8') as f:
    test_dict = json.load(f)


In [63]:
# Contract titles whose annotations are compared between the two dataset files.
known_inconsistent = {
    'VARIABLESEPARATEACCOUNT_04_30_2014-EX-13.C-UNCONDITIONAL CAPITAL MAINTENANCE AGREEMENT',
    'PACIFICSYSTEMSCONTROLTECHNOLOGYINC_08_24_2000-EX-10.53-SPONSORSHIP AGREEMENT',
}

In [64]:
from collections import defaultdict
contract_count = defaultdict(int)
# For each flagged contract in the training file, print its title, the number
# of answered questions in its first paragraph, and the list of
# (clause name, first answer start offset) pairs for those questions.
for contract in train_dict['data']:
    if contract['title'] not in known_inconsistent:
        continue
    print(contract['title'])
    # The clause name is the text between the first pair of double quotes in
    # the question; questions without any answer are skipped.
    labelled = [
        (qa['question'].split('"')[1], qa['answers'][0]["answer_start"])
        for qa in contract['paragraphs'][0]['qas']
        if qa['answers']
    ]
    print(len(labelled))
    print(labelled)
    

VARIABLESEPARATEACCOUNT_04_30_2014-EX-13.C-UNCONDITIONAL CAPITAL MAINTENANCE AGREEMENT
14
[('Document Name', 175), ('Parties', 336), ('Parties', 449), ('Parties', 438), ('Parties', 481), ('Agreement Date', 302), ('Effective Date', 302), ('Expiration Date', 7938), ('Notice Period To Terminate Renewal', 8377), ('Notice Period To Terminate Renewal', 8045), ('Governing Law', 13131), ('Minimum Commitment', 15110), ('Minimum Commitment', 1280), ('Minimum Commitment', 3516)]
PACIFICSYSTEMSCONTROLTECHNOLOGYINC_08_24_2000-EX-10.53-SPONSORSHIP AGREEMENT
9
[('Document Name', 33), ('Parties', 4033), ('Parties', 541), ('Parties', 107), ('Parties', 334), ('Parties', 139), ('Agreement Date', 468), ('Governing Law', 3103), ('Anti-Assignment', 2237)]


In [65]:

# Same inspection as for the training file, run on the second file: print the
# title, the number of answered questions in the first paragraph, and the
# (clause name, first answer start offset) pairs of every flagged contract.
for contract in test_dict['data']:
    if contract['title'] not in known_inconsistent:
        continue
    print(contract['title'])
    # The clause name is the text between the first pair of double quotes in
    # the question; questions without any answer are skipped.
    labelled = [
        (qa['question'].split('"')[1], qa['answers'][0]["answer_start"])
        for qa in contract['paragraphs'][0]['qas']
        if qa['answers']
    ]
    print(len(labelled))
    print(labelled)

VARIABLESEPARATEACCOUNT_04_30_2014-EX-13.C-UNCONDITIONAL CAPITAL MAINTENANCE AGREEMENT
8
[('Document Name', 175), ('Parties', 449), ('Agreement Date', 302), ('Effective Date', 302), ('Expiration Date', 7938), ('Notice Period To Terminate Renewal', 8045), ('Governing Law', 13131), ('Minimum Commitment', 3516)]
PACIFICSYSTEMSCONTROLTECHNOLOGYINC_08_24_2000-EX-10.53-SPONSORSHIP AGREEMENT
5
[('Document Name', 33), ('Parties', 107), ('Agreement Date', 468), ('Governing Law', 3103), ('Anti-Assignment', 2237)]


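It is very clear that the annotations are not the same across the two datasets: for the same contracts, `train_separate_questions_full.json` contains more labelled answers than `CUADv1.json`. We use `train_separate_questions`, as this is the file the CUAD source code refers to.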

## Improving the baseline
This notebook extracts the non-separate train file from the CUAD dataset.

In [72]:
import json
from copy import deepcopy

# Load the separate-questions training file. The encoding is passed explicitly
# because JSON is UTF-8 by specification, while the default used by open()
# depends on the platform.
with open('../data/train_separate_questions_full.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [102]:
# Collapse the per-answer questions of every paragraph into one entry per
# question: ids that differ only in the part after their last underscore are
# merged, and their answers are gathered into a single list.
contract_data = {}
for contract in data['data']:
    for para in contract['paragraphs']:
        qas = {}
        for qa in para['qas']:
            # Everything before the last underscore identifies the question.
            qa_title = qa['id'].rpartition('_')[0]
            if qa_title not in qas:
                # First occurrence: copy it so the loaded `data` stays
                # untouched, and give the copy the merged id.
                merged = deepcopy(qa)
                merged['id'] = qa_title
                qas[qa_title] = merged
            elif qa['answers']:
                # Later occurrence: keep every answer rather than only the
                # first one, matching how the first occurrence keeps its full
                # answer list. The answers are copied so the merged entry does
                # not share objects with `data`.
                qas[qa_title]['answers'].extend(deepcopy(qa['answers']))
                qas[qa_title]['is_impossible'] = qa['is_impossible']
        # NOTE(review): keyed by title only, so a contract with several
        # paragraphs keeps just its last paragraph's questions here.
        contract_data[contract['title']] = qas

In [103]:
from copy import deepcopy
# Independent deep copy of the loaded dataset; edits made to `data_2` do not
# modify `data`.
data_2 = deepcopy(data)

In [104]:
# Give every paragraph of the copied dataset the merged question list that was
# stored for its contract title.
for entry in data_2['data']:
    for paragraph in entry['paragraphs']:
        merged_by_id = contract_data[entry['title']]
        paragraph['qas'] = [*merged_by_id.values()]

Validate that the new dataset has the same number of answers as the original.

In [111]:
# Gather the start offset of every answer in the original data, then derive
# the total number of answers and the set of distinct offsets from that list.
ans_1_starts = [
    ans['answer_start']
    for contract in data['data']
    for para in contract['paragraphs']
    for qa in para['qas']
    for ans in qa['answers']
]
ans_1_count = len(ans_1_starts)
ans_1_set = set(ans_1_starts)
            

In [112]:
# Gather the start offset of every answer in the merged data, then derive the
# total number of answers and the set of distinct offsets from that list.
ans_2_starts = [
    ans['answer_start']
    for contract in data_2['data']
    for para in contract['paragraphs']
    for qa in para['qas']
    for ans in qa['answers']
]
ans_2_count = len(ans_2_starts)
ans_2_set = set(ans_2_starts)

In [113]:
# Print both answer totals, then whether the two sets of answer start offsets
# are identical.
for total in (ans_1_count, ans_2_count):
    print(total)
print(ans_1_set == ans_2_set)

22625
22625
True


In [115]:
# Write the merged dataset to disk as JSON, indented by four spaces and with
# dictionary keys sorted.
with open('../data/train.json', 'w') as train_file:
    json.dump(data_2, train_file, indent=4, sort_keys=True)

## How many datapoints are there in the different dataset versions?

In [24]:
import torch
import pandas as pd 
import numpy as np
import plotly.express as px


In [29]:
# Original dataset features (file name contains "only-first-ans_True").
# NOTE(review): this result is bound to `features_org` and immediately replaced
# by the second load below, so only the second file's features remain
# available afterwards — confirm whether a separate name was intended.
# NOTE(review): torch.load deserialises with pickle (by default in many torch
# versions — TODO confirm for the installed one); only load trusted files.
features_org = torch.load('../cuad_training/cuad_lightning/out/dataset-name_CUAD_model-type_roberta_only-first-ans_True_doc-stride_256_dataset-type_eval_predict-file-version_test_features')
# New dataset features (file name contains "only-first-ans_False"); this is the
# value `features_org` holds after the cell has run.
features_org = torch.load('../cuad_training/cuad_lightning/out/dataset-name_CUAD_model-type_roberta_only-first-ans_False_doc-stride_256_dataset-type_eval_predict-file-version_test_features')

In [57]:
from transformers.data.processors.squad import SquadFeatures
from collections import defaultdict

# One row per loaded feature: [contract title, question name, is_impossible].
# NOTE(review): this rebinds `data`, which earlier held the loaded training
# JSON; cells that need the JSON must be re-run from the load cell.
data = []
for feature in features_org:
    # Split on the LAST '__' only, so a contract title that itself contains
    # '__' no longer makes the two-name unpacking fail.
    contract, question = feature.qas_id.rsplit('__', 1)
    data.append([contract, question, feature.is_impossible])

In [58]:
# Tabulate the collected rows; the three list positions become the columns
# `contract`, `question` and `is_impossible`.
df = pd.DataFrame(data, columns=['contract', 'question','is_impossible'])

In [61]:
# Count the rows for each (question, is_impossible) pair. After count() the
# remaining `contract` column holds the number of non-null entries per group,
# and reset_index() turns the group keys back into ordinary columns.
df_g = df.groupby(['question', 'is_impossible']).count().reset_index()
df_g

Unnamed: 0,question,is_impossible,contract
0,Document Name,False,1
1,Document Name,True,5
2,Exclusivity,False,1
3,Exclusivity,True,6
4,License Grant,False,2
5,License Grant,True,5
6,Non-Transferable License,True,7
7,Unlimited/All-You-Can-Eat-License,False,2
8,Unlimited/All-You-Can-Eat-License,True,5


In [62]:
# Bar chart of the grouped counts: one bar group per question, coloured by the
# `is_impossible` flag.
px.bar(
    df_g,
    x='question',
    y='contract',
    color='is_impossible',
    title='Number of contracts per question',
)