# Dataset Preparation

In [4]:
import os
import re
import json
from sklearn.model_selection import train_test_split


def _token_spans(text, separator):
    """Return the ``(start, end)`` character span of every token in *text*.

    The spans correspond one-to-one to ``separator.split(text)``, including
    the empty leading/trailing tokens produced when *text* starts or ends
    with a separator, so downstream consumers that tokenize with the same
    pattern get exactly one span per token.
    """
    spans = []
    cursor = 0
    for gap in separator.finditer(text):
        spans.append((cursor, gap.start()))
        cursor = gap.end()
    spans.append((cursor, len(text)))
    return spans


def _find_free_span(text, needle, taken):
    """Locate the first occurrence of *needle* not overlapping a claimed span.

    *taken* is a ``bytearray`` with one flag per character of *text*; a
    non-zero flag means the character already belongs to an encoded entity.
    Returns ``(start, end)`` or ``None`` when no free occurrence exists
    (an empty *needle* is never considered found).
    """
    if not needle:
        return None
    pos = text.find(needle)
    while pos != -1:
        end = pos + len(needle)
        if not any(taken[pos:end]):
            return pos, end
        pos = text.find(needle, pos + 1)
    return None


def generate_data(dataset):
    """Convert span annotations into per-token BIO labels.

    Each item of *dataset* is read as
    ``item['data']['text']`` (the raw text),
    ``item['annotations'][0]['result']`` (a list of annotations, each with
    ``['value']['text']`` and ``['value']['labels'][0]``) and ``item['id']``.

    The text is tokenized on runs of spaces. Every annotation is matched to
    the first occurrence of its text that is not already covered by another
    (longer) annotation; the first token it touches is tagged ``B_<label>``,
    the remaining ones ``I_<label>``, and every untouched token is ``'O'``.

    Matching is done on character spans of the *original* text rather than by
    rewriting the text with placeholder tags, so an annotation can never match
    inside a previously inserted tag, and raw tokens that merely contain
    ``B_``/``I_`` are not mistaken for entity tags.

    Diagnostics are printed (not raised) when an annotation cannot be located,
    and the set of distinct labels seen is printed before returning.

    Returns:
        dict mapping ``item['id']`` to ``{'text': <raw text>, 'labels': [...]}``
        where ``labels`` has one entry per ``re.split(r'[ ]+', text)`` token.
    """
    separator = re.compile(r'[ ]+')
    all_labels = set()
    final_data = {}

    for data in dataset:
        text = data['data']['text']
        # Longest first, so a short annotation that is also a substring of a
        # longer one cannot steal the longer one's position.
        annotations = sorted(
            data['annotations'][0]['result'],
            key=lambda x: len(x['value']['text']),
            reverse=True,
        )

        spans = _token_spans(text, separator)
        labels = ['O'] * len(spans)
        # Per token: (0 for B_ / 1 for I_, char position) of the tag currently
        # assigned; the smallest key wins when two entities share one token.
        ranks = [None] * len(spans)
        taken = bytearray(len(text))
        labels_encoded = 0

        for annotation in annotations:
            label_name = annotation['value']['labels'][0]
            label_text = annotation['value']['text']

            found = _find_free_span(text, label_text, taken)
            if found is None:
                print('ERROR: label not found in text', label_text)
                continue

            start, end = found
            taken[start:end] = b'\x01' * (end - start)
            labels_encoded += 1

            is_first = True
            for idx, (tok_start, tok_end) in enumerate(spans):
                if tok_start >= end:
                    break
                if tok_end <= start:
                    continue
                # Token overlaps the entity span (it may carry extra
                # punctuation on either side, e.g. "(Delhi)").
                rank = (0 if is_first else 1, max(start, tok_start))
                if ranks[idx] is None or rank < ranks[idx]:
                    ranks[idx] = rank
                    labels[idx] = ('B_' if is_first else 'I_') + label_name
                is_first = False

        # Check whether all labels given are encoded or not
        if labels_encoded != len(annotations):
            print('ERROR: labels not encoded correctly', labels_encoded, len(annotations))
            print('Some labels missed in encoding')

        # `labels` is built with one slot per token span, so it is aligned
        # with re.split(r'[ ]+', text) by construction.
        final_data[data['id']] = {'text': text, 'labels': labels}
        all_labels.update(labels)

    # there should be only 27 labels (13*2 + 1)
    print(all_labels)
    print(len(all_labels))
    return final_data



# Load the raw annotated judgements; `with` guarantees each handle is closed
# (and, for writes below, flushed) instead of relying on garbage collection.
with open('NER_TRAIN_JUDGEMENT.json', 'r', encoding='utf-8') as f:
    train = json.load(f)
# Hold out 15% of the training items for validation; fixed seed for
# reproducible splits.
train_split, val_split = train_test_split(train, test_size=0.15, random_state=42)
with open('NER_TEST_JUDGEMENT.json', 'r', encoding='utf-8') as f:
    test_split = json.load(f)

# Persist the raw splits so later cells can compare them with the encoded data.
for path, split in (
    ('NER_train_split.json', train_split),
    ('NER_val_split.json', val_split),
    ('NER_test_split.json', test_split),
):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(split, f)

# Encode every split into per-token BIO labels.
train_data = generate_data(train_split)
val_data = generate_data(val_split)
test_data = generate_data(test_split)

for path, encoded in (
    ('NER_train.json', train_data),
    ('NER_val.json', val_data),
    ('NER_test.json', test_data),
):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(encoded, f)

{'B_PRECEDENT', 'B_PETITIONER', 'B_JUDGE', 'B_OTHER_PERSON', 'I_PRECEDENT', 'B_GPE', 'I_PROVISION', 'O', 'B_STATUTE', 'B_RESPONDENT', 'B_COURT', 'I_RESPONDENT', 'B_DATE', 'I_OTHER_PERSON', 'I_DATE', 'I_GPE', 'I_COURT', 'I_CASE_NUMBER', 'B_WITNESS', 'B_PROVISION', 'B_ORG', 'B_CASE_NUMBER', 'I_STATUTE', 'I_ORG', 'I_WITNESS', 'I_JUDGE', 'I_PETITIONER'}
27
{'B_PRECEDENT', 'B_JUDGE', 'B_PETITIONER', 'B_OTHER_PERSON', 'I_PRECEDENT', 'B_GPE', 'I_PROVISION', 'O', 'B_STATUTE', 'B_RESPONDENT', 'B_COURT', 'I_RESPONDENT', 'B_DATE', 'I_OTHER_PERSON', 'I_DATE', 'I_GPE', 'I_COURT', 'I_CASE_NUMBER', 'B_WITNESS', 'B_PROVISION', 'B_ORG', 'B_CASE_NUMBER', 'I_STATUTE', 'I_ORG', 'I_WITNESS', 'I_JUDGE', 'I_PETITIONER'}
27
{'B_PRECEDENT', 'B_JUDGE', 'B_PETITIONER', 'I_PRECEDENT', 'B_OTHER_PERSON', 'B_GPE', 'I_PROVISION', 'O', 'B_STATUTE', 'B_RESPONDENT', 'B_COURT', 'I_RESPONDENT', 'B_DATE', 'I_OTHER_PERSON', 'I_DATE', 'I_GPE', 'I_COURT', 'I_CASE_NUMBER', 'B_WITNESS', 'B_PROVISION', 'B_ORG', 'B_CASE_NUMBER', 

In [5]:
import json

# Reload the encoded datasets and the raw splits; `with` closes every handle
# deterministically instead of leaving it to garbage collection.
with open('NER_train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)
with open('NER_val.json', 'r', encoding='utf-8') as f:
    val_data = json.load(f)
with open('NER_test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

with open('NER_train_split.json', 'r', encoding='utf-8') as f:
    train_split_data = json.load(f)
with open('NER_val_split.json', 'r', encoding='utf-8') as f:
    val_split_data = json.load(f)
with open('NER_test_split.json', 'r', encoding='utf-8') as f:
    test_split_data = json.load(f)


# Sanity check on the validation split: each annotation should have produced
# exactly one B_ tag, so the two counts must agree for every case.
for data in val_split_data:
    case_id = data['id']
    labels = val_data[case_id]['labels']
    count_B = sum(1 for label in labels if label.startswith('B_'))
    annotations = data['annotations'][0]['result']
    if count_B != len(annotations):
        print('ERROR: labels not encoded correctly', count_B, len(annotations))
        print('Some labels missed in encoding', case_id)
    

ERROR: labels not encoded correctly 3 4
Some labels missed in encoding 8b001e58548947a78de2312ec219f955
ERROR: labels not encoded correctly 3 4
Some labels missed in encoding c2ad532ec4154454886727b5cc820f6a
