# In [10]:
import json
from collections import Counter

from sklearn.model_selection import train_test_split

# Load the raw train and test annotation files. The encoding is pinned to
# UTF-8 (the JSON interchange encoding) so reading does not depend on the
# platform's default locale.
with open('./NER_TRAIN_JUDGEMENT.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('./NER_TEST_JUDGEMENT.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Build one stratification key per training record: the entity label that
# occurs most often in the record's first annotation set, or the string
# 'None' when that set carries no labels at all.
strat_labels = []
for item in train_data:
    label_counts = Counter(
        label
        for res in item['annotations'][0]['result']
        for label in res['value']['labels']
    )
    if label_counts:
        # most_common() orders equal counts by first occurrence, so ties are
        # resolved the same way on every run. Picking from a set of strings
        # would depend on hash randomisation and make the split below
        # irreproducible despite the fixed random_state.
        most_common = label_counts.most_common(1)[0][0]
    else:
        most_common = 'None'

    strat_labels.append(most_common)

# Hold out 15% of the training records as a validation set, keeping the
# distribution of dominant labels similar in both parts.
train_data, val_data = train_test_split(
    train_data,
    test_size=0.15,
    random_state=42,
    stratify=strat_labels,
)

def bio_labelling(text, annotations):
    """Assign a BIO tag to every whitespace-delimited token of ``text``.

    Args:
        text: The raw document string. Tokens are the result of
            ``text.split()``.
        annotations: Iterable of dicts, each with a ``'value'`` dict holding
            ``'start'`` (character offset of the entity in ``text``),
            ``'labels'`` (list whose first element is the entity label) and
            ``'text'`` (the entity surface string).

    Returns:
        A list with one tag per token: ``'B_<label>'`` on the token where an
        entity starts, ``'I_<label>'`` on the following tokens of that entity,
        and ``'O'`` everywhere else.

    The entity's first token is the one containing the ``start`` offset (or
    the next token when ``start`` points at whitespace). The entity then
    covers as many tokens as its surface string has whitespace-separated
    words, truncated at the end of the text. Annotations whose ``start`` lies
    beyond the last token are ignored. Later annotations overwrite earlier
    ones on overlapping tokens.
    """
    words = text.split()
    labels = ['O'] * len(words)

    # Character offset just past the end of each token, in token order.
    # Searching from the previous token's end guarantees each token is located
    # at its own position even when the same word occurs several times.
    token_ends = []
    cursor = 0
    for word in words:
        cursor = text.index(word, cursor) + len(word)
        token_ends.append(cursor)

    for anno in annotations:
        value = anno['value']
        start = value['start']
        label = value['labels'][0]
        # An entity always covers at least the token it starts in.
        span_len = max(1, len(value['text'].split()))

        # The first token that ends after `start` is the token containing that
        # offset. Matching by position rather than by word string is what
        # keeps repeated words from being tagged at their first occurrence.
        start_i = next(
            (i for i, token_end in enumerate(token_ends) if token_end > start),
            None,
        )
        if start_i is None:
            continue

        end_i = min(start_i + span_len, len(labels))
        labels[start_i] = 'B_' + label
        for i in range(start_i + 1, end_i):
            labels[i] = 'I_' + label

    return labels

def preparation(data):
    """Convert raw annotated records into an id-keyed dict of text and labels.

    Args:
        data: Iterable of records. Each record provides ``'id'``,
            ``'data'['text']`` and ``'annotations'[0]['result']``.

    Returns:
        A dict mapping each record's id to ``{'text': ..., 'labels': ...}``,
        where ``labels`` is the output of ``bio_labelling`` for that record.
        If two records share an id, the later one replaces the earlier one.
    """
    prepared = {}
    for record in data:
        record_id = record['id']
        raw_text = record['data']['text']
        # Only the first annotation set of each record is used.
        first_result = record['annotations'][0]['result']
        prepared[record_id] = {
            'text': raw_text,
            'labels': bio_labelling(raw_text, first_result),
        }
    return prepared

# Convert every split into the id -> {'text', 'labels'} mapping.
train_proc = preparation(train_data)
test_proc = preparation(test_data)
val_proc = preparation(val_data)

# Write the processed splits as pretty-printed JSON, in the order
# train, validation, test.
for out_path, processed in (
    ('./NER_train.json', train_proc),
    ('./NER_val.json', val_proc),
    ('./NER_test.json', test_proc),
):
    with open(out_path, 'w') as f:
        json.dump(processed, f, indent=4)