## Import library

In [2]:
import os
import numpy as np
import evaluate
import pickle
import random
import tqdm
import matplotlib.pyplot as plt
from decimal import Decimal, getcontext
getcontext().prec = 64
import warnings
warnings.filterwarnings('ignore')
from scipy.special import rel_entr
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import Trainer
from transformers import TrainingArguments
from transformers import get_scheduler
from huggingface_hub import Repository, get_full_repo_name
from accelerate import Accelerator
from tqdm.auto import tqdm
from datasets import *

## Create NER labels

In [3]:
entity = ['PATIENT'   , 'DOCTOR'       , 'USERNAME'  ,
          'PROFESSION',
          'ROOM'      , 'DEPARTMENT'   , 'HOSPITAL'  , 'ORGANIZATION', 'STREET' , 'CITY'    , 'STATE' , 'COUNTRY', 'ZIP'  , 'LOCATION-OTHER', 
          'AGE'       , 
          'DATE'      , 'TIME'         , 'DURATION'  , 'SET'         , 
          'PHONE'     , 'FAX'          , 'EMAIL'     , 'URL'         , 'IPADDR' , 
          'SSN'       , 'MEDICALRECORD', 'HEALTHPLAN', 'ACCOUNT'     , 'LICENSE', 'VECHICLE', 'DEVICE', 'BIOID'  , 'IDNUM']
label_names = ['OTHER']
entity_names = []
entity_count = [0] * len(entity)

for s in entity:
    label_names.append(f'B-{s}')
    label_names.append(f'I-{s}')
    entity_names.append(s)
    
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}
org_id2label = {i: label for i, label in enumerate(entity_names)}
org_label2id = {v: k for k, v in org_id2label.items()}

## Create dataset

In [4]:
# def Spilt2Words(name, f, fa):
#     tok = []
#     ner = []
#     lidx = 0
#     ridx = 0
#     while True:
#         # remove last '\n'
#         ans_info = fa.readline()[:-1].split('\t')
#         # remove normalized DATE/TIME
#         if (ans_info[1] == 'DATE' or ans_info[1] == 'TIME'): ans_info = ans_info[:-1]
            
#         if (ans_info[1] != 'OTHER'): entity_count[org_label2id[ans_info[1]]] += 1
            
#         ent_lidx, ent_ridx = int(ans_info[2]), int(ans_info[3])

#         # find next ans_info
#         while True:
#             word = ''
#             # find next word lidx
#             while True:
#                 nxt_char = f.read(1)
#                 if (nxt_char == ' ' or nxt_char == '\n' or nxt_char == '\t'): 
#                     lidx += 1
#                 else: 
#                     word += nxt_char
#                     break
#             ridx = lidx
#             # find next word ridx
#             while True:
#                 char_pos = f.tell()
#                 nxt_char = f.read(1)
#                 if (nxt_char == ' ' or nxt_char == '\n' or nxt_char == '\t' or ridx + 1 == ent_ridx):
#                     ridx += 1
#                     f.seek(char_pos)
#                     break
#                 else:
#                     ridx += 1
#                     word += nxt_char
                
#             line_end = 0
#             # remove '\n' in last word
#             if (word[:-1] == '\n'): 
#                 line_end = 1
#                 word = word[:-1]
#             # truncate beginning of the word if it is an entity word
#             while (lidx < ent_lidx and ridx > ent_lidx and ridx <= ent_ridx):
#                 lidx += 1
#                 word = word[1:]
                
#             tok.append(word)
            
#             if (lidx < ent_lidx):
#                 ner.append(label2id['OTHER'])
#             elif (lidx == ent_lidx):
#                 ner.append(label2id['B-' + ans_info[1]])
#             elif (ridx <= ent_ridx):
#                 ner.append(label2id['I-' + ans_info[1]])
            
#             lidx = ridx
            
#             if (ridx == ent_ridx): # found the last word of entity, move to next answer info
#                 break
        
#         info_pos = fa.tell()
#         nxt_info = fa.readline()[:-1].split('\t')
#         fa.seek(info_pos)
#         # nxt_info is in next file
#         if (nxt_info[0] != name): 
#             break
#         # nxt_info is in current file but has overlap in current info
#         if (int(nxt_info[3]) <= ent_ridx):
#             nxt_info = fa.readline()
            
#     return tok, ner

In [5]:
# def Segmentation(ds_id, ds_tok, ds_ner, id, tok, ner, l):
#     while (len(ner) >= l):
#         ridx = l
#         k = random.randint(0, 1)
#         while (ridx > 0 and ridx < len(ner) and id2label[ner[ridx]] != 'OTHER'):
#             if (k): 
#                 ridx += 1
#             else:
#                 ridx -= 1
#         if (ridx == 0):
#             ridx = len(ner)
#         elif (ridx < len(ner)):
#             ridx += 1
#         ds_id.append(id)
#         ds_tok.append(tok[:ridx])
#         ds_ner.append(ner[:ridx])
#         tok = tok[ridx:]
#         ner = ner[ridx:]
#     if (len(ner) > 0):
#         ds_id.append(id)
#         ds_tok.append(tok)
#         ds_ner.append(ner)
#     return

In [6]:
# ds_dict = {'id':[], 'tokens':[], 'ner_tags':[]}

# fnames = [f for f in os.listdir('./First_Phase_Release(Correction)/First_Phase_Text_Dataset')]
# fnames.sort()

# max_word_length = 80
# fa = open('./First_Phase_Release(Correction)/answer.txt', 'r')
# for fname in tqdm(fnames):
#     f = open(f'./First_Phase_Release(Correction)/First_Phase_Text_Dataset/{fname}', 'r')
#     tok, ner = Spilt2Words(fname[:-4], f, fa)
#     if (max_word_length > 0):
#         Segmentation(ds_dict['id'], ds_dict['tokens'], ds_dict['ner_tags'], fname[:-4], tok, ner, max_word_length)
#     else:
#         ds_dict['id'].append(fname[:-4])
#         ds_dict['tokens'].append(tok)
#         ds_dict['ner_tags'].append(ner)
#     f.close()

In [7]:
# fnames = [f for f in os.listdir('./Second_Phase_Dataset/Second_Phase_Text_Dataset')]
# fnames.sort()

# max_word_length = 80
# fa = open('./Second_Phase_Dataset/answer.txt', 'r')
# for fname in tqdm(fnames):
#     f = open(f'./Second_Phase_Dataset/Second_Phase_Text_Dataset/{fname}', 'r')
#     tok, ner = Spilt2Words(fname[:-4], f, fa)
#     if (max_word_length > 0):
#         Segmentation(ds_dict['id'], ds_dict['tokens'], ds_dict['ner_tags'], fname[:-4], tok, ner, max_word_length)
#     else:
#         ds_dict['id'].append(fname[:-4])
#         ds_dict['tokens'].append(tok)
#         ds_dict['ner_tags'].append(ner)
#     f.close()

## Spilt train & dev data

In [8]:
# def CountSim(train, valid):
#     tcnt = [0] * len(entity)
#     vcnt = [0] * len(entity)
#     for tdata in train:
#         for t in tdata:
#             if (t != 0 and id2label[t][0] != 'I'): tcnt[org_label2id[id2label[t][2:]]] += 1
#     for vdata in valid:
#         for v in vdata:
#             if (v != 0 and id2label[v][0] != 'I'): vcnt[org_label2id[id2label[v][2:]]] += 1
#     tsum = sum(tcnt)
#     vsum = sum(vcnt)
#     dist = 0
#     for i in range(len(entity)):
#         tcnt[i] = tcnt[i]/tsum
#         vcnt[i] = vcnt[i]/vsum
#         dist += abs(tcnt[i] - vcnt[i]) * abs(tcnt[i] - vcnt[i])
#     return tcnt, vcnt, dist

In [9]:
# best_ds_train_valid = Dataset.from_dict(ds_dict).train_test_split(train_size=0.9)
# best_tpor = [0] * len(entity)
# best_vpor = [0] * len(entity)
# best_dist = 1
# upper_bound = 2e-5
# try_step = 1000
# while (best_dist > upper_bound):
#     for i in tqdm(range(try_step)):
#         cur_ds_train_valid = Dataset.from_dict(ds_dict).train_test_split(train_size=0.8)
#         cur_tpor, cur_vpor, cur_dist = CountSim(cur_ds_train_valid['train']['ner_tags'], cur_ds_train_valid['test']['ner_tags'])
#         if (cur_dist < best_dist):
#             best_ds_train_valid = cur_ds_train_valid
#             best_tpor = cur_tpor
#             best_vpor = cur_vpor
#             best_dist = cur_dist
#             print(f'New smallest dist = {best_dist}')
    
# x = np.arange(len(entity_names))
# width = 0.4
# plt.figure(figsize=(12.8, 4.8))
# plt.bar(x, best_tpor, width, color='green', label='Train')
# plt.bar(x + width, best_vpor, width, color='blue', label='Dev')
# plt.xticks(x + width / 2, entity_names, rotation='vertical')
# plt.ylabel('Porpotion')
# plt.title('TrainDev distribution')
# plt.legend()
# plt.savefig('TrainDev distribution')
# plt.show()

In [10]:
# raw_ds = DatasetDict({'train': best_ds_train_valid['train'],
#                   'validation': best_ds_train_valid['test']})

In [11]:
raw_ds = load_from_disk("./ner_dataset/")

In [12]:
raw_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 9575
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2394
    })
})

## Tokenize data

In [14]:
model_name = "hfl/english-pert-large"

model_checkpoint = model_name
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [16]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [18]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'], truncation=True, is_split_into_words=True
    )
    all_labels = examples['ner_tags']
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs['labels'] = new_labels
    return tokenized_inputs

In [19]:
tokenized_datasets = raw_ds.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_ds['train'].column_names,
)

In [20]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 9575
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2394
    })
})

In [21]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch['labels']

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    3,    4,    4,    4,
            0,    0,    0,    0,    0,    3,    4,    4,    0,    0,    0,    0,
            0,    0,   33,   34,   34,   34,   34,   34,   34,   34,   34,   34,
            0,    0,    0,    0,    3,    4,    3,    0,    0,   31,   32,   32,
           32,   32,    0,    0,    0,    0,    0,    0,    0,    3,    4,    4,
            4,    4, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0, 

## Training config

In [23]:
output_dir = './models/ner/'
#repo = Repository(output_dir, clone_from=repo_name)

In [24]:
metric = evaluate.load('seqeval')

In [25]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        'precision': all_metrics['overall_precision'],
        'recall': all_metrics['overall_recall'],
        'f1': all_metrics['overall_f1'],
        'accuracy': all_metrics['overall_accuracy'],
    }

In [26]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=67, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at hfl/english-pert-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# training_args = TrainingArguments(
#     output_dir=output_dir,
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=10,
#     weight_decay=0.01,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )

# trainer.train()

In [28]:
#labels = raw_ds['train'][0]['ner_tags']
#labels = [label_names[i] for i in labels]
#labels

In [29]:
#predictions = labels.copy()
#predictions[2] = 'OTHER'
#metric.compute(predictions=[predictions], references=[labels])

In [30]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

train_dataloader = DataLoader(
    tokenized_datasets['train'],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

eval_dataloader = DataLoader(
    tokenized_datasets['validation'], collate_fn=data_collator, batch_size=8
)

optimizer = AdamW(model.parameters(), lr=2e-5)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# accelerator = Accelerator()
accelerator = Accelerator(cpu=True)

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at hfl/english-pert-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
#model_name = 'bert-finetuned-ner-accelerate'
#repo_name = get_full_repo_name(model_name)
#repo_name

In [32]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

## Training

In [33]:
progress_bar = tqdm(range(num_training_steps))
f1_score = []

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch['labels']

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    f1_score.append(results['overall_f1'])
    print(
        f'epoch {epoch}:',
        {
            key: results[f'overall_{key}']
            for key in ['precision', 'recall', 'f1', 'accuracy']
        },
    )

    #Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        #repo.push_to_hub(
        #    commit_message=f'Training in progress epoch {epoch}', blocking=False
        #)

  0%|          | 0/11970 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
print(results)
# {'AGE': {'precision': 0.9565217391304348, 'recall': 0.88, 'f1': 0.9166666666666666, 'number': 25}, 'CITY': {'precision': 0.9893048128342246, 'recall': 0.9840425531914894, 'f1': 0.9866666666666667, 'number': 188}, 'DATE': {'precision': 0.9914984059511158, 'recall': 0.9978609625668449, 'f1': 0.9946695095948828, 'number': 935}, 'DEPARTMENT': {'precision': 0.9478672985781991, 'recall': 0.9302325581395349, 'f1': 0.9389671361502349, 'number': 215}, 'DOCTOR': {'precision': 0.9845201238390093, 'recall': 0.9845201238390093, 'f1': 0.9845201238390093, 'number': 1292}, 'DURATION': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 3}, 'HOSPITAL': {'precision': 0.9827089337175793, 'recall': 0.9798850574712644, 'f1': 0.981294964028777, 'number': 348}, 'IDNUM': {'precision': 0.9876373626373627, 'recall': 0.9930939226519337, 'f1': 0.9903581267217632, 'number': 724}, 'LOCATION-OTHER': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'MEDICALRECORD': {'precision': 0.9971830985915493, 'recall': 0.9943820224719101, 'f1': 0.9957805907172996, 'number': 356}, 'ORGANIZATION': {'precision': 0.9090909090909091, 'recall': 0.8333333333333334, 'f1': 0.8695652173913043, 'number': 24}, 'PATIENT': {'precision': 0.9831460674157303, 'recall': 0.9915014164305949, 'f1': 0.9873060648801127, 'number': 353}, 'PHONE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'SET': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'STATE': {'precision': 0.9942528735632183, 'recall': 0.9885714285714285, 'f1': 0.991404011461318, 'number': 175}, 'STREET': {'precision': 0.9888888888888889, 'recall': 0.978021978021978, 'f1': 0.9834254143646408, 'number': 182}, 'TIME': {'precision': 0.9772727272727273, 'recall': 0.9641255605381166, 'f1': 0.9706546275395034, 'number': 223}, 'ZIP': {'precision': 1.0, 'recall': 0.9731182795698925, 'f1': 0.9863760217983651, 'number': 186}, 'overall_precision': 0.9833652007648184, 'overall_recall': 0.9835532606616944, 'overall_f1': 0.9834592217229181, 'overall_accuracy': 0.999238331528254}

{'AGE': {'precision': 0.9565217391304348, 'recall': 0.88, 'f1': 0.9166666666666666, 'number': 25}, 'CITY': {'precision': 0.9893048128342246, 'recall': 0.9840425531914894, 'f1': 0.9866666666666667, 'number': 188}, 'DATE': {'precision': 0.9914984059511158, 'recall': 0.9978609625668449, 'f1': 0.9946695095948828, 'number': 935}, 'DEPARTMENT': {'precision': 0.9478672985781991, 'recall': 0.9302325581395349, 'f1': 0.9389671361502349, 'number': 215}, 'DOCTOR': {'precision': 0.9845201238390093, 'recall': 0.9845201238390093, 'f1': 0.9845201238390093, 'number': 1292}, 'DURATION': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 3}, 'HOSPITAL': {'precision': 0.9827089337175793, 'recall': 0.9798850574712644, 'f1': 0.981294964028777, 'number': 348}, 'IDNUM': {'precision': 0.9876373626373627, 'recall': 0.9930939226519337, 'f1': 0.9903581267217632, 'number': 724}, 'LOCATION-OTHER': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'MEDICALRECORD': {'precision': 0.9971830985915493, 'reca

## Draw f1 score

In [250]:
f1_dict = {'AGE': {'precision': 0.9565217391304348, 'recall': 0.88, 'f1': 0.9166666666666666, 'number': 25}, 'CITY': {'precision': 0.9893048128342246, 'recall': 0.9840425531914894, 'f1': 0.9866666666666667, 'number': 188}, 'DATE': {'precision': 0.9914984059511158, 'recall': 0.9978609625668449, 'f1': 0.9946695095948828, 'number': 935}, 'DEPARTMENT': {'precision': 0.9478672985781991, 'recall': 0.9302325581395349, 'f1': 0.9389671361502349, 'number': 215}, 'DOCTOR': {'precision': 0.9845201238390093, 'recall': 0.9845201238390093, 'f1': 0.9845201238390093, 'number': 1292}, 'DURATION': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 3}, 'HOSPITAL': {'precision': 0.9827089337175793, 'recall': 0.9798850574712644, 'f1': 0.981294964028777, 'number': 348}, 'IDNUM': {'precision': 0.9876373626373627, 'recall': 0.9930939226519337, 'f1': 0.9903581267217632, 'number': 724}, 'LOCATION-OTHER': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'MEDICALRECORD': {'precision': 0.9971830985915493, 'recall': 0.9943820224719101, 'f1': 0.9957805907172996, 'number': 356}, 'ORGANIZATION': {'precision': 0.9090909090909091, 'recall': 0.8333333333333334, 'f1': 0.8695652173913043, 'number': 24}, 'PATIENT': {'precision': 0.9831460674157303, 'recall': 0.9915014164305949, 'f1': 0.9873060648801127, 'number': 353}, 'PHONE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'SET': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'STATE': {'precision': 0.9942528735632183, 'recall': 0.9885714285714285, 'f1': 0.991404011461318, 'number': 175}, 'STREET': {'precision': 0.9888888888888889, 'recall': 0.978021978021978, 'f1': 0.9834254143646408, 'number': 182}, 'TIME': {'precision': 0.9772727272727273, 'recall': 0.9641255605381166, 'f1': 0.9706546275395034, 'number': 223}, 'ZIP': {'precision': 1.0, 'recall': 0.9731182795698925, 'f1': 0.9863760217983651, 'number': 186}, 'overall_precision': 0.9833652007648184, 'overall_recall': 0.9835532606616944, 'overall_f1': 0.9834592217229181, 'overall_accuracy': 0.999238331528254}

In [257]:
pert_f1_df = pd.DataFrame({
    'Tag': [],
    'Precision': [],
    'Recall': [],
    'F1': [],
    'Number': []
})

for tag, data in f1_dict.items():
    new_row = []
    overall = []

    if tag.find('overall_') != -1:
        continue

    new_row.append(tag)

    for key, val in data.items():
        new_row.append(val)
    
    pert_f1_df.loc[len(pert_f1_df.index)] = new_row

pert_f1_df.loc[len(pert_f1_df.index)] = ['', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy']
pert_f1_df.loc[len(pert_f1_df.index)] = ['', 0.9833652007648184, 0.9835532606616944, 0.9834592217229181, 0.999238331528254]


In [258]:
pert_f1_df

Unnamed: 0,Tag,Precision,Recall,F1,Number
0,AGE,0.956522,0.88,0.916667,25
1,CITY,0.989305,0.984043,0.986667,188
2,DATE,0.991498,0.997861,0.99467,935
3,DEPARTMENT,0.947867,0.930233,0.938967,215
4,DOCTOR,0.98452,0.98452,0.98452,1292
5,DURATION,0.0,0.0,0.0,3
6,HOSPITAL,0.982709,0.979885,0.981295,348
7,IDNUM,0.987637,0.993094,0.990358,724
8,LOCATION-OTHER,0.0,0.0,0.0,0
9,MEDICALRECORD,0.997183,0.994382,0.995781,356


In [259]:
pert_f1_df.to_csv('./Validation_Dataset/pert_ans/f1.csv')

In [None]:
model_name = model_name.replace('/', '_')
plt.plot(f1_score, label = "f1")
# naming the x axis
plt.xlabel('epoch')
# naming the y axis
plt.ylabel('f1 score')
# giving a title to my graph
title = f'{model_name}'
plt.title(title)
# show a legend on the plot
plt.legend()
# store fig
# plt.savefig(model_name)
# function to show the plot
plt.show()
# store score
# with open(title, "wb") as fp:   #Pickling
#     pickle.dump(f1_score, fp)

NameError: name 'model_name' is not defined

## Inference

In [423]:
val_docs = {'id':[], 'doc':[]}
fnames = [f for f in os.listdir('./Validation_Dataset/Validation_Release/')]
fnames.sort()

# max_word_length = 80
# fa = open('./Second_Phase_Dataset/answer.txt', 'r')
for fname in tqdm(fnames):
    f = open(f'./Validation_Dataset/Validation_Release/{fname}', 'r')
    lines = f.read()
    # tok = lines.split()

    val_docs['id'].append(fname[:-4])
    val_docs['doc'].append(lines)

    # tok, ner = Spilt2Words(fname[:-4], f, fa)
    # if (max_word_length > 0):
    #     Segmentation(ds_dict['id'], ds_dict['tokens'], ds_dict['ner_tags'], fname[:-4], tok, ner, max_word_length)
    # else:
    #     ds_dict['id'].append(fname[:-4])
    #     ds_dict['tokens'].append(tok)
    #     ds_dict['ner_tags'].append(ner)
    f.close()

  0%|          | 0/560 [00:00<?, ?it/s]

In [424]:
import nltk
from nltk.tokenize import sent_tokenize

# Download the sentence tokenizer model (run this once)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Claire/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [359]:
# def split_documents(fnames, max_newlines_per_segment, max_sentences_per_segment):
#     result_dict = {}

#     for fname in tqdm(fnames):
#         with open(os.path.join('./Validation_Dataset/Validation_Release', fname), 'r') as file:
#             content = file.read()

#         sentences = sent_tokenize(content)
#         current_segment = []
#         segments = []
#         newline_count = 0

#         for i in range(0, len(sentences), 1):  # Increment by 1 sentence at a time
#             sentence = sentences[i]

#             while sentence.count('\n') > max_newlines_per_segment:
#                 split_index = sentence.find('\n', max_newlines_per_segment)
#                 current_segment.append(sentence[:split_index])
#                 sentence = sentence[split_index + 1:]

#                 if len(current_segment) >= max_sentences_per_segment:
#                     key = f"{fname[:-4]}_{len(segments) + 1}"
#                     result_dict[key] = '\n'.join(current_segment)
#                     current_segment = []
#                     segments.append(key)

#             current_segment.append(sentence)
#             newline_count += sentence.count('\n')

#             if newline_count >= max_newlines_per_segment or len(current_segment) >= max_sentences_per_segment or (i + 1) >= len(sentences):
#                 key = f"{fname[:-4]}_{len(segments) + 1}"
#                 result_dict[key] = '\n'.join(current_segment)
#                 current_segment = []
#                 segments.append(key)
#                 newline_count = 0

#         if current_segment:
#             key = f"{fname[:-4]}_{len(segments) + 1}"
#             result_dict[key] = '\n'.join(current_segment)
#             segments.append(key)

#     return result_dict


In [425]:
import re
def split_documents(fnames, words_per_segment):
    result_dict = {}

    for fname in tqdm(fnames):
        with open(os.path.join('./Validation_Dataset/Validation_Release', fname), 'r') as file:
            content = file.read()

        current_segment = []
        segments = []
        word_count = 0

        # Use a regular expression to split the content into words
        words = content.split(" ")

        for word in words:
            # Check if splitting is needed based on word count
            word_count += 1
            if word_count > words_per_segment:
                key = f"{fname[:-4]}_{len(segments) + 1}"
                result_dict[key] = ' '.join(current_segment)
                current_segment = []
                segments.append(key)
                word_count = 0

            current_segment.append(word)

        # Handle the remaining words after the loop
        if current_segment:
            key = f"{fname[:-4]}_{len(segments) + 1}"
            result_dict[key] = ' '.join(current_segment)
            segments.append(key)

    return result_dict

In [426]:
fnames = [f for f in os.listdir('./Validation_Dataset/Validation_Release/')]
fnames.sort()

max_lines_per_segment = 10
max_sentences_per_segment = 5
max_characters_per_segment = 100
words_per_segment = 80

# result_segments = split_documents(fnames, max_lines_per_segment, max_sentences_per_segment)

val_result_segments = split_documents(fnames, words_per_segment)

# Replace the unique token back to consecutive '\n' characters
# result_segments = {key: segment.replace('<consecutive_newlines>', '\n\n') for key, segment in result_segments.items()}



  0%|          | 0/560 [00:00<?, ?it/s]

In [427]:
# Print the first segment of the first document for demonstration
key_example = list(val_result_segments.keys())[2]
print(f"Segment {key_example}:")
print(val_result_segments[key_example])

# val_docs['doc'][0][1855:].count('\n')

Segment 1001_3:
5 - representative fallopian tube.  
FROZEN SECTION REPORT:
Signet ring carcinoma consistent with lobular carcinoma from breast (pt has past history of lobular Ca with 33 pos LN in 2059)
Result to Dr Sek via anaesthetist by A/Prof X Standrew at 9:30am on 18/3/14.  
  
B.  Specimen labelled "Left tube and ovary" consists of an ovary measuring 45 x 30 x 15mm with attached fallopian tube 40mm in length and up to 12mm in diameter.   Slicing the


In [428]:
list[val_result_segments.keys()]

list[dict_keys(['1001_1', '1001_2', '1001_3', '1001_4', '1001_5', '1001_6', '1001_7', '1002_1', '1002_2', '1002_3', '1002_4', '1002_5', '1002_6', '1003_1', '1003_2', '1003_3', '1003_4', '1003_5', '1003_6', '1003_7', '1004_1', '1004_2', '1004_3', '1004_4', '1004_5', '1004_6', '1005_1', '1005_2', '1005_3', '1005_4', '1005_5', '1005_6', '1005_7', '1005_8', '1005_9', '1005_10', '1005_11', '1005_12', '1006_1', '1006_2', '1006_3', '1006_4', '1007_1', '1007_2', '1007_3', '1007_4', '1007_5', '1007_6', '1008_1', '1008_2', '1008_3', '1009_1', '1009_2', '1009_3', '1009_4', '1011_1', '1011_2', '1011_3', '1011_4', '1011_5', '1011_6', '1011_7', '1011_8', '1011_9', '1011_10', '1011_11', '1011_12', '1012_1', '1012_2', '1012_3', '1012_4', '1012_5', '1013_1', '1013_2', '1013_3', '1013_4', '1013_5', '1014_1', '1014_2', '1014_3', '1014_4', '1014_5', '1014_6', '1014_7', '1014_8', '1014_9', '1014_10', '1014_11', '1015_1', '1015_2', '1019_1', '1019_2', '1019_3', '1019_4', '1019_5', '1021_1', '1021_2', '1021_

In [429]:
# print(len(sent_tokenize(result_segments['650_8'])))
print(val_result_segments['file21703_12'])

Present

Tumour infiltrating lymphocytes: Low density (&amp;lt;5/hpf)

Excision Margins:
Proximal:  100mm
Distal:  100mm
Radial:  Involved (tumour &amp;lt;1mm to radial margin)
Donuts:  Not received

Lymph nodes:
Number positive:  0 (but with extranodal deposits of carcinoma in pericolic adipose tissue - pN1c)
Total number:  12
Apical node:  Not identified

Intramural vein invasion:  Not seen

Extramural vein invasion:  Not seen

Small vessel (lymphatic) invasion:  Suspicious

Perineural invasion:  Not seen

Distant metastases:  Present - omentum, splenic serosa, peritoneum, pleura

Other findings:
Remote colon:  Unremarkable

Treatment effect:  N/A

Ancillary studies: 


In [430]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "./models/ner/"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [431]:
val_result_ans_dict = {}

for fid_sid, seg in val_result_segments.items():
    try:
        val_result_ans_dict[fid_sid] = token_classifier(seg)
    except:
        print(fid_sid)
        break

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [432]:
val_result_ans_dict['1002_6']

[{'entity_group': 'OTHER',
  'score': 0.9997768,
  'word': '- low grade mucinous neoplasm resulting in pseudomyxoma peritonei involving peritoneal fragments and colonic mesentery. para - aortic lymph node : no evidence of malignancy.',
  'start': 2,
  'end': 174},
 {'entity_group': 'DATE',
  'score': 0.99546033,
  'word': '27 / 5 / 64',
  'start': 175,
  'end': 182},
 {'entity_group': 'DOCTOR',
  'score': 0.8952514,
  'word': 'ht',
  'start': 183,
  'end': 185}]

In [396]:
# result_ans_dict_cpy = result_ans_dict.copy()


In [260]:
# from dateparser import parse
# import dateutil.parser as parser

# def handleTimeWithHr(time_str):
#     if 'hrs' in time_str:
#         hr_pos = time_str.find('hrs')
#         time_str = time_str.replace('hrs', '')
#     elif 'Hrs' in time_str:
#         hr_pos = time_str.find('Hrs')
#         time_str = time_str.replace('Hrs', '')
#     elif 'hr' in time_str :
#         hr_pos = time_str.find('hr')
#         time_str = time_str.replace('hr', '')
#     elif 'Hr' in time_str:
#         hr_pos = time_str.find('Hr')
#         time_str = time_str.replace('Hr', '')
#     elif 'hours' in time_str:
#         hr_pos = time_str.find('hours')
#         time_str = time_str.replace('hours', '')
#     elif 'Hours' in time_str:
#         hr_pos = time_str.find('Hours')
#         time_str = time_str.replace('Hours', '')
#     elif 'our' in time_str:
#         hr_pos = time_str.find('hour')
#         time_str = time_str.replace('hour', '')
#     elif 'Hour' in time_str:
#         hr_pos = time_str.find('Hour')
#         time_str = time_str.replace('Hour', '')
#     else:
#         return time_str

#     if time_str[hr_pos-3].isdigit():
#         time_str = time_str[:hr_pos-2] + ':' + time_str[hr_pos-2:]
#     else:
#         time_str = time_str.replace(time_str[hr_pos-3], ':')
#     return time_str

# def time2iso(time_str):
#     time_str = handleTimeWithHr(time_str)
#     try:
#         if '-' in time_str and time_str.count(':') == 2:
#             time_norm_str = parse(time_str, settings={'DATE_ORDER': 'YMD'}).isoformat(timespec="seconds")
#         elif '-' in time_str:
#             time_norm_str = parse(time_str, settings={'DATE_ORDER': 'YMD'}).isoformat(timespec="minutes")
#         else:
#             time_norm_str = parse(time_str, settings={'DATE_ORDER': 'DMY'}).isoformat(timespec="minutes")
#         return time_norm_str
#     except:
#         return time_str

# def date2iso(date_str):
#     try:
#         # print(date_str)
#         if date_str.isdigit() and len(date_str) == 8:
#             # print('before', date_str)
#             date_str = date_str[:4] + '/' + date_str[4:6] + "/" + date_str[6:]
#             # print('after', date_str)
#             iso_str = parse(date_str, settings={'DATE_ORDER': 'YMD'}).isoformat(timespec="hours")
#         elif date_str.isdigit() and len(date_str) == 4:
#             iso_str = parse(date_str, settings={'DATE_ORDER': 'YMD'}).isoformat(timespec="hours")
#             us_pos = iso_str.find('-')
#             return iso_str[:us_pos]
#         else:
#             iso_str = parse(date_str, settings={'DATE_ORDER': 'DMY'}).isoformat(timespec="hours")
#         # print(iso_str)
#         T_pos = iso_str.find('T')
#         return iso_str[:T_pos]
#     except:
#         return date_str

# import re
# from isodate import parse_duration
# def duration2iso(duration_description):
#     try:
#         # Remove non-alphanumeric characters
#         duration_description = re.sub(r'\W+', '', duration_description)
        
#         # Handle 'year' and 'yr' cases
#         if 'year' in duration_description or 'yr' in duration_description:
#             years = re.search(r'\d+', duration_description).group()
#             return f'P{years}Y'
#         elif 'month' in duration_description or 'wk' in duration_description:
#             months = re.search(r'\d+', duration_description).group()
#             return f'P{months}M'
#         elif 'day' in duration_description:
#             days = re.search(r'\d+', duration_description).group()
#             return f'P{days}D'
#         elif 'week' in duration_description or 'wk' in duration_description:
#             weeks = re.search(r'\d+', duration_description).group()
#             return f'P{weeks}W'
#         else:
#             return duration_description
#     except:
#         return duration_description


# def set2iso(set_str):
#     if set_str == 'twice':
#         return 'R2'
#     elif set_str == 'once':
#         return 'R1'
#     else:
#         return set_str

In [433]:
import re
from word2number import w2n

def Normalize(time_type, org):
    nor = ''
    if (time_type == 'DATE'):
        if (re.match('\d{1,2}(\/|\.| |-|,)\d{1,2}(\/|\.| |-|,)\d{2,4}', org)):
            l = re.split('\/|\.| |-|,', org)
            if (len(l[2]) == 2):
                l[2] = '20' + l[2]
            elif (len(l[2]) == 3):
                l[2] = '2' + l[2]
            if (len(l[1]) == 1):
                l[1] = '0' + l[1]
            if (len(l[0]) == 1):
                l[0] = '0' + l[0]
            nor = l[2] + '-' + l[1] + '-' + l[0]
        elif (re.match('\/\d{1,2}\/(\d{2}|\d{4})', org)):
            l = re.split('\/', org)
            if (len(l[1]) == 1):
                l[1] = '0' + l[1]
            if (len(l[2]) == 2):
                l[2] = '20' + l[2]
            nor = l[2] + '-' + l[1]
        elif (re.match('\d{1,2}\/\d{2,5}', org)):
            l = re.split('\/', org)
            if (len(l[0]) == 1):
                l[0] = '0' + l[0]
            if (len(l[1]) == 2):
                nor = '20' + l[1] + '-' + l[0]
            elif (len(l[1]) == 3):
                nor = '20' + l[1][1:] + '-' + '0' + l[1][0] + '-' + l[0]
            elif (len(l[1]) == 4):
                nor = l[1] + '-' + l[0]
            elif (len(l[1]) == 5):
                nor = l[1][1:] + '-' + '0' + l[1][0] + '-' + l[0]
        elif (re.match('\d{8}', org)):
            nor = org[0:4] + '-' + org[4:6] + '-' + org[6:8]
        elif (re.match('\d{4}', org)):
            nor = org
        elif (re.match('\d{3}', org)):
            nor = '2' + org
        elif (re.match('(\d{2}|)(-|)(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(-| )\d{2,4}', org)):
            org = org.replace('Jan', '01')
            org = org.replace('Feb', '02')
            org = org.replace('Mar', '03')
            org = org.replace('Apr', '04')
            org = org.replace('May', '05')
            org = org.replace('Jun', '06')
            org = org.replace('Jul', '07')
            org = org.replace('Aug', '08')
            org = org.replace('Sep', '09')
            org = org.replace('Oct', '10')
            org = org.replace('Nov', '11')
            org = org.replace('Dec', '12')
            l = re.split('-| ', org)
            if (len(l) == 2):
                if (len(l[1]) == 2):
                    l[1] = '20' + l[1]
                elif (len(l[1]) == 3):
                    l[1] = '2' + l[1]
                nor = l[1] + '-' + l[0]
            else:
                if (len(l[2]) == 2):
                    l[2] = '20' + l[2]
                elif (len(l[2]) == 3):
                    l[2] = '2' + l[2]
                nor = l[2] + '-' + l[1] + '-' + l[0]
        elif (re.match('\d{1,2}((st)|(nd)|(rd)|(th)) of (January|February|March|April|May|June|July|August|September|October|November|December) \d{4}', org)):
            org = org.replace('January', '01')
            org = org.replace('Feburary', '02')
            org = org.replace('March', '03')
            org = org.replace('April', '04')
            org = org.replace('May', '05')
            org = org.replace('June', '06')
            org = org.replace('July', '07')
            org = org.replace('August', '08')
            org = org.replace('September', '09')
            org = org.replace('October', '10')
            org = org.replace('November', '11')
            org = org.replace('December', '12')
            l = re.split(' ', org)
            nor = l[3] + '-' + l[2] + '-' + l[0][:-2]
        elif (re.match('(\d{1,2}|)( |)(January|February|March|April|May|June|July|August|September|October|November|December) \d{4}', org)):
            if (re.match('\d', org[0]) and re.match('\d', org[1]) == None):
                org = '0' + org
            org = org.replace('January', '01')
            org = org.replace('Feburary', '02')
            org = org.replace('March', '03')
            org = org.replace('April', '04')
            org = org.replace('May', '05')
            org = org.replace('June', '06')
            org = org.replace('July', '07')
            org = org.replace('August', '08')
            org = org.replace('September', '09')
            org = org.replace('October', '10')
            org = org.replace('November', '11')
            org = org.replace('December', '12')
            org = org.replace(' ', '')
            if (len(org) == 6):
                nor = org[2:] + '-' + org[0:2]
            else:    
                nor = org[4:] + '-' + org[2:4] + '-' + org[0:2]
    elif (time_type == 'TIME'):
        if (re.match('(\d{1,2}(\/|\.)\d{1,2}(\/|\.)\d{2,4}(  | |)|)(at|)( |)\d{1,2}(:|\.)\d{2}(AM|am|PM|pm|Hr|Hrs|hr|hrs|)( on the \d{1,2}((st)|(nd)|(rd)|(th)) of (January|February|March|April|May|June|July|August|September|October|November|December) \d{4}|)', org)):
            tmp = org
            pm = 0
            am = 0
            if (re.search('PM', org, flags=0) != None):
                pm = 1
            if (re.search('pm', org, flags=0) != None):
                pm = 1
            if (re.search('AM', org, flags=0) != None):
                am = 1
            if (re.search('am', org, flags=0) != None):
                am = 1
            get_date = 0
            date = re.search('\d{1,2}(\/|\.)\d{1,2}(\/|\.)\d{2,4}', org, flags=0)
            if (date != None):
                date = date.group(0)
                org = org.replace(date, '')
                date = re.split('\/|\.', date)
                if (len(date[0]) == 1):
                    date[0] = '0' + date[0]
                if (len(date[1]) == 1):
                    date[1] = '0' + date[1]
                if (len(date[2]) == 2):
                    date[2] = '20' + date[2]
                elif (len(date[2]) == 3):
                    date[2] = '2' + date[2]
                nor = date[2] + '-' + date[1] + '-' + date[0]
                get_date = 1
            yyyy = re.search('\d{4}', org, flags=0)
            if (yyyy != None and get_date == 0):
                yyyy = yyyy.group(0)
                org = org.replace(yyyy, '')
                nor = yyyy + '-'
            mm = re.search('January|February|March|April|May|June|July|August|September|October|November|December', org, flags=0)
            if (mm != None and get_date == 0):
                mm = mm.group(0)
                org = org.replace(mm, '')
                mm = mm.replace('January', '01')
                mm = mm.replace('Feburary', '02')
                mm = mm.replace('March', '03')
                mm = mm.replace('April', '04')
                mm = mm.replace('May', '05')
                mm = mm.replace('June', '06')
                mm = mm.replace('July', '07')
                mm = mm.replace('August', '08')
                mm = mm.replace('September', '09')
                mm = mm.replace('October', '10')
                mm = mm.replace('November', '11')
                mm = mm.replace('December', '12')
                nor = nor + mm + '-'
            dd = re.search('\d{1,2}((st)|(nd)|(rd)|(th))', org, flags=0)
            if (dd != None and get_date == 0):
                dd = dd.group(0)
                org = org.replace(dd, '')
                dd = dd.replace('st', '')
                dd = dd.replace('nd', '')
                dd = dd.replace('rd', '')
                dd = dd.replace('th', '')
                if (len(dd) == 1):
                    dd = '0' + dd
                nor = nor + dd
            get_time = 0
            time = re.search('\d{1,2}(:|\.)\d{1,2}', org, flags=0)
            if (time != None):
                time = time.group(0)
                org = org.replace(time, '')
                time = re.split('\.|:', time)
                if (pm == 1 and int(time[0]) < 12):
                    time[0] = str(int(time[0]) + 12)
                elif (am == 1 and int(time[0]) == 12):
                    time[0] = '00'
                if (len(time[0]) == 1):
                    time[0] = '0' + time[0]
                nor = nor + 'T' + time[0] + ':' + time[1]
                get_time = 1
            pm = 0
            am = 0
            if (re.search('pm', org, flags=0) != None):
                pm = 1
            if (re.search('am', org, flags=0) != None):
                am = 1
            time = re.search('\d{1,4}', org, flags=0)
            if (time != None and get_time == 0):
                time = time.group(0)
                org = org.replace(time, '')
                hh, mm = '00', '00'
                if (len(time) == 4):
                    hh = time[0:2]
                    mm = time[2:]
                elif (len(time) == 3):
                    hh = time[0]
                    mm = time[1:]
                elif (len(time) == 2):
                    hh = time
                elif (len(time) == 1):
                    hh = time
                if (pm == 1 and int(hh) < 12):
                    hh = str(int(hh) + 12)
                elif (am == 1 and int(hh) == 12):
                    hh = '00'
                nor = nor + 'T' + hh + ':' + mm    
            #if (nor != ans):    
                #print(f'1:nor={nor}, ans={ans}, org={tmp}')
        elif (re.match('\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', org)):
            tmp = org
            nor = org.replace(' ', 'T')
            #if (nor != ans):    
                #print(f'2:nor={nor}, ans={ans}, org={tmp}')
        elif (re.match('(at |)(\d{1,2}|)(:|\.|)\d{2}( |)(am|pm|Hr|Hrs|hr|hrs|)( on | )(the |)\d{1,2}(\/|\.)\d{2,4}(\/|\.)\d{1,2}', org)):
            tmp = org
            pm = 0
            am = 0
            if (re.search('pm', org, flags=0) != None):
                pm = 1
            if (re.search('am', org, flags=0) != None):
                am = 1
            date = re.search('\d{1,2}(\/|\.)\d{1,2}(\/|\.)\d{2,4}', org, flags=0)
            if (date != None):
                date = date.group(0)
                org = org.replace(date, '')
                date = re.split('\/|\.', date)
                if (len(date[0]) == 1):
                    date[0] = '0' + date[0]
                if (len(date[1]) == 1):
                    date[1] = '0' + date[1]
                if (len(date[2]) == 2):
                    date[2] = '20' + date[2]
                elif (len(date[2]) == 3):
                    date[2] = '2' + date[2]
                nor = date[2] + '-' + date[1] + '-' + date[0] + 'T'
            org = org.replace(':', '')
            time = re.search('\d{1,4}', org, flags=0)
            if (time != None):
                time = time.group(0)
                org = org.replace(time, '')
                hh, mm = '00', '00'
                if (len(time) == 4):
                    hh = time[0:2]
                    mm = time[2:]
                elif (len(time) == 3):
                    hh = time[0]
                    mm = time[1:]
                elif (len(time) == 2):
                    hh = time
                elif (len(time) == 1):
                    hh = time
                if (pm == 1 and int(hh) < 12):
                    hh = str(int(hh) + 12)
                elif (am == 1 and int(hh) == 12):
                    hh = '00'
                nor = nor + hh + ':' + mm
            #if (nor != ans):    
                #print(f'3:nor={nor}, ans={ans}, org={tmp}')
        elif (re.match('((\d{1,2}((pm)|(am)))|(\d{4}(Hr|Hrs|hr|hrs|)))(( on )| )\d{1,2}(\/|\.)\d{1,2}(\/|\.)\d{2,4}', org)):
            tmp = org
            pm = 0
            am = 0
            if (re.search('pm', org, flags=0) != None):
                pm = 1
            if (re.search('am', org, flags=0) != None):
                am = 1
            date = re.search('\d{1,2}(\/|\.)\d{1,2}(\/|\.)\d{2,4}', org, flags=0)
            if (date != None):
                date = date.group(0)
                org = org.replace(date, '')
                date = re.split('\/|\.', date)
                if (len(date[0]) == 1):
                    date[0] = '0' + date[0]
                if (len(date[1]) == 1):
                    date[1] = '0' + date[1]
                if (len(date[2]) == 2):
                    date[2] = '20' + date[2]
                elif (len(date[2]) == 3):
                    date[2] = '2' + date[2]
                nor = date[2] + '-' + date[1] + '-' + date[0] + 'T'
            hrtime = re.search('\d{4}', org, flags=0)
            if (hrtime != None):
                hrtime = hrtime.group(0)
                org = org.replace(hrtime, '')
                nor = nor + hrtime[0:2] + ':' + hrtime[2:]
            time = re.search('\d{1,2}', org, flags=0)
            if (time != None):
                time = time.group(0)
                org = org.replace(time, '')
                hh = time
                if (pm == 1 and int(hh) < 12):
                    hh = str(int(hh) + 12)
                elif (am == 1 and int(hh) == 12):
                    hh = '00'
                if (len(hh) == 1):
                    hh = '0' + hh
                nor = nor + hh + ':' + '00'
            #if (nor != ans):    
                #print(f'4:nor={nor}, ans={ans}, org={tmp}')
    elif (time_type == 'DURATION'):   
        tmp = org
        org = org.replace('one', '1')
        org = org.replace('two', '2')
        org = org.replace('three', '3')
        org = org.replace('four', '4')
        org = org.replace('five', '5')
        num = ''
        alp = ''
        space_idx = org.find(' ')
        for i in range(len(org)):
            if (org[i] == 'D' or org[i] == 'd' or\
                org[i] == 'W' or org[i] == 'w' or\
                org[i] == 'M' or org[i] == 'm' or\
                org[i] == 'Y' or org[i] == 'y') and i > space_idx:
                alp = org[i]
                org = org[:i]
                break
        # print(org, alp)
        org = re.split('-| ', org)
        try:
            if org[0].isalpha():
                org[0] = w2n.word_to_num(org[0])
            # print(org)
            if (len(org) == 1 or org[1] == ''):
                nor = 'P' + str(org[0]) + alp.upper()
            else:
                nor = 'P' + str((int(org[0]) + int(org[1])) / 2) + alp.upper()
        except:
            nor = tmp
        # if (nor != ans):    
        #     print(f'dur:nor={nor}, ans={ans}, org={tmp}')
    elif (time_type == 'SET'):
        if (re.match('twice', org)):
            nor = 'R2'
    return nor


In [434]:
import pandas as pd

In [435]:
val_ans_df = pd.DataFrame({
    'file_id': [],
    'PHI_type': [],
    'PHI_start': [],
    'PHI_end': [],
    'PHI_content': [],
    'ISO': []
})

In [464]:
val_ans_df = pd.DataFrame({
    'file_id': [],
    'PHI_type': [],
    'PHI_start': [],
    'PHI_end': [],
    'PHI_content': [],
    'ISO': []
})

last_fid = ""
last_idx_of_last_seg = 0
for fid_sid, entities in val_result_ans_dict.items():
    curr_fid = fid_sid.split('_')[0]
    curr_sid = fid_sid.split('_')[1]
    # print(fid_sid)

    if curr_fid != last_fid:
        with open(os.path.join('./Validation_Dataset/Validation_Release', curr_fid+'.txt'), 'r') as file:
            content = file.read()
        last_fid = curr_fid
        last_idx_of_last_seg = 0

    # last_idx_of_last_seg = 0

    for i, entity in enumerate(entities):
        new_row = []
        # print(i, entity)

        if i == len(entities) - 1 and entity['entity_group'] == 'OTHER':
            last_idx_of_last_seg += len(val_result_segments[fid_sid])
            continue
        elif entity['entity_group'] != 'OTHER':
            # print(fid_sid)
            # print(last_idx_of_last_seg)
            start_idx = entity['start'] + last_idx_of_last_seg + int(curr_sid) - 1
            end_idx = entity['end'] + last_idx_of_last_seg + int(curr_sid) - 1
            # print('start', start_idx)
            # print(entity['word'])
            # find_idx = content.lower()[start_idx:].find(entity['word']) + start_idx
            # print('find', find_idx)
            # end_idx = find_idx+len(entity['word'])
            word = content[start_idx:end_idx]

            if i == len(entities) - 1:
                last_idx_of_last_seg += len(val_result_segments[fid_sid])

            if len(word) > 1:
                # print(word, start_idx, end_idx)
                while word[0].isalnum() == False or word[-1].isalnum() == False:
                    if word[0].isalnum() == False:
                        word = word[1:]
                        start_idx += 1
                    # print(word, start_idx, end_idx)
                    if word[-1].isalnum() == False:
                        word = word[:-1]
                        end_idx -= 1
            
            if '\n' in word:
                word = word.replace('\n', ' ')

            label = entity['entity_group']

            have_num_or_aplha_desc = False
            if word[0].isdigit() and word[-1].isalpha and content[start_idx-1] == ' ': 
                have_num_or_aplha_desc = True
            elif word.find(' ') != -1:
                have_num_or_aplha_desc = True

            if entity['entity_group'] == 'DURATION' and word != 'twice' and have_num_or_aplha_desc == False:
                word_cpy = word.lower()
                if word_cpy.find('yr') != -1 or word_cpy.find('ye') != -1 or word_cpy.find('m') != -1 or word_cpy.find('w') != -1:
                    last_index = start_idx - 1
                    # case 1: no spcae between number and month, week or year
                    if content[last_index].isdigit():
                        while content[last_index].isdigit():
                            last_index -= 1
                        start_idx = last_index + 1
                        word = content[start_idx:end_idx]
                    else:
                        last_space1 = content.rfind(' ', 0, start_idx)
                        last_space2 = content.rfind(' ', 0, last_space1)
                        start_idx = last_space2 + 1
                        word = content[start_idx:end_idx]
                elif word.isdigit():
                    next_index = end_idx + 1
                    if content[next_index].isalpha():
                        while content[next_index].isalpha():
                            next_index += 1
                        end_idx = next_index
                        word = content[start_idx:end_idx]
                    else:
                        first_space_index = content.find(' ', end_idx)
                        # Find the index of the second space after the target word
                        second_space_index = content.find(' ', first_space_index + 1)
                        end_idx = second_space_index
                        word = content[start_idx:end_idx]   
            elif word == 'twice':
                label = 'SET'
            # if word == 'twice':
            #     label = 'SET'
            
            
            new_row.extend([curr_fid, label, start_idx, end_idx, word])
            

            need_iso = ['DATE', 'TIME', 'DURATION', 'SET']

            if entity['entity_group'] in need_iso:
                new_row.append(Normalize(label, word))
            else:
                new_row.append('')
            
            val_ans_df.loc[len(val_ans_df)] = new_row
        else:
            continue


12
52


In [462]:
val_ans_df

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
0,1001,IDNUM,13,23,88Y206206L,
1,1001,MEDICALRECORD,24,35,8892062.BPL,
2,1001,PATIENT,37,63,"Vatterott, Jerrie CLARENCE",
3,1001,IDNUM,74,82,88Y20620,
4,1001,IDNUM,83,91,88Y20620,
...,...,...,...,...,...,...
8669,file9762,DATE,1861,1868,28/8/68,2068-08-28
8670,file9762,DOCTOR,1904,1912,D Scobie,
8671,file9762,DOCTOR,1920,1929,D D Toren,
8672,file9762,TIME,3664,3683,2015-05-20 00:00:00,2015-05-20T00:00:00


In [463]:
val_ans_df.loc[val_ans_df['PHI_type'] == 'DURATION']

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
378,1029,DURATION,375,381,12 ago,12 ago
822,1055,DURATION,462,475,several years,several years
1106,1068,DURATION,468,474,52 ago,52 ago
1564,180,DURATION,363,370,2 years,P2Y
2669,689,DURATION,372,381,18 months,P18M
5501,887,DURATION,410,411,4,P4
5502,887,DURATION,412,419,5 month,P5M
8312,file30591,DURATION,232,239,6 weeks,P6W


In [None]:
# with open('')

In [441]:
df_crf_val = pd.read_csv('./Validation_Dataset/prediction.csv')
df_crf_duration = df_crf_val.loc[df_crf_val['PHI_type'] == 'DURATION']
df_crf_lo = df_crf_val.loc[df_crf_val['PHI_type'] == 'LOCATION-OTHER']

In [443]:
df_crf_duration

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
2168,689,DURATION,372,381,18 months,P18M
4621,894,DURATION,358,363,16 yr,P16Y
6775,file30591,DURATION,232,239,6 weeks,P6W


In [267]:
val_ans_df.to_csv('./Validation_Dataset/pert_ans/pert_answer.csv')

In [268]:
val_ans_df.to_csv('./Validation_Dataset/pert_ans/pert_answer.txt', sep='\t', header=False, index=False)

## Inference: Test set

In [269]:
test_docs = {'id':[], 'doc':[]}
fnames = [f for f in os.listdir('./opendid_test/opendid_test/')]
fnames.sort()

# max_word_length = 80
# fa = open('./Second_Phase_Dataset/answer.txt', 'r')
for fname in tqdm(fnames):
    f = open(f'./opendid_test/opendid_test/{fname}', 'r')
    lines = f.read()
    # tok = lines.split()

    test_docs['id'].append(fname[:-4])
    test_docs['doc'].append(lines)

    # tok, ner = Spilt2Words(fname[:-4], f, fa)
    # if (max_word_length > 0):
    #     Segmentation(ds_dict['id'], ds_dict['tokens'], ds_dict['ner_tags'], fname[:-4], tok, ner, max_word_length)
    # else:
    #     ds_dict['id'].append(fname[:-4])
    #     ds_dict['tokens'].append(tok)
    #     ds_dict['ner_tags'].append(ner)
    f.close()

  0%|          | 0/950 [00:00<?, ?it/s]

In [270]:
import nltk
from nltk.tokenize import sent_tokenize

# Download the sentence tokenizer model (run this once)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Claire/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [271]:
import re
def split_documents(fnames, words_per_segment):
    result_dict = {}

    for fname in tqdm(fnames):
        with open(os.path.join('./opendid_test/opendid_test', fname), 'r') as file:
            content = file.read()

        current_segment = []
        segments = []
        word_count = 0

        # Use a regular expression to split the content into words
        words = content.split(" ")

        for word in words:
            # Check if splitting is needed based on word count
            word_count += 1
            if word_count > words_per_segment:
                key = f"{fname[:-4]}_{len(segments) + 1}"
                result_dict[key] = ' '.join(current_segment)
                current_segment = []
                segments.append(key)
                word_count = 0

            current_segment.append(word)

        # Handle the remaining words after the loop
        if current_segment:
            key = f"{fname[:-4]}_{len(segments) + 1}"
            result_dict[key] = ' '.join(current_segment)
            segments.append(key)

    return result_dict

In [272]:
fnames = [f for f in os.listdir('./opendid_test/opendid_test/')]
fnames.sort()

max_lines_per_segment = 10
max_sentences_per_segment = 5
max_characters_per_segment = 100
words_per_segment = 80

# result_segments = split_documents(fnames, max_lines_per_segment, max_sentences_per_segment)

result_segments = split_documents(fnames, words_per_segment)

# Replace the unique token back to consecutive '\n' characters
# result_segments = {key: segment.replace('<consecutive_newlines>', '\n\n') for key, segment in result_segments.items()}



  0%|          | 0/950 [00:00<?, ?it/s]

In [274]:
# Print the first segment of the first document for demonstration
key_example = list(result_segments.keys())[0]
print(f"Segment {key_example}:")
print(result_segments[key_example])

# val_docs['doc'][0][1855:].count('\n')

Segment 1097_1:
 433475.RDC
Timmins, ELDEN
43J47561,43J47561

Last edited : 7/9/2063		Page: 2
CLINICAL:
Metastatic cancer ?colorectal primary. 
MACROSCOPIC:
Specimen labelled "Omentum secondary", consists of a piece of omentum 120 x 100 x 30mm.  On sectioning there are multiple fibrotic white ill-defined nodules identified.  
Blocks: 1 to 5 - representative sections from the nodules. Block 3 reserved block.
(IC/vo 5.9.63)
MICROSCOPIC (Reported by Dr L Bonnot):
Sections show omental fat with metastatic adenocarcinoma. The lesion has a complex glandular architecture, and is composed of tumour cells with pleomorphic, vesicular nuclei.
The


In [None]:
list[result_segments.keys()]

list[dict_keys(['1001_1', '1001_2', '1001_3', '1001_4', '1001_5', '1001_6', '1001_7', '1001_8', '1002_1', '1002_2', '1002_3', '1002_4', '1002_5', '1002_6', '1002_7', '1003_1', '1003_2', '1003_3', '1003_4', '1003_5', '1003_6', '1003_7', '1003_8', '1003_9', '1003_10', '1003_11', '1004_1', '1004_2', '1004_3', '1004_4', '1004_5', '1004_6', '1005_1', '1005_2', '1005_3', '1005_4', '1005_5', '1005_6', '1005_7', '1005_8', '1005_9', '1005_10', '1005_11', '1005_12', '1005_13', '1005_14', '1005_15', '1005_16', '1005_17', '1005_18', '1006_1', '1006_2', '1006_3', '1006_4', '1006_5', '1007_1', '1007_2', '1007_3', '1007_4', '1007_5', '1008_1', '1008_2', '1008_3', '1008_4', '1009_1', '1009_2', '1009_3', '1009_4', '1009_5', '1009_6', '1011_1', '1011_2', '1011_3', '1011_4', '1011_5', '1011_6', '1011_7', '1011_8', '1011_9', '1011_10', '1011_11', '1011_12', '1011_13', '1011_14', '1011_15', '1012_1', '1012_2', '1012_3', '1012_4', '1012_5', '1013_1', '1013_2', '1013_3', '1013_4', '1013_5', '1014_1', '1014_2

In [None]:
# print(len(sent_tokenize(result_segments['650_8'])))
print(result_segments['file21703_12'])

On opening, there is a circumferencial polypoid tumour centrally.
It measures 30mm in length with a maximum diameter of 30mm.
It is 100mm from the proximal margin, 100mm from the distal margin and 3mm from the radial margin.
The tumour is entirely above the anterior peritoneal reflection.
The mesorectum is complete.


In [275]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "./models/ner/"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [276]:
result_ans_dict = {}

for fid_sid, seg in result_segments.items():
    try:
        result_ans_dict[fid_sid] = token_classifier(seg)
    except:
        print(fid_sid)
        break

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# result_ans_dict['1002_6']

[{'entity_group': 'OTHER',
  'score': 0.9997768,
  'word': '- low grade mucinous neoplasm resulting in pseudomyxoma peritonei involving peritoneal fragments and colonic mesentery. para - aortic lymph node : no evidence of malignancy.',
  'start': 2,
  'end': 174},
 {'entity_group': 'DATE',
  'score': 0.99546033,
  'word': '27 / 5 / 64',
  'start': 175,
  'end': 182},
 {'entity_group': 'DOCTOR',
  'score': 0.8952514,
  'word': 'ht',
  'start': 183,
  'end': 185}]

In [277]:
from dateparser import parse
import dateutil.parser as parser


def handleTimeWithHr(time_str):
    if 'hrs' in time_str:
        hr_pos = time_str.find('hrs')
        time_str = time_str.replace('hrs', '')
    elif 'Hrs' in time_str:
        hr_pos = time_str.find('Hrs')
        time_str = time_str.replace('Hrs', '')
    elif 'hr' in time_str :
        hr_pos = time_str.find('hr')
        time_str = time_str.replace('hr', '')
    elif 'Hr' in time_str:
        hr_pos = time_str.find('Hr')
        time_str = time_str.replace('Hr', '')
    elif 'hours' in time_str:
        hr_pos = time_str.find('hours')
        time_str = time_str.replace('hours', '')
    elif 'Hours' in time_str:
        hr_pos = time_str.find('Hours')
        time_str = time_str.replace('Hours', '')
    elif 'our' in time_str:
        hr_pos = time_str.find('hour')
        time_str = time_str.replace('hour', '')
    elif 'Hour' in time_str:
        hr_pos = time_str.find('Hour')
        time_str = time_str.replace('Hour', '')
    else:
        return time_str

    if time_str[hr_pos-3].isdigit():
        time_str = time_str[:hr_pos-2] + ':' + time_str[hr_pos-2:]
    else:
        time_str = time_str.replace(time_str[hr_pos-3], ':')
    return time_str

def time2iso(time_str):
    time_str = handleTimeWithHr(time_str)
    try:
        if '-' in time_str and time_str.count(':') == 2:
            time_norm_str = parse(time_str, settings={'DATE_ORDER': 'YMD'}).isoformat(timespec="seconds")
        elif '-' in time_str:
            time_norm_str = parse(time_str, settings={'DATE_ORDER': 'YMD'}).isoformat(timespec="minutes")
        else:
            time_norm_str = parse(time_str, settings={'DATE_ORDER': 'DMY'}).isoformat(timespec="minutes")
        return time_norm_str
    except:
        return time_str

def date2iso(date_str):
    try:
        # print(date_str)
        if date_str.isdigit() and len(date_str) == 8:
            # print('before', date_str)
            date_str = date_str[:4] + '/' + date_str[4:6] + "/" + date_str[6:]
            # print('after', date_str)
            iso_str = parse(date_str, settings={'DATE_ORDER': 'YMD'}).isoformat(timespec="hours")
        elif date_str.isdigit() and len(date_str) == 4:
            iso_str = parse(date_str, settings={'DATE_ORDER': 'YMD'}).isoformat(timespec="hours")
            us_pos = iso_str.find('-')
            return iso_str[:us_pos]
        else:
            iso_str = parse(date_str, settings={'DATE_ORDER': 'DMY'}).isoformat(timespec="hours")
        # print(iso_str)
        T_pos = iso_str.find('T')
        return iso_str[:T_pos]
    except:
        return date_str

import re
from isodate import parse_duration
def duration2iso(duration_description):
    try:
        # Remove non-alphanumeric characters
        duration_description = re.sub(r'\W+', '', duration_description)
        
        # Handle 'year' and 'yr' cases
        if 'year' in duration_description or 'yr' in duration_description:
            years = re.search(r'\d+', duration_description).group()
            return f'P{years}Y'
        elif 'month' in duration_description or 'wk' in duration_description:
            months = re.search(r'\d+', duration_description).group()
            return f'P{months}M'
        elif 'day' in duration_description:
            days = re.search(r'\d+', duration_description).group()
            return f'P{days}D'
        elif 'week' in duration_description or 'wk' in duration_description:
            weeks = re.search(r'\d+', duration_description).group()
            return f'P{weeks}W'
        else:
            return duration_description
    except:
        return duration_description


def set2iso(set_str):
    if set_str == 'twice':
        return 'R2'
    elif set_str == 'once':
        return 'R1'
    else:
        return set_str

In [353]:
import re
from word2number import w2n

def Normalize(time_type, org):
    nor = ''
    if (time_type == 'DATE'):
        if (re.match('\d{1,2}(\/|\.| |-|,)\d{1,2}(\/|\.| |-|,)\d{2,4}', org)):
            l = re.split('\/|\.| |-|,', org)
            if (len(l[2]) == 2):
                l[2] = '20' + l[2]
            elif (len(l[2]) == 3):
                l[2] = '2' + l[2]
            if (len(l[1]) == 1):
                l[1] = '0' + l[1]
            if (len(l[0]) == 1):
                l[0] = '0' + l[0]
            nor = l[2] + '-' + l[1] + '-' + l[0]
        elif (re.match('\/\d{1,2}\/(\d{2}|\d{4})', org)):
            l = re.split('\/', org)
            if (len(l[1]) == 1):
                l[1] = '0' + l[1]
            if (len(l[2]) == 2):
                l[2] = '20' + l[2]
            nor = l[2] + '-' + l[1]
        elif (re.match('\d{1,2}\/\d{2,5}', org)):
            l = re.split('\/', org)
            if (len(l[0]) == 1):
                l[0] = '0' + l[0]
            if (len(l[1]) == 2):
                nor = '20' + l[1] + '-' + l[0]
            elif (len(l[1]) == 3):
                nor = '20' + l[1][1:] + '-' + '0' + l[1][0] + '-' + l[0]
            elif (len(l[1]) == 4):
                nor = l[1] + '-' + l[0]
            elif (len(l[1]) == 5):
                nor = l[1][1:] + '-' + '0' + l[1][0] + '-' + l[0]
        elif (re.match('\d{8}', org)):
            nor = org[0:4] + '-' + org[4:6] + '-' + org[6:8]
        elif (re.match('\d{4}', org)):
            nor = org
        elif (re.match('\d{3}', org)):
            nor = '2' + org
        elif (re.match('(\d{2}|)(-|)(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(-| )\d{2,4}', org)):
            org = org.replace('Jan', '01')
            org = org.replace('Feb', '02')
            org = org.replace('Mar', '03')
            org = org.replace('Apr', '04')
            org = org.replace('May', '05')
            org = org.replace('Jun', '06')
            org = org.replace('Jul', '07')
            org = org.replace('Aug', '08')
            org = org.replace('Sep', '09')
            org = org.replace('Oct', '10')
            org = org.replace('Nov', '11')
            org = org.replace('Dec', '12')
            l = re.split('-| ', org)
            if (len(l) == 2):
                if (len(l[1]) == 2):
                    l[1] = '20' + l[1]
                elif (len(l[1]) == 3):
                    l[1] = '2' + l[1]
                nor = l[1] + '-' + l[0]
            else:
                if (len(l[2]) == 2):
                    l[2] = '20' + l[2]
                elif (len(l[2]) == 3):
                    l[2] = '2' + l[2]
                nor = l[2] + '-' + l[1] + '-' + l[0]
        elif (re.match('\d{1,2}((st)|(nd)|(rd)|(th)) of (January|February|March|April|May|June|July|August|September|October|November|December) \d{4}', org)):
            org = org.replace('January', '01')
            org = org.replace('Feburary', '02')
            org = org.replace('March', '03')
            org = org.replace('April', '04')
            org = org.replace('May', '05')
            org = org.replace('June', '06')
            org = org.replace('July', '07')
            org = org.replace('August', '08')
            org = org.replace('September', '09')
            org = org.replace('October', '10')
            org = org.replace('November', '11')
            org = org.replace('December', '12')
            l = re.split(' ', org)
            nor = l[3] + '-' + l[2] + '-' + l[0][:-2]
        elif (re.match('(\d{1,2}|)( |)(January|February|March|April|May|June|July|August|September|October|November|December) \d{4}', org)):
            if (re.match('\d', org[0]) and re.match('\d', org[1]) == None):
                org = '0' + org
            org = org.replace('January', '01')
            org = org.replace('Feburary', '02')
            org = org.replace('March', '03')
            org = org.replace('April', '04')
            org = org.replace('May', '05')
            org = org.replace('June', '06')
            org = org.replace('July', '07')
            org = org.replace('August', '08')
            org = org.replace('September', '09')
            org = org.replace('October', '10')
            org = org.replace('November', '11')
            org = org.replace('December', '12')
            org = org.replace(' ', '')
            if (len(org) == 6):
                nor = org[2:] + '-' + org[0:2]
            else:    
                nor = org[4:] + '-' + org[2:4] + '-' + org[0:2]
    elif (time_type == 'TIME'):
        if (re.match('(\d{1,2}(\/|\.)\d{1,2}(\/|\.)\d{2,4}(  | |)|)(at|)( |)\d{1,2}(:|\.)\d{2}(AM|am|PM|pm|Hr|Hrs|hr|hrs|)( on the \d{1,2}((st)|(nd)|(rd)|(th)) of (January|February|March|April|May|June|July|August|September|October|November|December) \d{4}|)', org)):
            tmp = org
            pm = 0
            am = 0
            if (re.search('PM', org, flags=0) != None):
                pm = 1
            if (re.search('pm', org, flags=0) != None):
                pm = 1
            if (re.search('AM', org, flags=0) != None):
                am = 1
            if (re.search('am', org, flags=0) != None):
                am = 1
            get_date = 0
            date = re.search('\d{1,2}(\/|\.)\d{1,2}(\/|\.)\d{2,4}', org, flags=0)
            if (date != None):
                date = date.group(0)
                org = org.replace(date, '')
                date = re.split('\/|\.', date)
                if (len(date[0]) == 1):
                    date[0] = '0' + date[0]
                if (len(date[1]) == 1):
                    date[1] = '0' + date[1]
                if (len(date[2]) == 2):
                    date[2] = '20' + date[2]
                elif (len(date[2]) == 3):
                    date[2] = '2' + date[2]
                nor = date[2] + '-' + date[1] + '-' + date[0]
                get_date = 1
            yyyy = re.search('\d{4}', org, flags=0)
            if (yyyy != None and get_date == 0):
                yyyy = yyyy.group(0)
                org = org.replace(yyyy, '')
                nor = yyyy + '-'
            mm = re.search('January|February|March|April|May|June|July|August|September|October|November|December', org, flags=0)
            if (mm != None and get_date == 0):
                mm = mm.group(0)
                org = org.replace(mm, '')
                mm = mm.replace('January', '01')
                mm = mm.replace('Feburary', '02')
                mm = mm.replace('March', '03')
                mm = mm.replace('April', '04')
                mm = mm.replace('May', '05')
                mm = mm.replace('June', '06')
                mm = mm.replace('July', '07')
                mm = mm.replace('August', '08')
                mm = mm.replace('September', '09')
                mm = mm.replace('October', '10')
                mm = mm.replace('November', '11')
                mm = mm.replace('December', '12')
                nor = nor + mm + '-'
            dd = re.search('\d{1,2}((st)|(nd)|(rd)|(th))', org, flags=0)
            if (dd != None and get_date == 0):
                dd = dd.group(0)
                org = org.replace(dd, '')
                dd = dd.replace('st', '')
                dd = dd.replace('nd', '')
                dd = dd.replace('rd', '')
                dd = dd.replace('th', '')
                if (len(dd) == 1):
                    dd = '0' + dd
                nor = nor + dd
            get_time = 0
            time = re.search('\d{1,2}(:|\.)\d{1,2}', org, flags=0)
            if (time != None):
                time = time.group(0)
                org = org.replace(time, '')
                time = re.split('\.|:', time)
                if (pm == 1 and int(time[0]) < 12):
                    time[0] = str(int(time[0]) + 12)
                elif (am == 1 and int(time[0]) == 12):
                    time[0] = '00'
                if (len(time[0]) == 1):
                    time[0] = '0' + time[0]
                nor = nor + 'T' + time[0] + ':' + time[1]
                get_time = 1
            pm = 0
            am = 0
            if (re.search('pm', org, flags=0) != None):
                pm = 1
            if (re.search('am', org, flags=0) != None):
                am = 1
            time = re.search('\d{1,4}', org, flags=0)
            if (time != None and get_time == 0):
                time = time.group(0)
                org = org.replace(time, '')
                hh, mm = '00', '00'
                if (len(time) == 4):
                    hh = time[0:2]
                    mm = time[2:]
                elif (len(time) == 3):
                    hh = time[0]
                    mm = time[1:]
                elif (len(time) == 2):
                    hh = time
                elif (len(time) == 1):
                    hh = time
                if (pm == 1 and int(hh) < 12):
                    hh = str(int(hh) + 12)
                elif (am == 1 and int(hh) == 12):
                    hh = '00'
                nor = nor + 'T' + hh + ':' + mm    
            #if (nor != ans):    
                #print(f'1:nor={nor}, ans={ans}, org={tmp}')
        elif (re.match('\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', org)):
            tmp = org
            nor = org.replace(' ', 'T')
            #if (nor != ans):    
                #print(f'2:nor={nor}, ans={ans}, org={tmp}')
        elif (re.match('(at |)(\d{1,2}|)(:|\.|)\d{2}( |)(am|pm|Hr|Hrs|hr|hrs|)( on | )(the |)\d{1,2}(\/|\.)\d{2,4}(\/|\.)\d{1,2}', org)):
            tmp = org
            pm = 0
            am = 0
            if (re.search('pm', org, flags=0) != None):
                pm = 1
            if (re.search('am', org, flags=0) != None):
                am = 1
            date = re.search('\d{1,2}(\/|\.)\d{1,2}(\/|\.)\d{2,4}', org, flags=0)
            if (date != None):
                date = date.group(0)
                org = org.replace(date, '')
                date = re.split('\/|\.', date)
                if (len(date[0]) == 1):
                    date[0] = '0' + date[0]
                if (len(date[1]) == 1):
                    date[1] = '0' + date[1]
                if (len(date[2]) == 2):
                    date[2] = '20' + date[2]
                elif (len(date[2]) == 3):
                    date[2] = '2' + date[2]
                nor = date[2] + '-' + date[1] + '-' + date[0] + 'T'
            org = org.replace(':', '')
            time = re.search('\d{1,4}', org, flags=0)
            if (time != None):
                time = time.group(0)
                org = org.replace(time, '')
                hh, mm = '00', '00'
                if (len(time) == 4):
                    hh = time[0:2]
                    mm = time[2:]
                elif (len(time) == 3):
                    hh = time[0]
                    mm = time[1:]
                elif (len(time) == 2):
                    hh = time
                elif (len(time) == 1):
                    hh = time
                if (pm == 1 and int(hh) < 12):
                    hh = str(int(hh) + 12)
                elif (am == 1 and int(hh) == 12):
                    hh = '00'
                nor = nor + hh + ':' + mm
            #if (nor != ans):    
                #print(f'3:nor={nor}, ans={ans}, org={tmp}')
        elif (re.match('((\d{1,2}((pm)|(am)))|(\d{4}(Hr|Hrs|hr|hrs|)))(( on )| )\d{1,2}(\/|\.)\d{1,2}(\/|\.)\d{2,4}', org)):
            tmp = org
            pm = 0
            am = 0
            if (re.search('pm', org, flags=0) != None):
                pm = 1
            if (re.search('am', org, flags=0) != None):
                am = 1
            date = re.search('\d{1,2}(\/|\.)\d{1,2}(\/|\.)\d{2,4}', org, flags=0)
            if (date != None):
                date = date.group(0)
                org = org.replace(date, '')
                date = re.split('\/|\.', date)
                if (len(date[0]) == 1):
                    date[0] = '0' + date[0]
                if (len(date[1]) == 1):
                    date[1] = '0' + date[1]
                if (len(date[2]) == 2):
                    date[2] = '20' + date[2]
                elif (len(date[2]) == 3):
                    date[2] = '2' + date[2]
                nor = date[2] + '-' + date[1] + '-' + date[0] + 'T'
            hrtime = re.search('\d{4}', org, flags=0)
            if (hrtime != None):
                hrtime = hrtime.group(0)
                org = org.replace(hrtime, '')
                nor = nor + hrtime[0:2] + ':' + hrtime[2:]
            time = re.search('\d{1,2}', org, flags=0)
            if (time != None):
                time = time.group(0)
                org = org.replace(time, '')
                hh = time
                if (pm == 1 and int(hh) < 12):
                    hh = str(int(hh) + 12)
                elif (am == 1 and int(hh) == 12):
                    hh = '00'
                if (len(hh) == 1):
                    hh = '0' + hh
                nor = nor + hh + ':' + '00'
            #if (nor != ans):    
                #print(f'4:nor={nor}, ans={ans}, org={tmp}')
    elif (time_type == 'DURATION'):   
        tmp = org
        org = org.replace('one', '1')
        org = org.replace('two', '2')
        org = org.replace('three', '3')
        org = org.replace('four', '4')
        org = org.replace('five', '5')
        num = ''
        alp = ''
        space_idx = org.find(' ')
        for i in range(len(org)):
            if (org[i] == 'D' or org[i] == 'd' or\
                org[i] == 'W' or org[i] == 'w' or\
                org[i] == 'M' or org[i] == 'm' or\
                org[i] == 'Y' or org[i] == 'y') and i > space_idx:
                alp = org[i]
                org = org[:i]
                break
        # print(org, alp)
        org = re.split('-| ', org)
        try:
            if org[0].isalpha():
                org[0] = w2n.word_to_num(org[0])
            # print(org)
            if (len(org) == 1 or org[1] == ''):
                nor = 'P' + str(org[0]) + alp.upper()
            else:
                nor = 'P' + str((int(org[0]) + int(org[1])) / 2) + alp.upper()
        except:
            nor = tmp
        # if (nor != ans):    
        #     print(f'dur:nor={nor}, ans={ans}, org={tmp}')
    elif (time_type == 'SET'):
        if (re.match('twice', org)):
            nor = 'R2'
    return nor


In [279]:
import pandas as pd

In [340]:
test_ans_df = pd.DataFrame({
    'file_id': [],
    'PHI_type': [],
    'PHI_start': [],
    'PHI_end': [],
    'PHI_content': [],
    'ISO': []
})

In [535]:
test_ans_df = pd.DataFrame({
    'file_id': [],
    'PHI_type': [],
    'PHI_start': [],
    'PHI_end': [],
    'PHI_content': [],
    'ISO': []
})

last_fid = ""
last_idx_of_last_seg = 0
for fid_sid, entities in result_ans_dict.items():
    curr_fid = fid_sid.split('_')[0]
    curr_sid = fid_sid.split('_')[1]
    # print(fid_sid)

    if curr_fid != last_fid:
        with open(os.path.join('./opendid_test/opendid_test', curr_fid+'.txt'), 'r') as file:
            content = file.read()
        last_fid = curr_fid
        last_idx_of_last_seg = 0

    # last_idx_of_last_seg = 0

    for i, entity in enumerate(entities):
        new_row = []
        # print(i, entity)

        if i == len(entities) - 1 and entity['entity_group'] == 'OTHER':
            last_idx_of_last_seg += len(result_segments[fid_sid])
            continue
        elif entity['entity_group'] != 'OTHER':
            # print(fid_sid)
            # print(last_idx_of_last_seg)
            start_idx = entity['start'] + last_idx_of_last_seg + int(curr_sid) - 1
            end_idx = entity['end'] + last_idx_of_last_seg + int(curr_sid) - 1
            # print('start', start_idx)
            # print(entity['word'])
            # find_idx = content.lower()[start_idx:].find(entity['word']) + start_idx
            # print('find', find_idx)
            # end_idx = find_idx+len(entity['word'])
            word = content[start_idx:end_idx]

            if i == len(entities) - 1:
                last_idx_of_last_seg += len(result_segments[fid_sid])

            if len(word) > 1:
                # print(word, start_idx, end_idx)
                while word[0].isalnum() == False or word[-1].isalnum() == False:
                    if word[0].isalnum() == False:
                        print('first case', fid_sid, word)
                        word = word[1:]
                        start_idx += 1
                    # print(word, start_idx, end_idx)
                    if word[-1].isalnum() == False:
                        word = word[:-1]
                        end_idx -= 1
            
            if '\n' in word:
                word = word.replace('\n', ' ')

            label = entity['entity_group']

            if entity['entity_group'] == 'DURATION' and word != 'twice':
                last_index = start_idx - 1
                # case 1: no spcae between number and month, week or year
                if content[last_index].isdigit():
                    while content[last_index].isdigit():
                        last_index -= 1
                    start_idx = last_index + 1
                    word = content[start_idx:end_idx]
                else:
                    last_space1 = content.rfind(' ', 0, start_idx)
                    last_space2 = content.rfind(' ', 0, last_space1)
                    start_idx = last_space2 + 1
                    word = content[start_idx:end_idx]
            elif word == 'twice':
                label = 'SET'
            
            new_row.extend([curr_fid, label, start_idx, end_idx, word])
            
            # print(curr_fid, label, start_idx, end_idx, word)

            need_iso = ['DATE', 'TIME', 'DURATION', 'SET']
            # if entity['entity_group'] == 'DATE':
            #     new_row.append(date2iso(word))
            # elif entity['entity_group'] == 'TIME':
            #     new_row.append(time2iso(word))
            # elif entity['entity_group'] == 'DURATION':
            #     new_row.append(duration2iso(word))
            # elif entity['entity_group'] == 'SET':
            #     new_row.append(set2iso(word))
            # else:
            #     new_row.append('')

            if label in need_iso:
                new_row.append(Normalize(label, word))
            else:
                new_row.append('')
            
            test_ans_df.loc[len(test_ans_df)] = new_row
        else:
            continue


In [616]:
test_ans_df2 = pd.DataFrame({
    'file_id': [],
    'PHI_type': [],
    'PHI_start': [],
    'PHI_end': [],
    'PHI_content': [],
    'ISO': []
})

last_fid = ""
last_idx_of_last_seg = 0
for fid_sid, entities in result_ans_dict.items():
    curr_fid = fid_sid.split('_')[0]
    curr_sid = fid_sid.split('_')[1]
    # print(fid_sid)

    if curr_fid != last_fid:
        with open(os.path.join('./opendid_test/opendid_test', curr_fid+'.txt'), 'r') as file:
            content = file.read()
        last_fid = curr_fid
        last_idx_of_last_seg = 0

    # last_idx_of_last_seg = 0

    for i, entity in enumerate(entities):
        new_row = []
        # print(i, entity)

        if i == len(entities) - 1 and entity['entity_group'] == 'OTHER':
            last_idx_of_last_seg += len(result_segments[fid_sid])
            continue
        elif entity['entity_group'] != 'OTHER':
            # print(fid_sid)
            # print(last_idx_of_last_seg)
            start_idx = entity['start'] + last_idx_of_last_seg + int(curr_sid) - 1
            end_idx = entity['end'] + last_idx_of_last_seg + int(curr_sid) - 1
            # print('start', start_idx)
            # print(entity['word'])
            # find_idx = content.lower()[start_idx:].find(entity['word']) + start_idx
            # print('find', find_idx)
            # end_idx = find_idx+len(entity['word'])
            word = content[start_idx:end_idx]

            if i == len(entities) - 1:
                last_idx_of_last_seg += len(result_segments[fid_sid])

            if len(word) == 1 and word.isalnum() == False: 
                continue
            if len(word) > 1:
                # print(word, start_idx, end_idx)
                while word[0].isalnum() == False or word[-1].isalnum() == False:
                    if word[0].isalnum() == False:
                        word = word[1:]
                        start_idx += 1
                    # print(word, start_idx, end_idx)
                    if word[-1].isalnum() == False and word[-1] == ')': 
                        break
                    elif word[-1].isalnum() == False: # in case it is a newline character
                        word = word[:-1]
                        end_idx -= 1
            
            if '\n' in word:
                word = word.replace('\n', ' ') # in case contains newline character

            label = entity['entity_group']

            if entity['entity_group'] == 'DURATION' and word != 'twice':
                last_index = start_idx - 1
                # case 1: no spcae between number and month, week or year
                if content[last_index].isdigit():
                    while content[last_index].isdigit():
                        last_index -= 1
                    start_idx = last_index + 1
                    word = content[start_idx:end_idx]
                else:
                    last_space1 = content.rfind(' ', 0, start_idx)
                    last_space2 = content.rfind(' ', 0, last_space1)
                    start_idx = last_space2 + 1
                    word = content[start_idx:end_idx]
            elif word == 'twice':
                label = 'SET'
            
            new_row.extend([curr_fid, label, start_idx, end_idx, word])
            
            # print(curr_fid, label, start_idx, end_idx, word)

            need_iso = ['DATE', 'TIME', 'DURATION', 'SET']
            # if entity['entity_group'] == 'DATE':
            #     new_row.append(date2iso(word))
            # elif entity['entity_group'] == 'TIME':
            #     new_row.append(time2iso(word))
            # elif entity['entity_group'] == 'DURATION':
            #     new_row.append(duration2iso(word))
            # elif entity['entity_group'] == 'SET':
            #     new_row.append(set2iso(word))
            # else:
            #     new_row.append('')

            if label in need_iso:
                new_row.append(Normalize(label, word))
            else:
                new_row.append('')
            
            test_ans_df2.loc[len(test_ans_df2)] = new_row
        else:
            continue


In [625]:
test_ans_df2.loc[test_ans_df2['PHI_type'] == 'TIME']

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
17,1135,TIME,152,171,07/08/2062 at 12:20,2062-08-07T12:20
146,1393,TIME,159,178,13/05/2063 at 15:54,2063-05-13T15:54
171,1444,TIME,160,162,25,
173,1444,TIME,171,179,at 09:02,T09:02
218,1535,TIME,155,174,29/08/2063 at 09:28,2063-08-29T09:28
...,...,...,...,...,...,...
12155,file56634,TIME,720,726,4:04PM,T16:04
12689,file61638,TIME,640,653,28.08.69 12pm,2069-08-28T12:00
13048,file65156,TIME,832,834,12,
13049,file65156,TIME,835,840,45 pm,


In [621]:
test_ans_df[test_ans_df['PHI_content'].str.contains('\)')]

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
7877,5353,MEDICALRECORD,21,22,),
12855,file62979,MEDICALRECORD,186,187,),


In [626]:
test_ans_df.loc[test_ans_df['PHI_type'] == 'TIME']

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
17,1135,TIME,152,171,07/08/2062 at 12:20,2062-08-07T12:20
146,1393,TIME,159,178,13/05/2063 at 15:54,2063-05-13T15:54
171,1444,TIME,160,162,25,
173,1444,TIME,171,179,at 09:02,T09:02
218,1535,TIME,155,174,29/08/2063 at 09:28,2063-08-29T09:28
...,...,...,...,...,...,...
12173,file56634,TIME,720,726,4:04PM,T16:04
12708,file61638,TIME,640,653,28.08.69 12pm,2069-08-28T12:00
13068,file65156,TIME,832,834,12,
13069,file65156,TIME,835,840,45 pm,


In [358]:
df_crf = pd.read_csv('./opendid_test/crf_answer.csv')

In [302]:
df_crf_dur = df_crf.loc[df_crf['PHI_type'] == 'DURATION']
df_crf_set = df_crf.loc[df_crf['PHI_type'] == 'SET']
df_crf_lo = df_crf.loc[df_crf['PHI_type'] == 'LOCATION-OTHER'] 

In [379]:
df_crf_lo

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
3401,2465,LOCATION-OTHER,68,79,PO BOX 1061,
5279,5223,LOCATION-OTHER,78,85,UNIT 22,


In [404]:
test_ans_df.loc[test_ans_df['file_id'] == '2465']

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
4217,2465,IDNUM,13,23,38E756615P,
4218,2465,MEDICALRECORD,24,34,381756.QDO,
4219,2465,PATIENT,36,48,"DIMPFL, Neil",
4220,2465,IDNUM,59,67,38E75661,
4221,2465,STREET,68,70,PO,
4222,2465,STATE,71,74,BOX,
4223,2465,ZIP,78,79,1,
4224,2465,CITY,80,90,NARRANDERA,
4225,2465,STATE,92,102,Queensland,
4226,2465,ZIP,104,108,3236,


In [538]:
# New row data
crf_set_row_idx = 6905
new_row = df_crf.iloc[crf_set_row_idx]

# Determine the position to insert the new row
insert_position = 8619

# Create a new Series with the new row data
new_series = pd.Series(new_row)

# Shift down the rows below the insertion point
test_ans_df = pd.concat([test_ans_df.iloc[:insert_position], new_series.to_frame().transpose(), test_ans_df.iloc[insert_position:]]).reset_index(drop=True)


In [539]:
# New row data
crf_lo_row_idx = 3401
new_row = df_crf.iloc[crf_lo_row_idx]

# Determine the position to insert the new row
insert_position = 4221

# Create a new Series with the new row data
new_series = pd.Series(new_row)

# Shift down the rows below the insertion point
test_ans_df = pd.concat([test_ans_df.iloc[:insert_position], new_series.to_frame().transpose(), test_ans_df.iloc[insert_position:]]).reset_index(drop=True)

test_ans_df

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
0,1097,MEDICALRECORD,1,11,433475.RDC,
1,1097,PATIENT,12,26,"Timmins, ELDEN",
2,1097,IDNUM,27,35,43J47561,
3,1097,IDNUM,36,44,43J47561,
4,1097,DATE,60,68,7/9/2063,2063-09-07
...,...,...,...,...,...,...
13220,file66968,DATE,1473,1481,28.05.69,2069-05-28
13221,file66968,DATE,2184,2193,30/6/2069,2069-06-30
13222,file66968,DATE,2264,2272,5/7/2071,2071-07-05
13223,file66968,DOCTOR,2297,2307,G Mccarter,


In [540]:
cpy_df = test_ans_df
cpy_df

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
0,1097,MEDICALRECORD,1,11,433475.RDC,
1,1097,PATIENT,12,26,"Timmins, ELDEN",
2,1097,IDNUM,27,35,43J47561,
3,1097,IDNUM,36,44,43J47561,
4,1097,DATE,60,68,7/9/2063,2063-09-07
...,...,...,...,...,...,...
13220,file66968,DATE,1473,1481,28.05.69,2069-05-28
13221,file66968,DATE,2184,2193,30/6/2069,2069-06-30
13222,file66968,DATE,2264,2272,5/7/2071,2071-07-05
13223,file66968,DOCTOR,2297,2307,G Mccarter,


In [541]:
test_ans_df = cpy_df

In [542]:
rows_to_delete = [4222, 4223, 4224]

# Use the drop method to delete the specified rows
test_ans_df = test_ans_df.drop(rows_to_delete)

test_ans_df.loc[test_ans_df['file_id'] == '2465']

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
4217,2465,IDNUM,13,23,38E756615P,
4218,2465,MEDICALRECORD,24,34,381756.QDO,
4219,2465,PATIENT,36,48,"DIMPFL, Neil",
4220,2465,IDNUM,59,67,38E75661,
4221,2465,LOCATION-OTHER,68,79,PO BOX 1061,
4225,2465,CITY,80,90,NARRANDERA,
4226,2465,STATE,92,102,Queensland,
4227,2465,ZIP,104,108,3236,
4228,2465,DATE,134,143,17/9/2008,2008-09-17
4229,2465,DATE,163,173,27/10/2065,2065-10-27


In [543]:
test_ans_df

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
0,1097,MEDICALRECORD,1,11,433475.RDC,
1,1097,PATIENT,12,26,"Timmins, ELDEN",
2,1097,IDNUM,27,35,43J47561,
3,1097,IDNUM,36,44,43J47561,
4,1097,DATE,60,68,7/9/2063,2063-09-07
...,...,...,...,...,...,...
13220,file66968,DATE,1473,1481,28.05.69,2069-05-28
13221,file66968,DATE,2184,2193,30/6/2069,2069-06-30
13222,file66968,DATE,2264,2272,5/7/2071,2071-07-05
13223,file66968,DOCTOR,2297,2307,G Mccarter,


In [420]:
test_ans_df2.to_csv('./opendid_test/test_answer.csv')

In [421]:
test_ans_df2.to_csv('./opendid_test/test_answer.txt', sep='\t', header=False, index=False)

In [545]:
with open(os.path.join('./opendid_test/opendid_test', 'file38131.txt'), 'r') as file:
    content = file.read()
content[46:81]

"ST VINCENT'S HOSPITAL (DARLINGHURST"

In [484]:
test_ans_df.loc[test_ans_df['PHI_type'] == 'DATE']

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
4,1097,DATE,60,68,7/9/2063,2063-09-07
5,1097,DATE,402,408,5.9.63,2063-09-05
16,1135,DATE,124,132,3/6/1989,1989-06-03
21,1135,DATE,1612,1618,8/8/62,2062-08-08
26,1153,DATE,50,59,25/3/2064,2064-03-25
...,...,...,...,...,...,...
13207,file66457,DATE,2032,2041,13/6/2070,2070-06-13
13220,file66968,DATE,1473,1481,28.05.69,2069-05-28
13221,file66968,DATE,2184,2193,30/6/2069,2069-06-30
13222,file66968,DATE,2264,2272,5/7/2071,2071-07-05


In [591]:
with open('./opendid_test/test2.txt', 'r') as f:
    test2file = f.read()
test2file

"1097\tMEDICALRECORD\t1\t11\t433475.RDC\t\n1097\tPATIENT\t12\t26\tTimmins, ELDEN\t\n1097\tIDNUM\t27\t35\t43J47561\t\n1097\tIDNUM\t36\t44\t43J47561\t\n1097\tDATE\t60\t68\t7/9/2063\t2063-09-07\n1097\tDATE\t402\t408\t5.9.63\t2063-09-05\n1097\tDOCTOR\t438\t446\tL Bonnot\t\n1097\tIDNUM\t835\t842\t43J4756\t\n1135\tIDNUM\t13\t23\t23F340166Q\t\n1135\tMEDICALRECORD\t24\t35\t2323401.RRQ\t\n1135\tPATIENT\t37\t55\tJourdan, WILLIEMAE\t\n1135\tIDNUM\t66\t74\t23F34016\t\n1135\tSTREET\t75\t82\tRedacre\t\n1135\tCITY\t83\t88\tCOWRA\t\n1135\tSTATE\t90\t92\tWA\t\n1135\tZIP\t94\t98\t6021\t\n1135\tDATE\t124\t132\t3/6/1989\t1989-06-03\n1135\tTIME\t152\t171\t07/08/2062 at 12:20\t2062-08-07T12:20\n1135\tDEPARTMENT\t183\t198\tENVOI Pathology\t\n1135\tHOSPITAL\t199\t214\tTEMORA HOSPITAL\t\n1135\tDOCTOR\t218\t234\tANTHONY ROCKHOLD\t\n1135\tDATE\t1612\t1618\t8/8/62\t2062-08-08\n1135\tDOCTOR\t1649\t1656\tF Serpe\t\n1153\tMEDICALRECORD\t2\t13\t5804168.HSN\t\n1153\tPATIENT\t14\t25\tBOGEN, CARY\t\n1153\tIDNUM\t26\t34\

In [593]:
test2file.count('\n')

13331

## Adjustment

In [498]:
with open('./opendid_test/test.txt', 'r') as f:
    lines = f.readlines()

df_test = pd.DataFrame({
    "file_id": [],
    "PHI_type": [],
    "PHI_start": [],
    "PHI_end": [],
    "PHI_content": [],
    "ISO": []
})

for line in lines:
    line = line.replace('\n', '')
    content = line.split('\t')

    if len(content) == 5: ## not time or date
        content.append(np.nan)
    df_test.loc[len(df_test.index)] = content

df_test

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
0,1097,MEDICALRECORD,1,11,433475.RDC,
1,1097,PATIENT,12,26,"Timmins, ELDEN",
2,1097,IDNUM,27,35,43J47561,
3,1097,IDNUM,36,44,43J47561,
4,1097,DATE,60,68,7/9/2063,2063-09-07
...,...,...,...,...,...,...
13271,file66075,PATIENT,149,162,"ELLOUT, Cohen",
13272,file66075,HOSPITAL,185,236,MILLICENT AND DISTRICT HOSPITAL AND HEALTH SER...,
13273,file66075,DOCTOR,636,650,C. Bastianelli,
13274,file66075,DOCTOR,658,671,C. Hoisington,


In [500]:
df_test.loc[df_test['file_id'] == 'file66968']

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO


In [493]:
test_ans_df = test_ans_df.reset_index(drop=True)
test_ans_df

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
0,1097,MEDICALRECORD,1,11,433475.RDC,
1,1097,PATIENT,12,26,"Timmins, ELDEN",
2,1097,IDNUM,27,35,43J47561,
3,1097,IDNUM,36,44,43J47561,
4,1097,DATE,60,68,7/9/2063,2063-09-07
...,...,...,...,...,...,...
13217,file66968,DATE,1473,1481,28.05.69,2069-05-28
13218,file66968,DATE,2184,2193,30/6/2069,2069-06-30
13219,file66968,DATE,2264,2272,5/7/2071,2071-07-05
13220,file66968,DOCTOR,2297,2307,G Mccarter,


In [501]:
test_ans_df.loc[test_ans_df['file_id'] == 'file66133']

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
13158,file66133,IDNUM,8,18,42J497705F,
13159,file66133,MEDICALRECORD,27,33,423497,
13160,file66133,HOSPITAL,45,69,WILCANNIA HEALTH SERVICE,
13161,file66133,DATE,151,160,25/6/2071,2071-06-25
13162,file66133,IDNUM,161,169,42J49770,
13163,file66133,IDNUM,171,181,42J497705F,
13164,file66133,PATIENT,183,197,"Paton, DEREK M",
13165,file66133,HOSPITAL,220,238,ST GEORGE HOSPITAL,
13166,file66133,DOCTOR,2349,2351,LV,
13167,file66133,DATE,3670,3679,27/5/2071,2071-05-27


In [502]:
row_idx_f66133 = 13158
df_after_66133 = test_ans_df.iloc[13158:]
df_after_66133

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
13158,file66133,IDNUM,8,18,42J497705F,
13159,file66133,MEDICALRECORD,27,33,423497,
13160,file66133,HOSPITAL,45,69,WILCANNIA HEALTH SERVICE,
13161,file66133,DATE,151,160,25/6/2071,2071-06-25
13162,file66133,IDNUM,161,169,42J49770,
...,...,...,...,...,...,...
13217,file66968,DATE,1473,1481,28.05.69,2069-05-28
13218,file66968,DATE,2184,2193,30/6/2069,2069-06-30
13219,file66968,DATE,2264,2272,5/7/2071,2071-07-05
13220,file66968,DOCTOR,2297,2307,G Mccarter,


In [566]:
df_test2 = pd.concat([df_test, df_after_66133], ignore_index=True)
df_test2

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
0,1097,MEDICALRECORD,1,11,433475.RDC,
1,1097,PATIENT,12,26,"Timmins, ELDEN",
2,1097,IDNUM,27,35,43J47561,
3,1097,IDNUM,36,44,43J47561,
4,1097,DATE,60,68,7/9/2063,2063-09-07
...,...,...,...,...,...,...
13335,file66968,DATE,1473,1481,28.05.69,2069-05-28
13336,file66968,DATE,2184,2193,30/6/2069,2069-06-30
13337,file66968,DATE,2264,2272,5/7/2071,2071-07-05
13338,file66968,DOCTOR,2297,2307,G Mccarter,


In [567]:
rows_with_par = list(df_test2.loc[df_test2['PHI_content'] == '('].index)


In [574]:
rows_with_par

df_test2 = df_test2.drop(rows_with_par)

In [615]:
df_test2.loc[df_test2['PHI_type'] == 'LOCATION-OTHER']

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
4240,2465,LOCATION-OTHER,68,79,PO BOX 1061,


In [576]:
df_test2[df_test2['PHI_content'].str.contains('\(')]

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
411,1975,HOSPITAL,205,227,MAITLAND HOSPITAL (NSW,
1579,2262,HOSPITAL,213,248,ST VINCENT'S HOSPITAL (DARLINGHURST,
6148,5189,HOSPITAL,5827,5854,ALPINE HEALTH (MOUNT BEAUTY,
6347,5204,HOSPITAL,200,222,MAITLAND HOSPITAL (NSW,
8536,file31319,HOSPITAL,46,81,ST VINCENT'S HOSPITAL (DARLINGHURST,
9630,file38131,HOSPITAL,46,81,ST VINCENT'S HOSPITAL (DARLINGHURST,
10362,file43192,HOSPITAL,46,67,MAITLAND HOSPITAL (SA,
10506,file44277,HOSPITAL,46,67,MAITLAND HOSPITAL (SA,
10589,file44766,HOSPITAL,8240,8285,CRAIGIEBURN HEALTH SERVICE (COLAC AREA HEALTH,
10915,file46641,HOSPITAL,3042,3069,ALPINE HEALTH (MOUNT BEAUTY,


In [577]:
rows_to_add_par = list(df_test2[df_test2['PHI_content'].str.contains('\(')].index)
rows_to_add_par

[411,
 1579,
 6148,
 6347,
 8536,
 9630,
 10362,
 10506,
 10589,
 10915,
 11037,
 11507,
 11534,
 11811,
 12051,
 12827,
 12906,
 13314]

In [578]:
df_test3 = df_test2
for r_idx in rows_to_add_par:
    org_end = df_test3.loc[r_idx, 'PHI_end']
    org_content = df_test3.loc[r_idx, 'PHI_content']
    df_test3.loc[r_idx, 'PHI_end'] = str(int(org_end) + 1)
    df_test3.loc[r_idx, 'PHI_content'] = org_content + ')'

In [585]:
df_test2[df_test2['PHI_content'].str.contains('\(')]

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
411,1975,HOSPITAL,205,228,MAITLAND HOSPITAL (NSW),
1579,2262,HOSPITAL,213,249,ST VINCENT'S HOSPITAL (DARLINGHURST),
6148,5189,HOSPITAL,5827,5855,ALPINE HEALTH (MOUNT BEAUTY),
6347,5204,HOSPITAL,200,223,MAITLAND HOSPITAL (NSW),
8536,file31319,HOSPITAL,46,82,ST VINCENT'S HOSPITAL (DARLINGHURST),
9630,file38131,HOSPITAL,46,82,ST VINCENT'S HOSPITAL (DARLINGHURST),
10362,file43192,HOSPITAL,46,68,MAITLAND HOSPITAL (SA),
10506,file44277,HOSPITAL,46,68,MAITLAND HOSPITAL (SA),
10589,file44766,HOSPITAL,8240,8286,CRAIGIEBURN HEALTH SERVICE (COLAC AREA HEALTH),
10915,file46641,HOSPITAL,3042,3070,ALPINE HEALTH (MOUNT BEAUTY),


In [586]:
df_test2[df_test2['PHI_content'].str.contains('\(')]

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
411,1975,HOSPITAL,205,228,MAITLAND HOSPITAL (NSW),
1579,2262,HOSPITAL,213,249,ST VINCENT'S HOSPITAL (DARLINGHURST),
6148,5189,HOSPITAL,5827,5855,ALPINE HEALTH (MOUNT BEAUTY),
6347,5204,HOSPITAL,200,223,MAITLAND HOSPITAL (NSW),
8536,file31319,HOSPITAL,46,82,ST VINCENT'S HOSPITAL (DARLINGHURST),
9630,file38131,HOSPITAL,46,82,ST VINCENT'S HOSPITAL (DARLINGHURST),
10362,file43192,HOSPITAL,46,68,MAITLAND HOSPITAL (SA),
10506,file44277,HOSPITAL,46,68,MAITLAND HOSPITAL (SA),
10589,file44766,HOSPITAL,8240,8286,CRAIGIEBURN HEALTH SERVICE (COLAC AREA HEALTH),
10915,file46641,HOSPITAL,3042,3070,ALPINE HEALTH (MOUNT BEAUTY),


In [587]:
df_test2.to_csv('./opendid_test/test2.txt', sep='\t', header=False, index=False)

In [565]:
with open(os.path.join('./opendid_test/opendid_test', 'file44277.txt'), 'r') as file:
    content = file.read()
content[46:68]

'MAITLAND HOSPITAL (SA)'

In [589]:
test_ans_df2

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
0,1097,MEDICALRECORD,1,11,433475.RDC,
1,1097,PATIENT,12,26,"Timmins, ELDEN",
2,1097,IDNUM,27,35,43J47561,
3,1097,IDNUM,36,44,43J47561,
4,1097,DATE,60,68,7/9/2063,2063-09-07
...,...,...,...,...,...,...
13200,file66968,DATE,1473,1481,28.05.69,2069-05-28
13201,file66968,DATE,2184,2193,30/6/2069,2069-06-30
13202,file66968,DATE,2264,2272,5/7/2071,2071-07-05
13203,file66968,DOCTOR,2297,2307,G Mccarter,


In [594]:
df_test2

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
0,1097,MEDICALRECORD,1,11,433475.RDC,
1,1097,PATIENT,12,26,"Timmins, ELDEN",
2,1097,IDNUM,27,35,43J47561,
3,1097,IDNUM,36,44,43J47561,
4,1097,DATE,60,68,7/9/2063,2063-09-07
...,...,...,...,...,...,...
13335,file66968,DATE,1473,1481,28.05.69,2069-05-28
13336,file66968,DATE,2184,2193,30/6/2069,2069-06-30
13337,file66968,DATE,2264,2272,5/7/2071,2071-07-05
13338,file66968,DOCTOR,2297,2307,G Mccarter,


In [595]:
df_test2[df_test2['file_id'].str.contains('HSA')]

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
8353,HSA0240,MEDICALRECORD,2,12,639803.VAH,
8354,HSA0240,PATIENT,14,29,"Flegler, Beulah",
8355,HSA0240,IDNUM,31,39,63Y80380,
8356,HSA0240,DOCTOR,760,770,ART Wheary,
8357,HSA0240,DATE,1850,1858,12/12/62,2062-12-12
...,...,...,...,...,...,...
8415,HSA0590,HOSPITAL,4155,4172,COOKTOWN HOSPITAL,
8416,HSA0590,DOCTOR,6203,6215,MATHEW UPTON,
8417,HSA0590,DOCTOR,6223,6237,MIQUEL FOSDICK,
8418,HSA0590,DOCTOR,7059,7076,ELLSWORTH Vigario,


In [596]:
df_test_cpy = df_test2.copy()

df_hsa = df_test_cpy[df_test_cpy['file_id'].str.contains('HSA')]
rows_hsa = list(df_test_cpy[df_test_cpy['file_id'].str.contains('HSA')].index)

df_test_cpy = df_test_cpy.drop(rows_hsa)

df_test_cpy = pd.concat([df_test_cpy, df_hsa], ignore_index=True)

In [600]:
df_test_cpy[df_test_cpy['file_id'].str.contains('HSA')]

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
13264,HSA0240,MEDICALRECORD,2,12,639803.VAH,
13265,HSA0240,PATIENT,14,29,"Flegler, Beulah",
13266,HSA0240,IDNUM,31,39,63Y80380,
13267,HSA0240,DOCTOR,760,770,ART Wheary,
13268,HSA0240,DATE,1850,1858,12/12/62,2062-12-12
...,...,...,...,...,...,...
13326,HSA0590,HOSPITAL,4155,4172,COOKTOWN HOSPITAL,
13327,HSA0590,DOCTOR,6203,6215,MATHEW UPTON,
13328,HSA0590,DOCTOR,6223,6237,MIQUEL FOSDICK,
13329,HSA0590,DOCTOR,7059,7076,ELLSWORTH Vigario,


In [601]:
df_test_cpy.to_csv('./opendid_test/test3.txt', sep='\t', header=False, index=False)

In [602]:
df_test_cpy[df_test_cpy['PHI_content']== '']

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
2184,2313,DATE,2264,2264,,


In [603]:
df_test[df_test['PHI_content']== '']

Unnamed: 0,file_id,PHI_type,PHI_start,PHI_end,PHI_content,ISO
2184,2313,DATE,2264,2264,,


In [614]:
with open(os.path.join('./opendid_test/opendid_test', '2313.txt'), 'r') as file:
    f_2313 = file.read()
f_2313[2264]

'5'