In [5]:
# library
import os
import json
import random

In [7]:
seed = 61

random.seed(seed)

### load data

In [8]:
# data structure:
# First_Phase_Release(Correction)/First_Phase_Text_Dataset/
# First_Phase_Release(Correction)/answer.txt
# Second_Phase_Dataset/Second_Phase_Text_Dataset/
# Second_Phase_Dataset/answer.txt
# validation_dataset/Validation_Release/
# validation_dataset/answer.txt

first_dataset_doc_path = "./dataset/First_Phase_Release(Correction)/First_Phase_Text_Dataset/"
second_dataset_doc_path = "./dataset/Second_Phase_Dataset/Second_Phase_Text_Dataset/"
label_path = ["./dataset/First_Phase_Release(Correction)/answer.txt", "./dataset/Second_Phase_Dataset/answer.txt"]
val_dataset_doc_parh = "./dataset/validation_dataset/Validation_Release/"
val_label_path = "./dataset/validation_dataset/answer.txt"

first_dataset_path = [first_dataset_doc_path + file_path for file_path in os.listdir(first_dataset_doc_path)]
second_dataset_path = [second_dataset_doc_path + file_path for file_path in os.listdir(second_dataset_doc_path)]
train_path = first_dataset_path + second_dataset_path
val_path = [val_dataset_doc_parh + file_path for file_path in os.listdir(val_dataset_doc_parh)]

#check number of data-path
print(len(first_dataset_path)) #1120
print(len(second_dataset_path)) #614
print()
print(len(train_path)) #1734
print(len(val_path)) #560

1121
615

1736
561


In [9]:
# we can use utf-8-sig to solve ufeff problem (need to remove for label)
# Define function to read label

def create_label_dict(label_path):
    label_dict = {}  # y
    with open(label_path, "r", encoding="utf-8-sig") as f:
        file_text = f.read().strip()  

    # (id, label, start, end, query) or (id, label, start, end, query, time_org, timefix)
    for line in file_text.split("\n"):
        sample = line.split("\t")  
        sample[2], sample[3] = int(sample[2]), int(sample[3])

        if sample[0] not in label_dict:
            label_dict[sample[0]] = [sample[1:]]
        else:
            label_dict[sample[0]].append(sample[1:])

    return label_dict

train_label_dict = create_label_dict(label_path[0])
second_dataset_label_dict = create_label_dict(label_path[1])
train_label_dict.update(second_dataset_label_dict)
val_label_dict = create_label_dict(val_label_path)

In [10]:
# Define function to read data

def load_medical_records(paths):
    medical_record_dict = {}
    for data_path in paths:

        if os.path.isfile(data_path):
            file_id = data_path.split("/")[-1].split(".txt")[0]
            with open(data_path, "r", encoding="utf-8") as f:
                file_text = f.read()
                medical_record_dict[file_id] = file_text
    return medical_record_dict

train_medical_record_dict = load_medical_records(train_path)
val_medical_record_dict = load_medical_records(val_path)

In [11]:
#chect the number of data
print(len(list(train_medical_record_dict.keys()))) #1734
print(len(list(train_label_dict.keys()))) #1734
print(len(list(val_medical_record_dict.keys()))) #560
print(len(list(val_label_dict.keys()))) #560

1734
1734
560
560


In [12]:
all_medical_record_dict = {**train_medical_record_dict, **val_medical_record_dict}
all_label_dict = {**train_label_dict, **val_label_dict}

### clean data

In [13]:
# # input id (String type)
# #output the medical_record
# print(train_medical_record_dict["10"])

# # input id (String type)
# # output all labels from medical_record (list type)
# pp(train_label_dict["10"])

def check_labels(text, labels, record_id, tag=False):
    for i, label in enumerate(labels):  
        extracted_text = text[label[1]:label[2]]
        if extracted_text != label[3]:
            print(f"Error in ID {record_id}, Line {i}: {label[0]}, position: {label[1]}-{label[2]}, "
                  f"label: '{label[3]}', extracted: '{extracted_text}'")
        elif tag:
            print(f"Correct in ID {record_id}, Line {i}: {label[0]}, position: {label[1]}-{label[2]}, extracted: '{extracted_text}'")

def check_all_labels(medical_records, label_dict, tag=False):
    for record_id, text in medical_records.items():
        if record_id in label_dict:
            labels = label_dict[record_id]
            check_labels(text, labels, record_id, tag)
        else:
            print(f"ID: {record_id} has no label")

         

In [14]:
# check training data
check_all_labels(all_medical_record_dict, all_label_dict)   

Error in ID 1139, Line 16: HOSPITAL, position: 2702-2722, label: 'PLANTAGENET HOSPITAL', extracted: 'PLANTAGENE3/9 JENNIE'
Error in ID 1481, Line 21: DEPARTMENT, position: 2390-2403, label: 'SEALS Central', extracted: 'SEAKALBARRI H'
Error in ID file21297, Line 20: ORGANIZATION, position: 6045-6064, label: 'KB Home Los Angeles', extracted: 'KB Home	Los Angeles'


In [15]:
# check 1139, PLANTAGENET 3/9 JENNIE COX CLOSE Pathology ?
print(all_medical_record_dict['1139'][2702:2722])
print(all_label_dict['1139'][16])

# replace it
all_label_dict['1139'][16][3]=all_medical_record_dict['1139'][2702:2722]

PLANTAGENE3/9 JENNIE
['HOSPITAL', 2702, 2722, 'PLANTAGENET HOSPITAL']


In [16]:
# check 1481, there is no DEPARTMENT
print(all_medical_record_dict['1481'][2390:2403])
print(all_label_dict['1481'][21])

# remove it 
all_label_dict['1481'].pop(21)

SEAKALBARRI H
['DEPARTMENT', 2390, 2403, 'SEALS Central']


['DEPARTMENT', 2390, 2403, 'SEALS Central']

In [17]:
# check file21297, index 6047 is '\t'
all_medical_record_dict['file21297'][6045:6064]

# replace it
all_medical_record_dict['file21297'] = val_medical_record_dict['file21297'][:6047] + ' ' + val_medical_record_dict['file21297'][6048:]

In [18]:
all_keys = list(all_medical_record_dict.keys())
random.shuffle(all_keys)
train_size = int(0.8 * len(all_keys))
val_size = len(all_keys) - train_size

train_keys = all_keys[:train_size]
val_keys = all_keys[train_size:]

train_medical_record_dict = {key: all_medical_record_dict[key] for key in train_keys}
train_label_dict = {key: all_label_dict[key] for key in train_keys}

val_medical_record_dict = {key: all_medical_record_dict[key] for key in val_keys}
val_label_dict = {key: all_label_dict[key] for key in val_keys}

print("New Train Set Size:", len(train_medical_record_dict))
print("New Validation Set Size:", len(val_medical_record_dict))

New Train Set Size: 1835
New Validation Set Size: 459


### create label type table

In [19]:
#add special token [other] in label list
labels_type = list(set( [label[0] for labels in train_label_dict.values() for label in labels] ))
labels_type = ["OTHER"] + labels_type 
labels_num = len(labels_type)
# print(labels_type)
# print("The number of labels:", labels_num)
labels_type_table = {label_name:id for id, label_name in enumerate(labels_type)}
print(labels_type_table)

{'OTHER': 0, 'MEDICALRECORD': 1, 'LOCATION-OTHER': 2, 'DATE': 3, 'ZIP': 4, 'ORGANIZATION': 5, 'IDNUM': 6, 'HOSPITAL': 7, 'SET': 8, 'COUNTRY': 9, 'TIME': 10, 'PATIENT': 11, 'CITY': 12, 'PHONE': 13, 'ROOM': 14, 'AGE': 15, 'URL': 16, 'STREET': 17, 'DEPARTMENT': 18, 'DOCTOR': 19, 'DURATION': 20, 'STATE': 21}


In [20]:
# fix it
labels_type_table={'OTHER': 0, 'PATIENT': 1, 'DOCTOR': 2, 'CITY': 3, 'ROOM': 4, 'STREET': 5, 'MEDICALRECORD': 6, 'DEPARTMENT': 7, 'LOCATION-OTHER': 8, 'COUNTRY': 9, 'IDNUM': 10, 'STATE': 11, 'AGE': 12, 'SET': 13, 'HOSPITAL': 14, 'DATE': 15, 'ZIP': 16, 'URL': 17, 'DURATION': 18, 'ORGANIZATION': 19, 'TIME': 20, 'PHONE': 21}
print(labels_type_table)

{'OTHER': 0, 'PATIENT': 1, 'DOCTOR': 2, 'CITY': 3, 'ROOM': 4, 'STREET': 5, 'MEDICALRECORD': 6, 'DEPARTMENT': 7, 'LOCATION-OTHER': 8, 'COUNTRY': 9, 'IDNUM': 10, 'STATE': 11, 'AGE': 12, 'SET': 13, 'HOSPITAL': 14, 'DATE': 15, 'ZIP': 16, 'URL': 17, 'DURATION': 18, 'ORGANIZATION': 19, 'TIME': 20, 'PHONE': 21}


In [21]:
#check the label_type is enough for validation
val_labels_type = list(set( [label[0] for labels in val_label_dict.values() for label in labels] ))
for val_label_type in val_labels_type:
    if val_label_type not in labels_type:
        print("Special label in validation:", val_label_type)

In [22]:
# Function to count label distribution
def count_label_distribution(label_dict, labels_type_table):
    label_counts = {label: 0 for label in labels_type_table.keys()}
    for labels in label_dict.values():
        for label_info in labels:
            label = label_info[0]  # Extract label name
            if label in label_counts:
                label_counts[label] += 1
    return label_counts

In [23]:
# Calculate label distribution
train_label_distribution = count_label_distribution(train_label_dict, labels_type_table)
val_label_distribution = count_label_distribution(val_label_dict, labels_type_table)

# Print results
print("Train Label Distribution:")
for label, count in train_label_distribution.items():
    print(f"  {label}: {count}")

print("\nValidation Label Distribution:")
for label, count in val_label_distribution.items():
    print(f"  {label}: {count}")

Train Label Distribution:
  OTHER: 0
  PATIENT: 1885
  DOCTOR: 6987
  CITY: 1020
  ROOM: 1
  STREET: 980
  MEDICALRECORD: 1912
  DEPARTMENT: 1135
  LOCATION-OTHER: 7
  COUNTRY: 3
  IDNUM: 3918
  STATE: 952
  AGE: 146
  SET: 11
  HOSPITAL: 1915
  DATE: 5108
  ZIP: 994
  URL: 3
  DURATION: 28
  ORGANIZATION: 113
  TIME: 1256
  PHONE: 9

Validation Label Distribution:
  OTHER: 0
  PATIENT: 479
  DOCTOR: 1745
  CITY: 256
  ROOM: 0
  STREET: 240
  MEDICALRECORD: 475
  DEPARTMENT: 268
  LOCATION-OTHER: 3
  COUNTRY: 2
  IDNUM: 936
  STATE: 233
  AGE: 38
  SET: 3
  HOSPITAL: 478
  DATE: 1285
  ZIP: 244
  URL: 0
  DURATION: 6
  ORGANIZATION: 47
  TIME: 279
  PHONE: 2


## analysis

In [26]:
import json

def load_metrics_from_file(filename):
    with open(filename, 'r') as file:
        metrics = json.load(file)
    return metrics

filename = 'training_stat_longformerCRF.json'
training_stats = load_metrics_from_file(filename)

#print(training_stats)

In [27]:
def calculate_micro_metrics(confusion_matrix):
    micro_TP = sum(scores["TP"] for scores in confusion_matrix.values())
    micro_FP = sum(scores["FP"] for scores in confusion_matrix.values())
    micro_FN = sum(scores["FN"] for scores in confusion_matrix.values())

    micro_precision = micro_TP / (micro_TP + micro_FP) if micro_TP + micro_FP > 0 else 0
    micro_recall = micro_TP / (micro_TP + micro_FN) if micro_TP + micro_FN > 0 else 0
    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall) if micro_precision + micro_recall > 0 else 0

    return micro_precision, micro_recall, micro_f1

def calculate_macro_metrics(confusion_matrix):
    num_labels = len(confusion_matrix)-1   #remove 'other'
    macro_precision = macro_recall = macro_f1 = 0

    for scores in confusion_matrix.values():
        precision = scores["TP"] / (scores["TP"] + scores["FP"]) if scores["TP"] + scores["FP"] > 0 else 0
        recall = scores["TP"] / (scores["TP"] + scores["FN"]) if scores["TP"] + scores["FN"] > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

        macro_precision += precision
        macro_recall += recall
        macro_f1 += f1

    macro_precision /= num_labels
    macro_recall /= num_labels
    macro_f1 /= num_labels

    return macro_precision, macro_recall, macro_f1

In [28]:

for epoch_data in training_stats:
    train_micro_precision, train_micro_recall, train_micro_f1 = calculate_micro_metrics(epoch_data['train_confusion_matrix'])
    val_micro_precision, val_micro_recall, val_micro_f1 = calculate_micro_metrics(epoch_data['val_confusion_matrix'])

    train_macro_precision, train_macro_recall, train_macro_f1 = calculate_macro_metrics(epoch_data['train_confusion_matrix'])
    val_macro_precision, val_macro_recall, val_macro_f1 = calculate_macro_metrics(epoch_data['val_confusion_matrix'])

    print(f"Epoch {epoch_data['epoch']}")
    print(f"Train Micro: Precision - {train_micro_precision}, Recall - {train_micro_recall}, F1 - {train_micro_f1}")
    print(f"Validation Micro: Precision - {val_micro_precision}, Recall - {val_micro_recall}, F1 - {val_micro_f1}")
    print(f"Train Macro: Precision - {train_macro_precision}, Recall - {train_macro_recall}, F1 - {train_macro_f1}")
    print(f"Validation Macro: Precision - {val_macro_precision}, Recall - {val_macro_recall}, F1 - {val_macro_f1}")
    print("-" * 50)

Epoch 0
Train Micro: Precision - 0.47317856260805136, Recall - 0.6751356684755797, F1 - 0.5563977464134287
Validation Micro: Precision - 0.94505805720759, Recall - 0.9515255203877958, F1 - 0.9482807615799943
Train Macro: Precision - 0.3266505303306142, Recall - 0.3525592287926214, F1 - 0.3250544173903867
Validation Macro: Precision - 0.5388141820706096, Recall - 0.5494074310339736, F1 - 0.5439460901939769
--------------------------------------------------
Epoch 1
Train Micro: Precision - 0.9462044231918709, Recall - 0.9483050250193812, F1 - 0.9472535595487425
Validation Micro: Precision - 0.9536031589338598, Recall - 0.9640718562874252, F1 - 0.9588089330024814
Train Macro: Precision - 0.599243673392151, Recall - 0.5525470210100747, F1 - 0.5511802778204754
Validation Macro: Precision - 0.6051050127789189, Recall - 0.5785441898095675, F1 - 0.5800316207793217
--------------------------------------------------
Epoch 2
Train Micro: Precision - 0.963189967945331, Recall - 0.9635633237014589,

In [29]:
#指定某個epoch，列出該epoch下各分類的狀況
#這是看訓練

def print_class_performance(epoch_data, epoch_number):
    if epoch_data['epoch'] == epoch_number:
        print(f"Performance for Epoch {epoch_number}:")
        confusion_matrix = epoch_data['train_confusion_matrix']  #要看驗證改這邊，把它改成val_confusion_matrix
        for label, scores in confusion_matrix.items():
            precision = scores["TP"] / (scores["TP"] + scores["FP"]) if scores["TP"] + scores["FP"] > 0 else 0
            recall = scores["TP"] / (scores["TP"] + scores["FN"]) if scores["TP"] + scores["FN"] > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

            print(f"Label: {label}")
            print(f"   Precision: {precision:.4f}")
            print(f"   Recall: {recall:.4f}")
            print(f"   F1 Score: {f1:.4f}")
            print("-" * 20)


epoch_number = 15 
for epoch_data in training_stats:
    print_class_performance(epoch_data, epoch_number)


Performance for Epoch 15:
Label: OTHER
   Precision: 0.0000
   Recall: 0.0000
   F1 Score: 0.0000
--------------------
Label: PATIENT
   Precision: 0.9905
   Recall: 0.9936
   F1 Score: 0.9921
--------------------
Label: DOCTOR
   Precision: 0.9871
   Recall: 0.9871
   F1 Score: 0.9871
--------------------
Label: CITY
   Precision: 0.9756
   Recall: 0.9804
   F1 Score: 0.9780
--------------------
Label: ROOM
   Precision: 1.0000
   Recall: 1.0000
   F1 Score: 1.0000
--------------------
Label: STREET
   Precision: 0.9980
   Recall: 0.9959
   F1 Score: 0.9969
--------------------
Label: MEDICALRECORD
   Precision: 0.9969
   Recall: 0.9974
   F1 Score: 0.9971
--------------------
Label: DEPARTMENT
   Precision: 0.9537
   Recall: 0.9612
   F1 Score: 0.9574
--------------------
Label: LOCATION-OTHER
   Precision: 0.8571
   Recall: 0.8571
   F1 Score: 0.8571
--------------------
Label: COUNTRY
   Precision: 0.0000
   Recall: 0.0000
   F1 Score: 0.0000
--------------------
Label: IDNUM
   Pr