## Data Preprocessing

In [1]:
import pandas as pd
import json
import pickle

import os
import re
import requests


In [2]:
# CSV path of every split of the randomly partitioned dataset, keyed by split name.
files = {split: './dataset/rand/{}.csv'.format(split)
         for split in ('train', 'dev', 'test')}

# files = {'train': './dataset/stratified/train.csv', 
#          'dev': './dataset/stratified/dev.csv',
#          'test': './dataset/stratified/test.csv'}

# Maps the question-style column headers of the annotation CSVs to short role
# names. The keys are used verbatim to select columns from the loaded frame, so
# they must match the CSV headers exactly (note the irregular spacing after the
# question numbers). 'Text' is kept last: downstream code treats the final
# column as the source document and every earlier column as a role.
replace_to = {'1)  Which infectious disease caused the outbreak?': 'disease',
              '2)  In which country is the outbreak taking place?': 'country',
              '3)  In which province is the outbreak taking place?': 'province',
              '4)  In which city/town is the outbreak taking place?': 'city',
              '5)  Check and fill country Geo Code (e.g. 1794299):': 'country Code',
              '6)  Check and fill province Geo Code (e.g. 1794299):': 'province Code',
              '7) Check and fill city Geo Code (e.g. 1815286):': 'city Code',
              '8)  Which virus or bacteria caused the outbreak?': 'virus',
              '9)  What symptoms were experienced by the infected victims?': 'symptoms',
              '11) What is the type of the victims?': 'victims',
              'Text': 'Text'}

# replace_to = {'1)  Which infectious disease caused the outbreak?': 'disease',
#               '2)  In which country is the outbreak taking place?': 'country',
#               '3)  In which province is the outbreak taking place?': 'province',
#               '4)  In which city/town is the outbreak taking place?': 'city',
#               '8)  Which virus or bacteria caused the outbreak?': 'virus',
#               '9)  What symptoms were experienced by the infected victims?': 'symptoms',
#               '11) What is the type of the victims?': 'victims',
#               'Text': 'Text'}



### load original data, filter and save to T5 generation format

In [48]:
def process_save(mode, finetune_dir):
    """Convert one dataset split into the text-to-text generation format and save it.

    Reads the CSV registered under ``files[mode]``, keeps the columns listed in
    ``replace_to`` (renamed to their short role names) and replaces missing
    cells with the ``[None]`` placeholder. The last kept column is the source
    text; every other column is a role.

    Three files are written into ``finetune_dir``:

    * ``{mode}_input.json``  -- list of ``"Extract: <text>"`` prompts
    * ``{mode}_target.json`` -- list of target strings, one
      ``<|role|> value [And] value <|/role|>`` segment per role
    * ``{mode}_all.pkl``     -- dict holding both lists plus the raw per-sample
      role values under ``'all'``

    Args:
        mode: key into the module-level ``files`` dict (e.g. ``'train'``).
        finetune_dir: existing directory that receives the output files.
    """
    df = pd.read_csv(files[mode])
    df = df[list(replace_to.keys())].rename(columns=replace_to).fillna("[None]")
    values = df.values.tolist()
    keys = df.columns.tolist()
    role_keys = keys[:-1]  # every column except the trailing text column

    print(keys)
    inputs = []
    targets = []
    events = []
    for sample_i in values:
        inputs.append("Extract: " + sample_i[-1])
        tar_i = []
        event_i = {}
        # zip stops after the last role, so the text column is never paired.
        for role, raw_value in zip(role_keys, sample_i):
            # Multi-valued cells are "; "-separated in the CSV and
            # " [And] "-separated in the generation target.
            new_tar_i_r = " [And] ".join(str(raw_value).split("; "))
            # str.replace returns a new string; the result must be kept,
            # otherwise doubled spaces survive into the targets.
            new_tar_i_r = new_tar_i_r.replace("  ", " ")
            tar_i.append("<|{0}|> {1} <|/{0}|>".format(role, new_tar_i_r))
            event_i[role] = raw_value
        targets.append(" ".join(tar_i))
        events.append(event_i)

    with open('{}/{}_input.json'.format(finetune_dir, mode), 'w') as f:
        json.dump(inputs, f, indent=4)

    with open('{}/{}_target.json'.format(finetune_dir, mode), 'w') as f:
        json.dump(targets, f, indent=4)

    with open('{}/{}_all.pkl'.format(finetune_dir, mode), 'wb') as f:
        pickle.dump({
            'input': inputs,
            'target': targets,
            'all': events
        }, f)

# NOTE(review): the directory name says "stratified" while `files` currently
# points at the ./dataset/rand splits -- confirm the two are meant to match.
finetune_dir = "./dataset/finetuned_data/stratified_direct_allrole"
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(finetune_dir, exist_ok=True)

# Convert and save every configured split.
for m in files.keys():
    process_save(m, finetune_dir)

['disease', 'country', 'province', 'city', 'country Code', 'province Code', 'city Code', 'virus', 'symptoms', 'victims', 'Text']
['disease', 'country', 'province', 'city', 'country Code', 'province Code', 'city Code', 'virus', 'symptoms', 'victims', 'Text']
['disease', 'country', 'province', 'city', 'country Code', 'province Code', 'city Code', 'virus', 'symptoms', 'victims', 'Text']


## For evaluation

### set up role_list and special_tokens

In [3]:
import re
import json
from copy import deepcopy

# Roles that can appear as <|role|> ... <|/role|> segments in a target string.
ROLE_LIST = [
    'disease', 'country', 'province', 'city',
    'country Code', 'province Code', 'city Code',
    'virus', 'symptoms', 'victims',
]

# Opening and closing tag for every role (in role order), followed by the
# empty-value placeholder and the multi-value separator.
special_tokens = [tag.format(role) for role in ROLE_LIST for tag in ('<|{}|>', '<|/{}|>')]
special_tokens += ['[None]', '[And]']

# ROLE_LIST = ['disease', 'country', 'province', 'city', 'virus', 'symptoms', 'victims']
# special_tokens = ['<|disease|>', '<|/disease|>', '<|country|>', '<|/country|>', '<|province|>', '<|/province|>', '<|city|>', '<|/city|>', 
#             '<|virus|>', '<|/virus|>', '<|symptoms|>', '<|/symptoms|>', '<|victims|>', '<|/victims|>', '[None]', '[And]']

extraction example

In [4]:
# Two sample strings in the tagged target format. t1 has multi-valued city and
# city Code roles; t2 has doubled spaces around the [And] separator and several
# [None] placeholders.
t1 = "<|disease|> anthrax <|/disease|> <|country|> United States <|/country|> <|province|> Texas <|/province|> <|city|> Crockett County [And] Kinney County [And] Sutton County [And] Uvalde County [And] Val Verde County <|/city|> <|country Code|> 6252001 <|/country Code|> <|province Code|> 4736286 <|/province Code|> <|city Code|> 4738723 [And] 5519699 [And] 5524590 [And] 5531955 [And] 5532889 <|/city Code|> <|virus|> Bacillus anthracis <|/virus|> <|symptoms|> [None] <|/symptoms|> <|victims|> Animal <|/victims|>"
t2 = "<|disease|> Salmonellosis <|/disease|> <|country|> United States <|/country|> <|province|> Minnesota  [And] Wisconsin  <|/province|> <|city|> [None] <|/city|> <|country Code|> 6252001 <|/country Code|> <|province Code|> 5037779 [And] 5279468 <|/province Code|> <|city Code|> [None] <|/city Code|> <|virus|> Salmonella <|/virus|> <|symptoms|> [None] <|/symptoms|> <|victims|> Human <|/victims|>"
def extract_outbreak(text, role_list=None):
    """Parse a tagged event string into a list of ``(value, role)`` tuples.

    The string is scanned left to right for opening tags ``<|role|>``. For each
    known role the text up to the matching ``<|/role|>`` is split on the
    ``" [And] "`` separator (and on a bare lower-case ``[and]``), and every
    non-empty piece is emitted lower-cased. ``[None]`` placeholders, in any
    letter case, are dropped. Opening tags with an unknown role, or without a
    closing tag, are skipped.

    Args:
        text: tagged string such as ``"<|disease|> anthrax <|/disease|> ..."``.
        role_list: roles to accept; defaults to the module-level ``ROLE_LIST``.

    Returns:
        List of ``(lower-cased value, role)`` tuples in order of appearance.
    """
    if role_list is None:
        role_list = ROLE_LIST
    # Raw strings: "\|" is not a valid escape in a normal string literal.
    open_tag = re.compile(r'<\|[^/>][^>]*\|>')

    output = []
    tag_s = open_tag.search(text)
    while tag_s:
        # Continue scanning after the opening tag.
        text = text[tag_s.end():]
        r_type = tag_s.group()[2:-2]
        if r_type in role_list:
            # Escape the role so it is always matched literally.
            tag_e = re.search(r'<\|/' + re.escape(r_type) + r'\|>', text)
            if tag_e:
                arg = text[:tag_e.start()].strip()
                for chunk in arg.split(' [And] '):
                    for piece in chunk.split('[and]'):
                        piece = piece.strip()
                        # Test the stripped piece itself: checking the
                        # unstripped chunk let "[None] " through as a value.
                        if piece and piece.lower() != '[none]':
                            output.append((piece.lower(), r_type))
                text = text[tag_e.end():]
        tag_s = open_tag.search(text)
    return output

# Run the parser on both sample strings.
o1 = extract_outbreak(t1)
o2 = extract_outbreak(t2)

In [5]:
o1

[('anthrax', 'disease'),
 ('united states', 'country'),
 ('texas', 'province'),
 ('crockett county', 'city'),
 ('kinney county', 'city'),
 ('sutton county', 'city'),
 ('uvalde county', 'city'),
 ('val verde county', 'city'),
 ('6252001', 'country Code'),
 ('4736286', 'province Code'),
 ('4738723', 'city Code'),
 ('5519699', 'city Code'),
 ('5524590', 'city Code'),
 ('5531955', 'city Code'),
 ('5532889', 'city Code'),
 ('bacillus anthracis', 'virus'),
 ('animal', 'victims')]

### load results

load predicted results and construct database

In [6]:
# Synonym tables, one JSON file per role, keyed by role name.
all_aliases = {}
for _alias_role in ("city", "country", "disease", "province", "virus", "symptoms"):
    with open("dataset/synonym/{}.json".format(_alias_role)) as f:
        all_aliases[_alias_role] = json.load(f)

In [7]:
# Lower-case every alias so lookups against the lower-cased extracted values match.
for alias_table in all_aliases.values():
    for canonical, names in alias_table.items():
        # Only the value of an existing key is replaced, so iterating is safe.
        alias_table[canonical] = [name.lower() for name in names]


In [8]:
# For every role with a synonym table: gold value -> list of accepted aliases.
mapped_aliases = {"country": {}, "city": {}, "province": {}, "virus": {}, "disease": {}, "symptoms": {}}
searched = []

# Alternative prediction files:
# test_results = json.load(open(""))
# test_results = json.load(open("./predictions/direct_T5/test.pred.json"))
# test_results = json.load(open("./output/direct_T5base_withoutcode/20230306_190639/pred.test.json"))
# test_results = json.load(open("./output/direct_bart_base_withoutcode/20230409_221130/pred.test.json"))
# test_results = json.load(open("./output/stratified_direct_T5base_withoutcode/20230404_215037/pred.test.json"))
# test_results = json.load(open("./output/stratified_direct_bart_base_withoutcode/20230409_221725/pred.test.json"))

# test_results = json.load(open("./output/direct_T5base_allrole/20230228_155419/pred.test.json"))
# test_results = json.load(open("./output/direct_bart_base_allrole/20230512_212611/pred.test.json"))
# NOTE(review): the alias map is built from the gold events of this file, while
# the scoring cell loads a different prediction file -- confirm both files
# carry the same gold events.
with open("./predictions/evaluation_galactica_new.json") as f:
    test_results = json.load(f)

# map synonyms: attach to each gold value the first alias group that contains it
for result in test_results:
    for value, role in extract_outbreak(result['true_event']):  # gold text
        if role not in all_aliases or value in mapped_aliases[role]:
            continue
        for alias_group in all_aliases[role].values():
            if value in alias_group:
                mapped_aliases[role][value] = alias_group
                break

In [21]:
# test_results = json.load(open("./output/direct_T5base_allrole/20230228_155419/pred.test.json"))
# test_results = json.load(open("./output/direct_bart_base_allrole/20230512_212611/pred.test.json"))
with open("./predictions/evaluation_opt_new.json") as f:
    test_results = json.load(f)

# gold_test:       gold (value, role) pairs, without aliases (recall denominator)
# gold_alias_test: gold pairs plus every accepted alias
# pred_test:       predicted pairs (precision denominator)
# match_test:      predicted pairs that hit a gold pair or one of its aliases
gold_test, gold_alias_test, pred_test, match_test = [], [], [], []
gold_text_count = 0
for result in test_results:
    gs = extract_outbreak(result['true_event'])  # gold text
    gold_test.extend(gs)
    # Expand each gold value with its known aliases (same role).
    alters = [
        (alter, role)
        for value, role in gs
        for alter in mapped_aliases.get(role, {}).get(value, [])
    ]
    gs = gs + alters
    ps = extract_outbreak(result['generated_event'])  # pred text

    # this is attribute_all, entity pair level f1 score
    pred_test.extend(ps)
    # The tuples are immutable, so a shallow copy is enough to consume matches.
    remaining = list(gs)
    for p in ps:
        if p in remaining:
            # Each candidate can be matched only once.
            remaining.remove(p)
            match_test.append(p)
    # NOTE(review): the aliases of an already matched gold value stay in
    # `remaining`, so predicting two aliases of one gold entity yields two
    # matches against a single gold item -- confirm this is acceptable.

    gold_alias_test.extend(gs)

### Compute F1

Overall F1

In [22]:
def safe_div(num, denom):
    """Return ``num / denom``, or 0 when ``denom`` is not strictly positive."""
    return num / denom if denom > 0 else 0

def compute_f1(predicted, gold, matched):
    """Return ``(precision, recall, f1)`` computed from three counts.

    Args:
        predicted: number of predicted items.
        gold: number of gold items.
        matched: number of predicted items counted as correct.

    Every ratio goes through ``safe_div``, so an empty denominator gives 0.
    """
    p = safe_div(matched, predicted)
    r = safe_div(matched, gold)
    return p, r, safe_div(2 * p * r, p + r)
    

# Micro-averaged scores over every role: counts first, then (precision, recall, F1).
n_pred, n_gold, n_match = len(pred_test), len(gold_test), len(match_test)
print(n_pred, n_gold, n_match)
print("total F1: ", compute_f1(n_pred, n_gold, n_match))

1447 1416 691
total F1:  (0.47753973738769867, 0.4879943502824859, 0.4827104435906392)


Individual F1

In [23]:
classes = ['disease', 'country', 'province', 'city', 'country Code', 'province Code', 'city Code', 'virus', 'symptoms', 'victims']

# Separate predicted, gold and matched values for each class
pred_labels = {c: [] for c in classes}
gold_labels = {c: [] for c in classes}
match_labels = {c: [] for c in classes}
for pairs, buckets in ((pred_test, pred_labels),
                       (gold_test, gold_labels),
                       (match_test, match_labels)):
    for value, role in pairs:
        if role in buckets:
            buckets[role].append(value)


# Calculate F1 score (in percent) for each class.
# Every class is reported, including ones with no gold or no predicted items:
# compute_f1 already yields 0 for empty denominators, and skipping a class
# would silently drop a column from the '&'-joined table row below.
F1s = {}
for c in classes:
    report = compute_f1(len(pred_labels[c]), len(gold_labels[c]), len(match_labels[c]))
    F1s[c] = round(report[2]*100, 2)

print(F1s)
print(' & '.join(str(score) for score in F1s.values()))


{'disease': 66.89, 'country': 82.51, 'province': 49.54, 'city': 43.09, 'country Code': 32.24, 'province Code': 0.62, 'city Code': 0, 'virus': 52.99, 'symptoms': 54.68, 'victims': 94.31}
66.89 & 82.51 & 49.54 & 43.09 & 32.24 & 0.62 & 0 & 52.99 & 54.68 & 94.31


In [16]:
## F1 score without predicting code
# Pool the per-class counts of every role except the three Geo Code roles.
non_code_roles = ['disease', 'country', 'province', 'city', 'virus', 'symptoms', 'victims']
num_g = sum(len(gold_labels[c]) for c in non_code_roles)
num_p = sum(len(pred_labels[c]) for c in non_code_roles)
num_m = sum(len(match_labels[c]) for c in non_code_roles)

report = compute_f1(num_p, num_g, num_m)
print(report[2])



0.6145363408521303


# Data Preprocessing for DE-PPN (OneIE is for sentence-level?)
Parallel Prediction for Document-level Event Extraction

In [116]:
# CSV path of every split of the randomly partitioned dataset, keyed by split name.
files = {split: './dataset/rand/{}.csv'.format(split)
         for split in ('train', 'dev', 'test')}

import nltk
# nltk.download('punkt')


# For DE-PPN
# data format
# lists [doc_id, dict_keys(['sentences', 'ann_valid_mspans', 'ann_valid_dranges', 'ann_mspan2dranges', 'ann_mspan2guess_field', 'recguid_eventname_eventdict_list'])]
# sentences: list of sentences e.g. ["证券代码：300142证券简称：沃森生物公告编号：2016-072", "云南沃森生物技术股份有限公司关于股东解除股权质押的公告", ... ]
# mspan: list of entity text  e.g. ["300142", "沃森生物", ... ]
# dranges: list of span position [sentence_index, start_index, end_index] e.g. [[0, 5, 11], [0, 16, 20], ... ] span_text = sentences[sentence_index][start_index:end_index]
# mspan2dranges: dictionary map mspan text to drange e.g. {"300142": [0,5,11], "沃森生物": [0, 16,20], ...}
# mspan2guess_field: dictionary map mspan text to role-type e.g. {"300142": "StockCode", "沃森生物": "StockAbbr", ...}
# recguid_eventname_eventdict_list: list of events [event_1, event_2, ...], event_i: [id (start from 0), event_type (string), attributes ({role: text_span})]


# 'disease', 'country', 'province', 'city', 'virus', 'symptoms', 'victims', 'Text']
# span based method could not predict unseen word ..., check on Zihao's NER results
import re
import difflib


def find_word_exact(sentences, query_word):
    """Locate a whole-word, case-insensitive occurrence of ``query_word``.

    Sentences are searched in order; the search stops at the first sentence
    that contains the query.

    Args:
        sentences: list of sentence strings.
        query_word: word or phrase to find; surrounding whitespace is ignored.

    Returns:
        A list of ``[sentence_index, start, end]`` for every occurrence inside
        the first matching sentence, or ``None`` when nothing matches or the
        query is empty.
    """
    # Strip all surrounding whitespace (not just one trailing space) and guard
    # the empty query, which used to raise IndexError on query_word[-1].
    query_word = query_word.strip()
    if not query_word:
        return None
    # The pattern does not depend on the sentence, so build it once.
    pattern = re.compile(r'\b' + re.escape(query_word) + r'\b', re.IGNORECASE)
    for sentence_index, sentence in enumerate(sentences):
        result = [[sentence_index, *match.span()] for match in pattern.finditer(sentence)]
        if result:
            return result
    return None

def find_word_partial(sentences, query_words):
    """Locate the first word that starts with any token of ``query_words``.

    The query is split on whitespace. Sentences are searched in order and,
    within a sentence, the tokens are tried in query order; each token matches
    case-insensitively as a prefix of a word.

    Args:
        sentences: list of sentence strings.
        query_words: whitespace-separated tokens to look for.

    Returns:
        A single flat ``[sentence_index, start, end]`` for the first hit, or
        ``None`` when nothing matches or the query has no tokens. Note that
        this is a flat triple, not a list of triples as returned by
        ``find_word_exact`` and ``find_word_sim``.
    """
    # The body previously read an unassigned local `query_word`, which raised
    # UnboundLocalError on every call. split() without an argument also drops
    # the empty tokens produced by doubled spaces; an empty token would
    # otherwise match any word.
    tokens = query_words.split()
    if not tokens:
        return None
    patterns = [
        re.compile(r'\b' + re.escape(token) + r'\w*\b', re.IGNORECASE)
        for token in tokens
    ]
    for sentence_index, sentence in enumerate(sentences):
        for pattern in patterns:
            match = pattern.search(sentence)
            if match:
                return [sentence_index, *match.span()]
    return None

def find_word_sim(sentences, query_word, similarity_threshold=0.9):
    """Locate the first single word that is similar enough to ``query_word``.

    Every ``\\w+`` token of every sentence is compared, case-insensitively, with
    the whole query using ``difflib.SequenceMatcher.ratio()``.

    Args:
        sentences: list of sentence strings.
        query_word: text to look for; surrounding whitespace is ignored.
        similarity_threshold: minimum ratio (inclusive) for a token to match.

    Returns:
        ``[[sentence_index, start, end]]`` for the first matching token, or
        ``None`` when no token reaches the threshold or the query is empty.
    """
    query = query_word.strip().lower()
    if not query:
        return None
    for sentence_index, sentence in enumerate(sentences):
        # Iterate over match objects so the span belongs to the token that was
        # compared; sentence.index(word) could return the offset of an earlier
        # substring (e.g. "rat" inside "operation").
        for match in re.finditer(r'\b\w+\b', sentence):
            similarity = difflib.SequenceMatcher(None, query, match.group().lower()).ratio()
            if similarity >= similarity_threshold:
                return [[sentence_index, match.start(), match.end()]]
    return None


# Convert every split into the DE-PPN document format: each role value is
# located in the sentence-split text and stored with its character range.
for mode in ['train', 'dev', 'test']:
    df = pd.read_csv(files[mode])
    # NOTE(review): read but never used below -- documents are keyed by their
    # row position (str(idx)), not by Archive_id. Confirm this is intended.
    ids = df['Archive_id']
    df = df[list(replace_to.keys())].rename(columns=replace_to).fillna("[None]")
    # Cast to str before lower-casing: str.lower raises TypeError on non-string
    # cells (e.g. numeric Geo Code columns). This also avoids the deprecated
    # DataFrame.applymap.
    df = df.apply(lambda column: column.astype(str).str.lower())
    values = df.values.tolist()
    keys = df.columns.tolist()

    # Per role: [span text, row index] of every value that could not be located.
    not_found = {i:[] for i in keys[:-1]}
    all_events = []
    for idx, sample_i in enumerate(values):
        # The text column is already lower-cased above.
        sentences = nltk.sent_tokenize(sample_i[-1])
        event_i = [0, "outbreak", {}]
        valid_spans = []
        span2role = {}
        span2range = {}
        ranges = []

        # Raw values already located for this document; a value that repeats
        # under a later role is not searched again.
        found = []
        for r in range(len(keys[:-1])):
            # Every cell is lower-case here, so these three spellings cover
            # the "no value" placeholders.
            if sample_i[r] in ("[none]", "unknown", "none"):
                event_i[2][keys[r]] = []
                continue
            tar_i_r = sample_i[r].split("; ")
            valid_i = []

            for span_i in tar_i_r:
                # Skip empty pieces (e.g. from a trailing "; ") and repeats.
                if not span_i.strip() or span_i in found:
                    continue
                result = find_word_exact(sentences, span_i)
                # The find_word_partial fallback is disabled.
                if not result:
                    result = find_word_sim(sentences, span_i)
                if not result:
                    not_found[keys[r]].append([span_i, idx])
                else:
                    # Use the text as it appears in the sentence (first hit).
                    new_span = sentences[result[0][0]][result[0][1]:result[0][2]]
                    found.append(span_i)
                    valid_i.append(new_span)
                    ranges.append(result)
                    span2range[new_span] = result

            for span_i in valid_i:
                span2role[span_i] = keys[r]
                valid_spans.append(span_i)

            event_i[2][keys[r]] = valid_i

        # find range
        all_events.append([str(idx), {"sentences": sentences, "spans": valid_spans, "ranges": ranges, "span2range": span2range, "span2role": span2role, "event": [event_i]}])

    out_dir = "DE-PPN/Data/epiai/"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir+mode+".json", "w") as file:
        json.dump(all_events, file, indent=4)
            



In [117]:
all_events[0]

['0',
 {'sentences': ['las vegas public health officials say dozens of people linked to a tuberculosis outbreak at a neonatal unit have tested positive for the disease.',
   'the southern nevada health district reported on monday  that of the 977 people tested, 59 showed indications of the disease and 2 showed signs of being contagious.',
   'dr joe iser, chief medical officer at the health district, says the report demonstrates the importance of catching tuberculosis early.',
   "health officials tested hundreds of babies, family members, and staff who were at summerlin hospital medical center's neonatal intensive care unit this past summer [2013], saying they wanted to take extra precautions after the death of a mother and her twin babies.",
   'they contacted the parents of about 140 babies who were at the unit between mid-may and mid-august [2013].'],
  'spans': ['tuberculosis', 'nevada', 'las vegas'],
  'ranges': [[[0, 67, 79]], [[1, 13, 19]], [[0, 0, 9]]],
  'span2range': {'tuber

In [105]:
values[-1]

['ebolavirus  infection',
 'ghana ',
 'greater accra region',
 'accra ',
 'ebolavirus ',
 '[none]',
 'human',
 'the american who was suspected to be carrying the ebolavirus at the nyaho clinic in accra is reported dead after blood testing on him revealed signs of the disease were glaring. the american, name withheld, died yesterday afternoon  while under surveillance at the infirmary. he arrived from guinea on sunday  and reported to the clinic for medical attention .\n\nthe initial tests run on the american, now deceased, according to the source, were inconclusive because the officials used the wrong reagent. the sources, who are medical practitioners, told joy news, "the test should have taken noguchi  not more than 5 hours. but myjoyonline.com learnt from the public relations officer of the health ministry, tony goodman, that his outfit had requested some reagents from the kwame nkrumah university of science and technology [kumasi, ghana] to further help with the investigations. sig

In [83]:
# 'event' already holds a list of [index, event_type, attributes] triples, so
# iterate it directly; wrapping it in another list makes the unpack fail.
for i, t, a in all_events[0][1]['event']:
    print(i)

0


In [78]:
# Hand-written event triple (index, event type, role -> list of spans) used to
# try out the three-way unpacking.
i, t, a = [0,
           'outbreak',
           {'disease': ['tuberculosis'],
            'country': [],
            'province': ['nevada'],
            'city': ['las vegas'],
            'virus': [],
            'victims': []}]

In [79]:
# 'event' is a one-element list of triples; unpack its first (only) event.
i, t, a = all_events[0][1]['event'][0]

In [9]:
# Reference sample in the original DE-PPN data format; the context manager
# closes the file handle once it has been read.
with open("DE-PPN/Data/sample_500.json", "r") as f:
    sample_fin_data = json.load(f)

In [24]:

    return None

# Example usage:
# NOTE: this rebinds the global `sentences` used by the conversion loop.
sentences = [
    "Some fungal infections can affect humans as well.",
    "fungal disease are common in plants.",
    "Bacterial infections disease can also be harmful.",
]

words = "fungal disease"

# Whole-phrase lookup: only the second sentence contains the exact phrase.
results = find_word_exact(sentences, words)
print("Results: ", results)


# Token-wise prefix lookup: "fungal" already occurs in the first sentence.
results = find_word_partial(sentences, words)
print("Results: ", results)


Results:  [1, 0, 14]
Results:  [0, 5, 11]


In [None]:


## For OneIE
# doc_id = Archive_id
# sent_id; do we need sent_id? treat doc as a whole?
# tokens = use tokenizer
# pieces
# token_lens
# sentence
# entity_mentions
## in entity_mentions: id, text, entity_type, mention_type, entity_subtype, start, end
# relation_mentions
# event_mentions
## in event mention: id, event_type, trigger {text, start, end},arguments [{entity_id, text, role}] 

