# Read/Write with zongxiong's webanno parser

In [5]:
! pip install seqeval



In [2]:
from webanno_tsv import webanno_tsv_read_file, webanno_tsv_write, Annotation
from dataclasses import dataclass, replace
import os
import utils

## Read data

In [4]:
# Read one annotated training document in WebAnno TSV format
file_name = '231sm_Low_Resource_KBP_master_README.md.tsv'
input_file_path = f'../data/train/{file_name}'  # Path to the WebAnno TSV file; change file_name to load another document
ref_doc = webanno_tsv_read_file(input_file_path)
# Report how many annotations the parsed document carries
print('number of ref doc annotations: ', len(ref_doc.annotations))

number of ref doc annotations:  6


In [6]:
# Inspect the first annotation (its tokens, layer, field and label)
ref_doc.annotations[0]

Annotation(tokens=[Token(sentence_idx=1, idx=14, start=86, end=97, text='Few-Shot_ED')], layer='de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity', field='value', label='DATASET', label_id=-1)

In [7]:
# Inspect the first sentence of the document
ref_doc.sentences[0]

Sentence(idx=1, text='# Low_Resource_KBP knowledge graph population in low resource conditions   The file "*Few-Shot_ED.json.zip*" is the ***FewEvent*** dataset for the paper accepted by WSDM 2020 ***["Meta-Learning with Dynamic-Memory-Based Prototypical Network for Few-Shot Event Detection"](https://arxiv.org/abs/1910.11621)***   ## Source of Raw Data * We first scale up the number of event types in existing datasets, including the [ACE-2005 corpus](http://projects.ldc.upenn.edu/ace/), and [TAC-KBP-2017 Event Track Data](https://tac.nist.gov/2017/KBP/Event/index.html)')

In [8]:
# Inspect the first 3 tokens; each token carries its character offsets (start, end) into the document text
ref_doc.tokens[:3]

[Token(sentence_idx=1, idx=1, start=0, end=1, text='#'),
 Token(sentence_idx=1, idx=2, start=2, end=18, text='Low_Resource_KBP'),
 Token(sentence_idx=1, idx=3, start=19, end=28, text='knowledge')]

In [9]:
# Slice the raw document text by character offsets (start inclusive, end exclusive)
ref_doc.text[15:18]

'KBP'

### Get all labeled sentences of each entity type

In [11]:
# The 10 entity types that can appear as annotation labels
label_list = [
    'CONFERENCE', 'DATASET', 'EVALMETRIC', 'LICENSE', 'ONTOLOGY',
    'PROGLANG', 'PROJECT', 'PUBLICATION', 'SOFTWARE', 'WORKSHOP'
]

# For every entity type, collect the sentences covered by each annotation of that type.
annotations_by_label = {label: [] for label in label_list}
for annotation in ref_doc.annotations:
    covered_sentences = ref_doc.annotation_sentences(annotation=annotation)
    annotations_by_label[annotation.label] += covered_sentences

# Print the collected sentences, skipping entity types that never occur in this document.
for label, sentences in annotations_by_label.items():
    if not sentences:
        continue
    print(f"Labeled sentences of {label}:")
    for sentence in sentences:
        print(sentence.text)

Labeled sentences of CONFERENCE:
# Low_Resource_KBP knowledge graph population in low resource conditions   The file "*Few-Shot_ED.json.zip*" is the ***FewEvent*** dataset for the paper accepted by WSDM 2020 ***["Meta-Learning with Dynamic-Memory-Based Prototypical Network for Few-Shot Event Detection"](https://arxiv.org/abs/1910.11621)***   ## Source of Raw Data * We first scale up the number of event types in existing datasets, including the [ACE-2005 corpus](http://projects.ldc.upenn.edu/ace/), and [TAC-KBP-2017 Event Track Data](https://tac.nist.gov/2017/KBP/Event/index.html)
Labeled sentences of DATASET:
# Low_Resource_KBP knowledge graph population in low resource conditions   The file "*Few-Shot_ED.json.zip*" is the ***FewEvent*** dataset for the paper accepted by WSDM 2020 ***["Meta-Learning with Dynamic-Memory-Based Prototypical Network for Few-Shot Event Detection"](https://arxiv.org/abs/1910.11621)***   ## Source of Raw Data * We first scale up the number of event types in e

## Write a WebAnno TSV file

### Add a dummy annotation and write the documents to new files

In [12]:
def add_one_more_annotations_in_files(folder_path="../data/val/", output_path='../results/dummy',
                                      label='SOFTWARE', num_tokens=18):
    """Add one dummy annotation to every WebAnno TSV file of a folder and write the result.

    For each ``.tsv`` file in ``folder_path`` the document is parsed, a single new
    annotation spanning the first ``num_tokens`` tokens is appended to the existing
    annotations, and the modified document is written under the same file name
    into ``output_path``. The input files are not modified.

    Args:
        folder_path: Folder containing the WebAnno TSV files to read.
        output_path: Folder the modified files are written to (created if missing).
        label: Label given to the added annotation.
        num_tokens: Number of leading document tokens covered by the added annotation.
    """
    os.makedirs(output_path, exist_ok=True)
    # Sort the listing so files are processed (and reported) in a reproducible order.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.tsv'):
            continue

        exported_doc = webanno_tsv_read_file(os.path.join(folder_path, file_name))

        # Example annotation: cover the first `num_tokens` tokens of the document.
        tokens_to_annotate = exported_doc.tokens[:num_tokens]
        new_annotation = utils.make_annotation(tokens=tokens_to_annotate, label=label)

        # Build a new document holding the original annotations plus the new one.
        predicted_doc = utils.replace_webanno_annotations(
            exported_doc, annotations=[*exported_doc.annotations, new_annotation]
        )

        # Write the modified document to a TSV file of the same name in the output folder.
        output_file_path = os.path.join(output_path, file_name)
        with open(output_file_path, "w", encoding="utf-8") as f:
            f.write(predicted_doc.tsv())

        print('number of ref doc annotations: ', len(exported_doc.annotations))
        print('number of predicted doc annotations: ', len(predicted_doc.annotations))
        print(f"Modified annotations have been written to {output_file_path}")

# Run with the default input folder ("../data/val/"); the files are written to ../results/dummy
add_one_more_annotations_in_files()

number of ref doc annotations:  0
number of predicted doc annotations:  1
Modified annotations have been written to ../results/dummy/daijifeng001_TA-FCN_master_README.md.tsv
number of ref doc annotations:  0
number of predicted doc annotations:  1
Modified annotations have been written to ../results/dummy/conversationai_unhealthy-conversations_main_README.md.tsv
number of ref doc annotations:  0
number of predicted doc annotations:  1
Modified annotations have been written to ../results/dummy/OpenBioLink_ITO_master_README.md.tsv
number of ref doc annotations:  0
number of predicted doc annotations:  1
Modified annotations have been written to ../results/dummy/poloclub_diffusion-explainer_main_README.md.tsv


In [16]:
# Repeat for the training files so that ../results/dummy also holds an output for each of them
add_one_more_annotations_in_files("../data/train")

number of ref doc annotations:  43
number of predicted doc annotations:  44
Modified annotations have been written to ../results/dummy/231sm_Reasoning_In_EE_main_README.md.tsv
number of ref doc annotations:  6
number of predicted doc annotations:  7
Modified annotations have been written to ../results/dummy/231sm_Low_Resource_KBP_master_README.md.tsv


### Remove all annotations and write the documents to new files

In [13]:
def rm_annotations_in_files(folder_path="../data/train", output_path='../results/empty'):
    """Write a copy of every WebAnno TSV file of a folder with all annotations removed.

    For each ``.tsv`` file in ``folder_path`` the document is parsed, a copy with an
    empty annotation list is created via ``dataclasses.replace``, and that copy is
    written under the same file name into ``output_path``. The input files are not modified.

    Args:
        folder_path: Folder containing the WebAnno TSV files to read.
        output_path: Folder the stripped files are written to (created if missing).
    """
    os.makedirs(output_path, exist_ok=True)
    # Sort the listing so files are processed (and reported) in a reproducible order.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.tsv'):
            continue

        exported_doc = webanno_tsv_read_file(os.path.join(folder_path, file_name))
        predicted_doc = replace(exported_doc, annotations=[])

        output_file_path = os.path.join(output_path, file_name)
        with open(output_file_path, "w", encoding="utf-8") as f:
            f.write(predicted_doc.tsv())

        print('number of ref doc annotations: ', len(exported_doc.annotations))
        print('number of predicted doc annotations: ', len(predicted_doc.annotations))
        print(f"Modified annotations have been written to {output_file_path}")
            
# Run with the default input folder ("../data/train"); the stripped files are written to ../results/empty
rm_annotations_in_files()

number of ref doc annotations:  43
number of predicted doc annotations:  0
Modified annotations have been written to ../results/empty/231sm_Reasoning_In_EE_main_README.md.tsv
number of ref doc annotations:  6
number of predicted doc annotations:  0
Modified annotations have been written to ../results/empty/231sm_Low_Resource_KBP_master_README.md.tsv


## Sanity Checks

In [9]:
# Verify
def verify(ref_doc, predicted_doc):
    """Assert that a predicted document differs from its reference only in annotations.

    The text, the number of sentences and tokens, and every individual sentence
    and token must be equal; otherwise an ``AssertionError`` is raised. On success
    the number of predicted annotations is printed, followed by the last
    predicted annotation if there is one.
    """
    assert ref_doc.text == predicted_doc.text, 'content changed'

    # Compare the counts first so the pairwise comparison below covers every element.
    for attribute, message in (('sentences', 'sentences changed'), ('tokens', 'tokens changed')):
        assert len(getattr(ref_doc, attribute)) == len(getattr(predicted_doc, attribute)), message

    for ref_sentence, pred_sentence in zip(ref_doc.sentences, predicted_doc.sentences):
        assert ref_sentence == pred_sentence, f'sentence changed, \n{ref_sentence}\n{pred_sentence}'

    for ref_token, pred_token in zip(ref_doc.tokens, predicted_doc.tokens):
        assert ref_token == pred_token, f'token changed: \n{ref_token}\n{pred_token}'

    predicted_annotations = predicted_doc.annotations
    print(f"Predicted {len(predicted_annotations)} annotations")
    if len(predicted_annotations) == 0:
        return
    print(predicted_annotations[-1])

# Check that both generated outputs kept the text, sentences and tokens of every training file.
for tsv_file_name in sorted(os.listdir('../data/train')):
    # Only WebAnno TSV files can be parsed; skip any other entry of the folder.
    if not tsv_file_name.endswith('.tsv'):
        continue
    ref_doc = webanno_tsv_read_file(f'../data/train/{tsv_file_name}')
    dummy_predicted_doc = webanno_tsv_read_file(f'../results/dummy/{tsv_file_name}')
    empty_predicted_doc = webanno_tsv_read_file(f'../results/empty/{tsv_file_name}')
    verify(ref_doc, dummy_predicted_doc)
    verify(ref_doc, empty_predicted_doc)


Predicted 44 annotations
Annotation(tokens=[Token(sentence_idx=20, idx=98, start=9756, end=9769, text='International'), Token(sentence_idx=20, idx=99, start=9770, end=9775, text='Joint'), Token(sentence_idx=20, idx=100, start=9776, end=9786, text='Conference'), Token(sentence_idx=20, idx=101, start=9787, end=9789, text='on'), Token(sentence_idx=20, idx=102, start=9790, end=9797, text='Natural'), Token(sentence_idx=20, idx=103, start=9798, end=9806, text='Language'), Token(sentence_idx=20, idx=104, start=9807, end=9817, text='Processing')], layer='de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity', field='value', label='CONFERENCE', label_id=14)
Predicted 0 annotations
Predicted 7 annotations
Annotation(tokens=[Token(sentence_idx=1, idx=105, start=475, end=482, text='TAC-KBP'), Token(sentence_idx=1, idx=106, start=482, end=483, text='-'), Token(sentence_idx=1, idx=107, start=483, end=487, text='2017'), Token(sentence_idx=1, idx=108, start=488, end=493, text='Event'), Token(sentence

## Try BIO

In [14]:
from typing import List, Union
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2


# Entity types for which character-level BIO tag sequences are built, in this order.
LABELS = [
    'CONFERENCE', 'DATASET', 'EVALMETRIC', 'LICENSE', 'ONTOLOGY',
    'PROGLANG', 'PROJECT', 'PUBLICATION', 'SOFTWARE', 'WORKSHOP',
]


def to_char_bio(src_path: str, ref_path: str) -> List[List[str]]:
    """Convert the annotations of a WebAnno TSV file into character-level BIO tags.

    One tag sequence is built per entry of ``LABELS``. For a given label, only the
    characters of sentences that contain a *reference* annotation of that label
    are evaluated; all other characters are dropped. The evaluated characters
    default to ``'O'`` and are overwritten with ``'B-<label>'`` / ``'I-<label>'``
    where the *source* document has an annotation of that label.

    Because the kept characters depend only on the reference annotations, calling
    this function for a reference file and for a prediction file with the same
    tokenisation yields sequences of equal length, label by label.

    Args:
        src_path: WebAnno TSV file whose annotations are converted to tags.
        ref_path: WebAnno TSV file whose annotations select the evaluated sentences.

    Returns:
        A list with one tag sequence per label that has at least one evaluated
        character, in the order of ``LABELS``.
    """
    ref_doc = webanno_tsv_read_file(ref_path)
    doc = webanno_tsv_read_file(src_path)

    bio_tags_list = []
    for target_label in LABELS:
        # '#' marks characters outside the evaluated sentences; they are dropped at the end.
        bio_tags = ['#'] * len(doc.text)

        # Select the sentences holding a reference annotation of this label and default them to 'O'.
        for annotation in ref_doc.annotations:
            if annotation.label != target_label:
                continue
            for sentence in doc.annotation_sentences(annotation):
                tokens = doc.sentence_tokens(sentence)
                if not tokens:
                    # A sentence without tokens has no character span to select.
                    continue
                start_char, end_char = tokens[0].start, tokens[-1].end
                bio_tags[start_char:end_char] = ['O'] * (end_char - start_char)

        # Overlay the source annotations of this label on the selected characters.
        for annotation in doc.annotations:
            label = annotation.label
            if label != target_label:
                continue

            start_char = annotation.tokens[0].start
            end_char = annotation.tokens[-1].end

            # Sanity check: the annotated text must match the reference text at the same offsets.
            ref_text = ref_doc.text[start_char:end_char]
            if ref_text != annotation.text:
                msg = f"ERROR: src: {src_path}, annotated '{annotation.text}', text: '{ref_text}'"
                print(msg)

            for i in range(start_char, end_char):
                if bio_tags[i] == '#':
                    # Outside the evaluated sentences: tagging this character would keep it
                    # in the output and break the alignment with the reference sequence.
                    continue
                if i == start_char:
                    # Keep an existing inside tag when this span starts within another entity.
                    if not bio_tags[i].startswith('I-'):
                        bio_tags[i] = f'B-{label}'  # Beginning of the entity
                else:
                    bio_tags[i] = f'I-{label}'  # Inside the entity

        # Drop the characters of unselected sentences.
        bio_tags = [tag for tag in bio_tags if tag != '#']
        if len(bio_tags) > 0:
            bio_tags_list.append(bio_tags)

    return bio_tags_list

In [17]:
ref_dir = '../data/train'
pred_dir = '../results/dummy'


def _list_tsv_files(directory):
    """Return the sorted names of the regular ``.tsv`` files directly inside ``directory``."""
    return sorted(
        name for name in os.listdir(directory)
        if name.endswith('.tsv') and os.path.isfile(os.path.join(directory, name))
    )


ref_file_names = _list_tsv_files(ref_dir)

# Reference tags: each reference file is used as both the source and the reference.
all_ref_bio_tags_list = []
for ref_file_name in ref_file_names:
    src_path = os.path.join(ref_dir, ref_file_name)
    ref_path = src_path
    all_ref_bio_tags_list.append(to_char_bio(src_path, ref_path))

pred_file_names = _list_tsv_files(pred_dir)

# Predicted tags: iterate over the reference file names so both lists stay aligned file by file.
all_pred_bio_tags_list = []
for idx, ref_file_name in enumerate(ref_file_names):
    try:
        src_path = os.path.join(pred_dir, ref_file_name)
        ref_path = os.path.join(ref_dir, ref_file_name)
        all_pred_bio_tags_list.append(to_char_bio(src_path, ref_path))
    except FileNotFoundError:
        # No prediction for this file: predict 'O' everywhere, mirroring the reference shape.
        pred = [['O'] * len(ref_tags) for ref_tags in all_ref_bio_tags_list[idx]]
        print(f"WARN: {ref_file_name} is missing, fill 'O' list as default prediction")
        all_pred_bio_tags_list.append(pred)

# Sanity checking: reference and prediction must have the same number of per-label
# sequences (zip alone would silently drop the surplus ones) and equal sequence lengths.
for ref_file_name, ref_list, pred_list in zip(ref_file_names, all_ref_bio_tags_list, all_pred_bio_tags_list):
    assert len(ref_list) == len(pred_list), (
        f'{ref_file_name}: {len(ref_list)} reference tag sequences but {len(pred_list)} predicted ones'
    )
    for ref, pred in zip(ref_list, pred_list):
        assert len(ref) == len(pred), (
            f'{ref_file_name}: tag sequence lengths differ ({len(ref)} vs {len(pred)})'
        )