## Read/Write with zongxiong's webanno parser

In [42]:
! pip install seqeval



In [43]:
from webanno_tsv import webanno_tsv_read_file, webanno_tsv_write, Annotation
from dataclasses import dataclass, replace
import os
import utils

In [44]:
# Step 1: Read the reference WebAnno TSV file into `ref_doc`.
file_name = '231sm_Low_Resource_KBP_master_README.md.tsv'
input_file_path = f'../data/train/{file_name}'  # Point this at the WebAnno TSV file to load
ref_doc = webanno_tsv_read_file(input_file_path)
print('number of ref doc annotations: ', len(ref_doc.annotations))
# Step 2 (performed in the next cell): add a new annotation to the document.
# The two commented-out lines below show how the layer could be read from an
# existing annotation, together with 'value' as the field name.
# NOTE(review): an earlier version of this comment assumed layer 'NER' and
# field 'label'; the commented code uses the first annotation's layer and
# the field 'value' instead -- confirm which one is intended.
# layer  = ref_doc.annotations[0].layer
# field_name = 'value'

number of ref doc annotations:  40


In [45]:
# Example: add one new annotation that spans several tokens.
# Select the tokens to annotate: here the first 18 tokens of the document.
tokens_to_annotate = ref_doc.tokens[0:18]

# Create a new 'SOFTWARE' annotation over the selected tokens.
new_annotation = utils.make_annotation(tokens=tokens_to_annotate, label='SOFTWARE')

# Derive the predicted document from the reference document: keep all of its
# annotations and append the new one. `ref_doc` is used on purpose; no
# top-level `doc` is defined at this point of the notebook.
predicted_doc = utils.replace_webanno_annotations(
    ref_doc,
    annotations=[*ref_doc.annotations, new_annotation],
)
print('number of ref doc annotations: ', len(ref_doc.annotations))
print('number of predicted doc annotations: ', len(predicted_doc.annotations))

# Alternative way to build the document:
# predicted_doc = utils.make_webanno_document(ref_doc.sentences, ref_doc.tokens, [*ref_doc.annotations, new_annotation])

# Step 3: Write the modified document to a new TSV file.
output_file_path = f'../results/{file_name}'  # Replace with the desired output file path
# Create the results folder first so open() does not fail when it is missing.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write(predicted_doc.tsv())


print(f"Modified annotations have been written to {output_file_path}")

number of ref doc annotations:  40
number of predicted doc annotations:  41
Modified annotations have been written to ../results/231sm_Low_Resource_KBP_master_README.md.tsv


In [47]:
# Verify
def verify(ref_doc, predicted_doc):
    """Check that ``predicted_doc`` has the same content as ``ref_doc``.

    The text, the sentences and the tokens of both documents are compared;
    the annotations themselves are not. On success a short summary of the
    predicted annotations is printed.

    Args:
        ref_doc: Reference document; must expose ``text``, ``sentences`` and
            ``tokens``.
        predicted_doc: Document to check; must additionally expose
            ``annotations``.

    Raises:
        AssertionError: If the text, the number of sentences or tokens, or
            any individual sentence or token differs.
    """
    # Raise explicitly instead of using ``assert`` statements so the checks
    # are not stripped when Python runs with -O.
    if not (ref_doc.text == predicted_doc.text):
        raise AssertionError('content changed')
    if not (len(ref_doc.sentences) == len(predicted_doc.sentences)):
        raise AssertionError('sentences changed')
    if not (len(ref_doc.tokens) == len(predicted_doc.tokens)):
        raise AssertionError('tokens changed')

    # Lengths are known to match here, so zip() does not drop any element.
    for s1, s2 in zip(ref_doc.sentences, predicted_doc.sentences):
        if not (s1 == s2):
            raise AssertionError(f'sentence changed, \n{s1}\n{s2}')

    for t1, t2 in zip(ref_doc.tokens, predicted_doc.tokens):
        if not (t1 == t2):
            raise AssertionError(f'token changed: \n{t1}\n{t2}')

    print(f"Predicted {len(predicted_doc.annotations)} annotations")
    # Show the last annotation as a sample; a document without annotations
    # has none to show and must not raise an IndexError.
    if predicted_doc.annotations:
        print(predicted_doc.annotations[-1])

# Compare against `ref_doc`, the document `predicted_doc` was derived from
# (no top-level `doc` is defined in this notebook).
verify(ref_doc, predicted_doc)

Predicted 41 annotations
Annotation(tokens=[Token(sentence_idx=1, idx=1, start=0, end=1, text='#'), Token(sentence_idx=1, idx=2, start=2, end=18, text='Low_Resource_KBP'), Token(sentence_idx=1, idx=3, start=19, end=28, text='knowledge'), Token(sentence_idx=1, idx=4, start=29, end=34, text='graph'), Token(sentence_idx=1, idx=5, start=35, end=45, text='population'), Token(sentence_idx=1, idx=6, start=46, end=48, text='in'), Token(sentence_idx=1, idx=7, start=49, end=52, text='low'), Token(sentence_idx=1, idx=8, start=53, end=61, text='resource'), Token(sentence_idx=1, idx=9, start=62, end=72, text='conditions'), Token(sentence_idx=1, idx=10, start=75, end=78, text='The'), Token(sentence_idx=1, idx=11, start=79, end=83, text='file'), Token(sentence_idx=1, idx=12, start=84, end=85, text='"'), Token(sentence_idx=1, idx=13, start=85, end=86, text='*'), Token(sentence_idx=1, idx=14, start=86, end=106, text='Few-Shot_ED.json.zip'), Token(sentence_idx=1, idx=14, start=86, end=97, text='Few-Shot

In [48]:
# Show the layer definitions of the reference document (`doc` is not defined
# at top level, so the loaded `ref_doc` is inspected instead).
print(ref_doc.layer_defs)

[('de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity', ['identifier', 'value'])]


In [49]:
# Show the first annotation of the reference document (`doc` is not defined
# at top level, so the loaded `ref_doc` is inspected instead).
print(ref_doc.annotations[0])

Annotation(tokens=[Token(sentence_idx=1, idx=1, start=0, end=1, text='#'), Token(sentence_idx=1, idx=2, start=2, end=18, text='Low_Resource_KBP'), Token(sentence_idx=1, idx=3, start=19, end=28, text='knowledge'), Token(sentence_idx=1, idx=4, start=29, end=34, text='graph'), Token(sentence_idx=1, idx=5, start=35, end=45, text='population'), Token(sentence_idx=1, idx=6, start=46, end=48, text='in'), Token(sentence_idx=1, idx=7, start=49, end=52, text='low'), Token(sentence_idx=1, idx=8, start=53, end=61, text='resource'), Token(sentence_idx=1, idx=9, start=62, end=72, text='conditions'), Token(sentence_idx=1, idx=10, start=75, end=78, text='The'), Token(sentence_idx=1, idx=11, start=79, end=83, text='file'), Token(sentence_idx=1, idx=12, start=84, end=85, text='"'), Token(sentence_idx=1, idx=13, start=85, end=86, text='*'), Token(sentence_idx=1, idx=14, start=86, end=106, text='Few-Shot_ED.json.zip'), Token(sentence_idx=1, idx=14, start=86, end=97, text='Few-Shot_ED'), Token(sentence_idx

In [50]:
def rm_annotations_in_files(folder_path: str = "../data/") -> None:
    """Write annotation-free copies of the WebAnno TSV files in a folder.

    Every ``*.tsv`` entry directly inside ``folder_path`` is read, its
    annotations are replaced by an empty list, and the result is written
    under the same file name into the ``test`` sub-folder of ``folder_path``.
    The sub-folder is created when it does not exist yet.

    Args:
        folder_path: Folder that contains the ``.tsv`` files.
    """
    # os.path.join copes with folder paths with or without a trailing slash.
    output_path = os.path.join(folder_path, "test")
    os.makedirs(output_path, exist_ok=True)

    # Sorted so files are processed in a reproducible order.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.tsv'):
            continue
        exported_doc = webanno_tsv_read_file(os.path.join(folder_path, file_name))
        converted_doc = replace(exported_doc, annotations=[])
        with open(os.path.join(output_path, file_name), "w", encoding="utf-8") as f:
            f.write(converted_doc.tsv())
            
# Strip the annotations from the .tsv files of the default folder ("../data/").
rm_annotations_in_files()

In [51]:
# All 10 entity types, one per line.
label_list = [
    'CONFERENCE',
    'DATASET',
    'EVALMETRIC',
    'LICENSE',
    'ONTOLOGY',
    'PROGLANG',
    'PROJECT',
    'PUBLICATION',
    'SOFTWARE',
    'WORKSHOP',
]
def dummy_whole_sent(folder_path="../data/", labels=("DATASET",)):
    """Write dummy predictions that annotate every whole sentence.

    For each ``*.tsv`` entry directly inside ``folder_path`` the document is
    read and its annotations are replaced by one annotation per sentence and
    per label in ``labels``, each spanning all tokens of that sentence. The
    result is written under the same file name into the ``pred`` sub-folder
    of ``folder_path`` (created when missing), read back, and checked against
    the input document with ``verify``.

    Args:
        folder_path: Folder that contains the ``.tsv`` files.
        labels: Labels assigned to every sentence; by default only
            ``"DATASET"``.
    """
    # os.path.join keeps the output inside folder_path even when the caller
    # passes the folder without a trailing slash.
    output_path = os.path.join(folder_path, "pred")
    os.makedirs(output_path, exist_ok=True)  # Create output folder if it doesn't exist

    # Iterate through files in the folder
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".tsv"):
            continue

        # Read the WebAnno TSV file
        doc = webanno_tsv_read_file(os.path.join(folder_path, file_name))

        # The layer of the new annotations is copied from the first existing
        # annotation, so a file without annotations cannot be processed.
        if not doc.annotations:
            print(f"WARN: {file_name} has no annotations to take the layer from, skipped")
            continue
        layer = doc.annotations[0].layer

        # Create dummy annotations for every sentence and every label.
        # NOTE: sentences without tokens are not skipped.
        new_annotations = []
        for sentence in doc.sentences:
            sentence_tokens = doc.sentence_tokens(sentence)
            for lb in labels:
                new_annotations.append(
                    Annotation(
                        tokens=sentence_tokens,
                        layer=layer,      # same layer as the first existing annotation
                        field="value",
                        label=lb,
                        label_id=-1,      # dummy label ID
                    )
                )

        # Create a new document with the dummy annotations
        new_doc = replace(doc, annotations=new_annotations)
        output_file_path = os.path.join(output_path, file_name)
        # Write the predictions to a WebAnno TSV file
        with open(output_file_path, "w", encoding="utf-8") as f:
            f.write(new_doc.tsv())
        print(f"Predictions written to {output_file_path}")

        # Read the file back to make sure text, sentences and tokens survived.
        written_doc = webanno_tsv_read_file(output_file_path)
        verify(doc, written_doc)
            
# Run the function on the default folder ("../data/")
dummy_whole_sent()

## Try BIO

In [52]:
from typing import List, Union
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2


# Entity labels, in alphabetical order.
LABELS = [
    'CONFERENCE', 'DATASET', 'EVALMETRIC', 'LICENSE', 'ONTOLOGY',
    'PROGLANG', 'PROJECT', 'PUBLICATION', 'SOFTWARE', 'WORKSHOP',
]


def to_char_bio(src_path: str, ref_path: str) -> List[List[str]]:
    """Convert a WebAnno TSV file into character-level BIO tag sequences.

    One tag sequence is built per label in ``LABELS``. For a given label only
    the characters of sentences that contain a reference annotation with that
    label are kept. Inside those sentences every character defaults to
    ``'O'``; characters covered by an annotation of that label in the source
    file become ``'B-<label>'`` (first character) or ``'I-<label>'``.
    Labels for which no sentence is selected contribute no sequence.

    Source annotations that fall outside the selected sentences are ignored,
    so the sequences built for a prediction file have the same number and the
    same lengths as those built for its reference file (given identical text
    and sentence boundaries).

    Args:
        src_path: Path of the WebAnno TSV file whose annotations are tagged.
        ref_path: Path of the reference WebAnno TSV file that selects the
            sentences; may be the same file as ``src_path``.

    Returns:
        The non-empty tag sequences, in ``LABELS`` order.
    """
    ref_doc = webanno_tsv_read_file(ref_path)

    # Parse the WebAnno TSV file
    doc = webanno_tsv_read_file(src_path)

    bio_tags_list = []
    for target_label in LABELS:
        # '#' marks characters that are not part of a selected sentence.
        bio_tags = ['#'] * len(doc.text)

        # Select the sentences holding a reference annotation of this label
        # and reset their characters to 'O'.
        for annotation in ref_doc.annotations:
            if annotation.label != target_label:
                continue
            for sentence in doc.annotation_sentences(annotation):
                tokens = doc.sentence_tokens(sentence)
                if not tokens:
                    continue  # a sentence without tokens covers no characters
                start_char, end_char = tokens[0].start, tokens[-1].end
                bio_tags[start_char:end_char] = ['O'] * (end_char - start_char)

        for annotation in doc.annotations:
            label = annotation.label
            if label != target_label:
                continue

            start_char = annotation.tokens[0].start
            end_char = annotation.tokens[-1].end
            # Sanity check: the annotated text must match the reference text.
            if ref_doc.text[start_char:end_char] != annotation.text:
                msg = f"ERROR: src: {src_path}, annotated '{annotation.text}', text: '{ref_doc.text[start_char:end_char]}'"
                print(msg)

            # Assign BIO tags to the characters of the entity span. Positions
            # still marked '#' are left alone: tagging them would add
            # characters that the reference sequence does not contain.
            first_tag = bio_tags[start_char]
            if first_tag != '#' and not first_tag.startswith('I-'):
                # Not inside another entity: this is the beginning of one.
                bio_tags[start_char] = f'B-{label}'

            for i in range(start_char + 1, end_char):
                if bio_tags[i] != '#':
                    bio_tags[i] = f'I-{label}'  # Inside the entity

        # Drop the characters of unselected sentences.
        bio_tags = [tag for tag in bio_tags if tag != '#']
        if bio_tags:
            bio_tags_list.append(bio_tags)

    return bio_tags_list

In [53]:
ref_dir = '../data/'
pred_dir = '../results/'

# Reference .tsv files, sorted so the processing order is reproducible.
ref_file_names = sorted(
    fp for fp in os.listdir(ref_dir)
    if fp.endswith('.tsv') and os.path.isfile(os.path.join(ref_dir, fp))
)

# Reference tags: every reference file is converted against itself.
all_ref_bio_tags_list = []
for ref_file_name in ref_file_names:
    src_path = os.path.join(ref_dir, ref_file_name)
    ref_path = src_path
    all_ref_bio_tags_list.append(to_char_bio(src_path, ref_path))

# Prediction .tsv files that are present (not used by the loop below, which
# walks the reference file names). A missing prediction folder is treated as
# "no predictions" so the per-file fallback below can still apply.
pred_file_names = sorted(
    fp for fp in (os.listdir(pred_dir) if os.path.isdir(pred_dir) else [])
    if fp.endswith('.tsv') and os.path.isfile(os.path.join(pred_dir, fp))
)

# Prediction tags: one entry per reference file, in the same order.
all_pred_bio_tags_list = []
for idx, ref_file_name in enumerate(ref_file_names):
    src_path = os.path.join(pred_dir, ref_file_name)
    ref_path = os.path.join(ref_dir, ref_file_name)
    try:
        pred = to_char_bio(src_path, ref_path)
    except FileNotFoundError:
        # No prediction for this file: use all-'O' sequences with the same
        # shape as the reference sequences.
        pred = [['O'] * len(ref_tags) for ref_tags in all_ref_bio_tags_list[idx]]
        print(f"WARN: {ref_file_name} is missing, fill 'O' list as default prediction")
    all_pred_bio_tags_list.append(pred)

# Sanity checking
for ref_file_name, ref_list, pred_list in zip(ref_file_names, all_ref_bio_tags_list, all_pred_bio_tags_list):
    # zip() below would silently drop surplus sequences, so compare the
    # number of per-label sequences first.
    assert len(ref_list) == len(pred_list), f'{ref_file_name}: {len(ref_list)} reference sequences vs {len(pred_list)} predicted'
    for ref, pred in zip(ref_list, pred_list):
        # print(len(ref), len(pred))
        assert len(ref) == len(pred), f'{ref_file_name}: sequence lengths differ ({len(ref)} vs {len(pred)})'