In [44]:
import re
import os
import hashlib
import glob
import cleaner
from collections import defaultdict
from webanno_tsv import webanno_tsv_read_file, Token
from typing import List, Optional
import utils
from predictor import LABELS

In [None]:
# Updated matching

In [None]:
def extract_annotation_labels_if_possible(predicted_text, original_text, offset=0, labels=None):
    """Map ``<LABEL>...</LABEL>`` tags in a prediction onto spans of the original text.

    Every tagged snippet found in ``predicted_text`` is searched for verbatim in
    ``original_text``. The first occurrence that does not overlap a span already
    claimed by an earlier tag is recorded, so repeated mentions of the same
    snippet are assigned to successive occurrences.

    Args:
        predicted_text: Text containing inline tags such as ``<SOFTWARE>x</SOFTWARE>``.
            Tag names are matched case-insensitively.
        original_text: Untagged text in which the tagged snippets are located.
        offset: Value added to every start/end position that is reported.
        labels: Iterable of tag names to recognise. Defaults to the module-level
            ``LABELS`` when omitted.

    Returns:
        A ``defaultdict(list)`` keyed by the upper-cased label. Each value is a
        list of ``{"text", "start", "end"}`` dicts where ``start``/``end``
        already include ``offset``.
    """
    label_to_text_list = defaultdict(list)

    if labels is None:
        labels = LABELS
    if not labels:
        # An empty alternation would turn the pattern into ``<()>...</>``.
        return label_to_text_list

    # Escape each label so regex metacharacters in a label name are matched literally.
    all_labels = "|".join(re.escape(label) for label in labels)
    tag_pattern = re.compile(
        fr'<({all_labels})>\s*(.*?)\s*</\1>',
        flags=re.IGNORECASE | re.DOTALL
    )

    used_spans = set()
    for match in tag_pattern.finditer(predicted_text):
        label = match.group(1).upper()
        content = match.group(2).strip()
        if not content:
            # An empty tag would "match" at position 0 and yield a zero-length span.
            continue

        found = False
        search_from = 0
        while True:
            position = original_text.find(content, search_from)
            if position == -1:
                break
            span = (position + offset, position + len(content) + offset)

            # Skip occurrences that overlap a span claimed by an earlier tag.
            if any(s < span[1] and e > span[0] for s, e in used_spans):
                search_from = position + len(content)
                continue

            label_to_text_list[label].append({
                "text": content,
                "start": span[0],
                "end": span[1]
            })
            used_spans.add(span)
            found = True
            break

        if not found:
            print(f"Warning: '{content}' not found in original_text.")

    return label_to_text_list


def post_process(predicted_text, original_text, tokens, offset):
    """Clean a raw prediction and extract its labelled spans.

    The prediction is first passed through ``cleaner.Cleaner(...).clean()``;
    the cleaned text is then handed to ``extract_annotation_labels_if_possible``
    together with ``original_text`` and ``offset``.

    ``tokens`` is accepted for interface compatibility but is not used here.

    Returns:
        Whatever ``extract_annotation_labels_if_possible`` returns for the
        cleaned prediction.
    """
    sanitized_prediction = cleaner.Cleaner(predicted_text).clean()
    return extract_annotation_labels_if_possible(
        sanitized_prediction,
        original_text,
        offset,
    )


def make_span_tokens(tokens: List[Token], start_char: int, end_char: int) -> Optional[List[Token]]:
    """Collect the tokens (or token fragments) covering ``[start_char, end_char)``.

    Tokens lying entirely inside the span are returned unchanged. A token that
    only partially overlaps the span is replaced by a new ``Token`` clipped to
    the span, whose ``idx`` is the original index plus a suffix naming the
    side(s) that were cut:

    * ``.L`` - the token starts before the span (left part removed),
    * ``.R`` - the token ends after the span (right part removed),
    * ``.M`` - the token extends past the span on both sides.

    Args:
        tokens: Candidate tokens; their ``start``/``end`` must use the same
            coordinate system as ``start_char``/``end_char``.
        start_char: Inclusive start of the span.
        end_char: Exclusive end of the span.

    Returns:
        The list of covering tokens in input order, or ``None`` when the span
        is empty/inverted or no token overlaps it.
    """
    if start_char >= end_char:
        # An empty or inverted span cannot contain any text.
        return None

    span_tokens = []
    for token in tokens:
        # Skip tokens completely outside the range.
        if token.end <= start_char or token.start >= end_char:
            continue

        cut_left = token.start < start_char
        cut_right = token.end > end_char

        # Token fully inside the span: keep it as is.
        if not cut_left and not cut_right:
            span_tokens.append(token)
            continue

        # Both sides must be checked independently: a token that sticks out on
        # the left AND on the right has to be clipped at both ends, otherwise
        # the fragment would run past end_char.
        if cut_left and cut_right:
            suffix = "M"
        elif cut_left:
            suffix = "L"
        else:
            suffix = "R"

        slice_from = start_char - token.start if cut_left else None
        slice_to = end_char - token.start if cut_right else None
        span_tokens.append(Token(
            idx=f"{token.idx}.{suffix}",
            sentence_idx=token.sentence_idx,
            start=start_char if cut_left else token.start,
            end=end_char if cut_right else token.end,
            text=token.text[slice_from:slice_to]
        ))

    return span_tokens if span_tokens else None

In [None]:
# ## Example:


# # Load and build full text
# org_path = f'../data/test_labeled/ARM-software_keyword-transformer_master_README.md.tsv'
# doc = webanno_tsv_read_file(org_path)
# full_text = doc.text  # This is key: get the original document full text
# annotations = []


# # Start scanning from the beginning
# cursor = 0

# for sentence in doc.sentences:
#     tokens = doc.sentence_tokens(sentence)
#     original_text = sentence.text
#     sid = hashlib.sha256(original_text.encode()).hexdigest()[:8]
#     path = f'../results/deepseek-chat/prompt-0/zzz_ARM-software_keyword-transformer_master_README.md.tsv'

#     with open(f'{path}/{sid}.txt', 'r') as fd:
#         predicted_text = fd.read()

#     # Find true position of this sentence in the original text
#     offset = full_text.find(original_text, cursor)
#     if offset == -1:
#         print(f"Error: sentence not found in full_text")
#         continue

#     # Update cursor for next search to avoid matching same sentence again
#     cursor = offset + len(original_text)

#     label_to_text_list = post_process(predicted_text, original_text, tokens, offset)
#     print(label_to_text_list)
#     span_tokens_to_label_list = []
#     for label, text_list in label_to_text_list.items():
#         for text in text_list:
#             span_tokens_to_label_list.append({
#                 'span_tokens': make_span_tokens(tokens, text['start'], text['end']), # The problem is here: make_span_tokens func 
#                 'label': label
#             })
#     print(span_tokens_to_label_list)
#     for span_tokens_to_label in span_tokens_to_label_list:
#         span_tokens = span_tokens_to_label['span_tokens']
#         label = span_tokens_to_label['label']
#         if span_tokens is None:
#             continue
#         annotation = utils.make_annotation(tokens=span_tokens, label=label)
#         annotations.append(annotation)
# predicted_doc = utils.replace_webanno_annotations(doc, annotations=annotations)
# # Verify
# if doc.text != predicted_doc.text:
#     #logging.warning('content changed')
#     pass
# if len(doc.sentences) == len(predicted_doc.sentences):
#     #logging.warning('sentences changed')
#     pass
# if len(doc.tokens) == len(predicted_doc.tokens):
#     #logging.warning('tokens changed')
#     pass
# for s1, s2 in zip(doc.sentences, predicted_doc.sentences):
#     if s1 == s2:
#         #logging.warning(f'sentence changed, \n{s1}\n{s2}')
#         pass

# for t1, t2 in zip(doc.tokens, predicted_doc.tokens):
#     if t1 == t2:
#         #logging.warning(f'token changed: \n{t1}\n{t2}')
#         pass

# with open("ARM-software_keyword-transformer_master_README.md.tsv", 'w') as fd:
#     fd.write(predicted_doc.tsv())


defaultdict(<class 'list'>, {'SOFTWARE': [{'text': 'Keyword Transformer', 'start': 2, 'end': 21}], 'PUBLICATION': [{'text': 'Keyword Transformer: A Self-Attention Model for Keyword Spotting', 'start': 163, 'end': 227}], 'CONFERENCE': [{'text': 'Interspeech 2021', 'start': 277, 'end': 293}]})
[{'span_tokens': [Token(sentence_idx=1, idx=2, start=2, end=9, text='Keyword'), Token(sentence_idx=1, idx=3, start=10, end=21, text='Transformer')], 'label': 'SOFTWARE'}, {'span_tokens': [Token(sentence_idx=1, idx=39, start=163, end=170, text='Keyword'), Token(sentence_idx=1, idx=40, start=171, end=182, text='Transformer'), Token(sentence_idx=1, idx=41, start=182, end=183, text=':'), Token(sentence_idx=1, idx=42, start=184, end=185, text='A'), Token(sentence_idx=1, idx=43, start=186, end=200, text='Self-Attention'), Token(sentence_idx=1, idx=44, start=201, end=206, text='Model'), Token(sentence_idx=1, idx=45, start=207, end=210, text='for'), Token(sentence_idx=1, idx=46, start=211, end=218, text='K

In [52]:

# Traverse all .tsv files in the input folder, attach the predicted
# annotations to each document and write the result as a new WebAnno TSV file.
ref_dir = '../data/test_labeled'
output_dir = '../results/deepseek-chat/test_unlabeled_up'
# Sorted so that files are processed in a reproducible order.
tsv_files = sorted(glob.glob(os.path.join(ref_dir, '*.tsv')))

# Make sure the destination exists before the first write.
os.makedirs(output_dir, exist_ok=True)

for org_path in tsv_files:
    print(f"Processing file: {org_path}")
    doc = webanno_tsv_read_file(org_path)
    full_text = doc.text
    annotations = []
    cursor = 0

    # Assuming each tsv has a corresponding folder in results with the same name (excluding extension).
    # Computed once per file (not per sentence) so the output path below is
    # always defined, even for a document without sentences.
    file_base = os.path.splitext(os.path.basename(org_path))[0]
    src_path = f'../results/deepseek-chat/prompt-0/zzz_{file_base}.tsv'

    for sentence in doc.sentences:
        tokens = doc.sentence_tokens(sentence)
        original_text = sentence.text

        # Locate the sentence in the document and advance the cursor BEFORE any
        # early `continue`, so a later identical sentence is not resolved to
        # this sentence's position when this one has no prediction.
        offset = full_text.find(original_text, cursor)
        if offset == -1:
            print(f"Error: sentence not found in full_text: {original_text[:30]}...")
            continue
        cursor = offset + len(original_text)

        # Prediction files are named after a short hash of the sentence text.
        sid = hashlib.sha256(original_text.encode()).hexdigest()[:8]
        pred_path = os.path.join(src_path, f'{sid}.txt')
        if not os.path.exists(pred_path):
            print(f"Prediction not found: {pred_path}")
            continue

        # NOTE(review): read with the platform default encoding, as before;
        # confirm how the prediction files were written before pinning one.
        with open(pred_path, 'r') as fd:
            predicted_text = fd.read()

        label_to_text_list = post_process(predicted_text, original_text, tokens, offset)

        for label, text_list in label_to_text_list.items():
            for text in text_list:
                span_tokens = make_span_tokens(tokens, text['start'], text['end'])
                if span_tokens is None:
                    # No token of this sentence overlaps the predicted span.
                    continue
                annotations.append(utils.make_annotation(tokens=span_tokens, label=label))

    predicted_doc = utils.replace_webanno_annotations(doc, annotations=annotations)

    # Verification: only the annotations are expected to differ from the input.
    if doc.text != predicted_doc.text:
        print(f"Warning: content changed: {org_path}")
    if len(doc.sentences) != len(predicted_doc.sentences):
        print(f"Warning: number of sentences changed: {org_path}")
    elif any(s1 != s2 for s1, s2 in zip(doc.sentences, predicted_doc.sentences)):
        print(f"Warning: sentences changed: {org_path}")
    if len(doc.tokens) != len(predicted_doc.tokens):
        print(f"Warning: number of tokens changed: {org_path}")
    elif any(t1 != t2 for t1, t2 in zip(doc.tokens, predicted_doc.tokens)):
        print(f"Warning: tokens changed: {org_path}")

    output_path = f"../results/deepseek-chat/test_unlabeled_up/{file_base}.tsv"
    # Explicit UTF-8 so non-ASCII text survives regardless of the platform default.
    with open(output_path, 'w', encoding='utf-8') as fd:
        fd.write(predicted_doc.tsv())
    print(f"Saved predicted file to: {output_path}")


Processing file: ../data/test_labeled\231sm_Low_Resource_KBP_master_README.md.tsv
Saved predicted file to: ../results/deepseek-chat/test_unlabeled_up/231sm_Low_Resource_KBP_master_README.md.tsv
Processing file: ../data/test_labeled\allenai_aspire_main_README.md.tsv
Saved predicted file to: ../results/deepseek-chat/test_unlabeled_up/allenai_aspire_main_README.md.tsv
Processing file: ../data/test_labeled\alpiges_LinConGauss_master_README.md.tsv
Saved predicted file to: ../results/deepseek-chat/test_unlabeled_up/alpiges_LinConGauss_master_README.md.tsv
Processing file: ../data/test_labeled\anonymous-submission-22_dejavu_master_README.md.tsv
Saved predicted file to: ../results/deepseek-chat/test_unlabeled_up/anonymous-submission-22_dejavu_master_README.md.tsv
Processing file: ../data/test_labeled\ARM-software_keyword-transformer_master_README.md.tsv
Saved predicted file to: ../results/deepseek-chat/test_unlabeled_up/ARM-software_keyword-transformer_master_README.md.tsv
Processing file: ../

# Scoring