In [16]:
import re
import sys
sys.path.append('./readme2kg-exp/src/')
import os
import random
from collections import defaultdict
from termcolor import colored
from functools import partial, reduce
import operator as op
import hashlib
import multiprocessing as mp
import logging

from predictor import BasePredictor, LABELS
from webanno_tsv import webanno_tsv_read_file, Document, Annotation, Token
import utils
import cleaner

In [17]:
phase = 'test_unlabeled'
base_path = f'../data/{phase}'
file_names = [fp for fp in os.listdir(base_path) if os.path.isfile(os.path.join(base_path, fp)) and fp.endswith('.tsv')]
model_name = 'Meta-Llama-3-8B-Instruct'
output_folder = f'../results/{model_name}/{phase}'
os.makedirs(output_folder, exist_ok=True)

In [18]:
print(os.getcwd())
print(output_folder)

d:\KIT2025\GitHub\readme2kg-exp\src
../results/Meta-Llama-3-8B-Instruct/test_unlabeled


In [19]:
prompt_id = 0
prompt_template_path = f'../config/deepseek-chat-prompt-0.txt'
if os.path.isfile(prompt_template_path):
    with open(prompt_template_path, 'r') as fd:
        prompt_template = fd.read()
else:
    prompt_template = ''

print(prompt_template)

**Task:**
You are tasked with performing Named Entity Recognition (NER) on the given text. Follow the guidelines strictly to identify and classify entities into their respective categories. Annotate the entities directly in the original text using XML-style tags. Only return the annotated text in Markdown format—no explanations, introductions, or extra text.


**Guidelines:**

1. **Entity Classes:**
   - **CONFERENCE**: Conference events.
     *Definition*:
     A formal meeting or gathering focused on a particular field of study or topic.
     *Example*:
     `<CONFERENCE>International Semantic Web Conference 2019</CONFERENCE>`
     `<CONFERENCE>ISWC 2019</CONFERENCE>`
     `<CONFERENCE>CVPR2023</CONFERENCE> workshop`

   - **DATASET**: Structured collections of data.
     *Definition*:
     A structured collection of data, organized typically for a specific goal such as analysis, research, or reference.
     *Example*:
     `<DATASET>Maules Creek</DATASET>`
     `Download the <DATASE

# Load Mistral model

In [None]:
import torch
torch.cuda.is_available()
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

model.generation_config.pad_token_id = tokenizer.pad_token_id

sentence_text = """# DejaVu ## Table of Contents =================    * [Code](#code)     * [Install Requirements](#install-requirements)     * [Usage](#usage)     * [Example](#example)   * [Datasets](#datasets)   * [Deployment and Failure Injection Scripts of Train-Ticket](#deployment-and-failure-injection-scripts-of-train-ticket)   * [Citation](#citation)   * [Supplementary details](#supplementary-details)    ## Paper A preprint version: https://arxiv.org/abs/2207.09021 ## Code ### Install 1."""
prompt = prompt_template.replace('{input_text}', sentence_text)
# original code
#prompt = prompt_template.replace('{input_text}', sentence.text)

messages = [
    {"role": "system", "content": "You are a helpful NER annotator."},
    {"role": "user", "content": prompt},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
    
outputs = model.generate(
    input_ids,
    max_new_tokens=255,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id
)
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))

In [22]:
def do_prediction(sentence, tokens, sid_path):
    try:
        print(f"Process-{os.getpid()} processing {colored(sentence.text, 'red')} ...")
        prompt = prompt_template.replace('{input_text}', sentence.text)

        messages = [
            {"role": "system", "content": "You are a helpful NER annotator."},
            {"role": "user", "content": prompt},
        ]        
        
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)
        
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]
            
        outputs = model.generate(
            input_ids,
            max_new_tokens=255,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )
        response = outputs[0][input_ids.shape[-1]:]
        result = tokenizer.decode(response, skip_special_tokens=True)
        
        #print(f"Process-{os.getpid()} predict {colored(sentence.text, 'cyan')} successfully")
        with open(sid_path, 'w') as file:
            file.write(result)
    except Exception as ex:
        logging.error(f'[do_prediction] got exception: {ex}')

In [23]:
def extract_annotation_labels_if_possible(predicted_text):
    label_to_text_list = defaultdict(list)
    acc_adjusted_pos = 0

    matched_labels = {}

    for label in LABELS:
        regex = f'<{label}>(.*?)</{label}>'
        matches = re.finditer(regex, predicted_text, flags=re.IGNORECASE | re.DOTALL)
        for m in matches:
            matched_labels[m.start(1)] = label

    for pos in sorted(matched_labels):
        label = matched_labels[pos]
        regex = f'<{label}>(.*?)</{label}>'
        matches = re.finditer(regex, predicted_text, flags=re.IGNORECASE | re.DOTALL)
        for m in matches:
            adjusted_pos = len(label) + 2
            label_to_text_list[label].append({
                'text': m.group(1),
                'start': m.start(1) - adjusted_pos - acc_adjusted_pos,
                'end': m.end(1) - adjusted_pos - acc_adjusted_pos,
            })
            acc_adjusted_pos += adjusted_pos * 2 + 1

    return label_to_text_list




def post_process(predicted_text, tokens):
    cleaned_text = cleaner.Cleaner(predicted_text).clean()
    label_to_text_list = extract_annotation_labels_if_possible(cleaned_text)
    return label_to_text_list

In [24]:
def predict(sentence, tokens):
    path = f'../results/{model_name}/prompt-{prompt_id}/zzz_{file_name}' # NOTE: prefix zzz for directory sorting, non-sense
    os.makedirs(path, exist_ok=True)
    sid = hashlib.sha256(sentence.text.encode()).hexdigest()[:8]
    if not os.path.isfile(f'{path}/{sid}.txt'):   # original code
    # if os.path.isdir(f'{path}'):
        do_prediction(sentence, tokens, f'{path}/{sid}.txt')

    with open(f'{path}/{sid}.txt', 'r') as fd:
        predicted_text = fd.read()

    label_to_text_list = post_process(predicted_text, tokens)
    # NOTE: sanity checking
    for label, text_list in label_to_text_list.items():
        for text in text_list:
            if text['text'] != sentence.text[text['start']:text['end']]:
                prompt = prompt_template.replace('{input_text}', sentence.text)
                #logging.warning(f"BUG? The predicted text is not exact the same as the original text. \n\nPrompt: {prompt}\nOriginal: {colored(sentence.text, 'green')}\nGenerated: {colored(text['text'], 'red')}\n--------------------------------------------------------------------------------")

    span_tokens_to_label_list = []
    for label, text_list in label_to_text_list.items():
        for text in text_list:
            span_tokens_to_label_list.append({
                'span_tokens': utils.make_span_tokens(tokens, text['start'], text['end']),
                'label': label
            })
    return span_tokens_to_label_list


In [25]:
def call_serial(doc: Document):
    annotations = []
    for sent in doc.sentences:
        tokens = doc.sentence_tokens(sent)
        span_tokens_to_label_list = predict(sentence=sent, tokens=tokens)
        
        # create the annotation instances
        for span_tokens_to_label in span_tokens_to_label_list:
            span_tokens = span_tokens_to_label['span_tokens']
            label = span_tokens_to_label['label']
            if span_tokens is None:
                continue

            annotation = utils.make_annotation(tokens=span_tokens, label=label)
            annotations.append(annotation)

    result = utils.replace_webanno_annotations(doc, annotations=annotations)
    return result

In [26]:
print(output_folder)
print(os.getcwd())

../results/Meta-Llama-3-8B-Instruct/test_unlabeled
d:\KIT2025\GitHub\readme2kg-exp\src


In [27]:
for file_name in file_names:
    file_path = os.path.join(base_path, file_name)
    ref_doc = webanno_tsv_read_file(file_path)
    predicted_doc = call_serial(ref_doc)
    # Verify
    if ref_doc.text != predicted_doc.text:
        #logging.warning('content changed')
        pass
    if len(ref_doc.sentences) == len(predicted_doc.sentences):
        #logging.warning('sentences changed')
        pass
    if len(ref_doc.tokens) == len(predicted_doc.tokens):
        #logging.warning('tokens changed')
        pass
    for s1, s2 in zip(ref_doc.sentences, predicted_doc.sentences):
        if s1 == s2:
            #logging.warning(f'sentence changed, \n{s1}\n{s2}')
            pass

    for t1, t2 in zip(ref_doc.tokens, predicted_doc.tokens):
        if t1 == t2:
            #logging.warning(f'token changed: \n{t1}\n{t2}')
            pass

    logging.warning(f"Predicted {len(predicted_doc.annotations)} annotations")
    prediction_path = os.path.join(output_folder, file_name)
    with open(prediction_path, 'w') as fd:
        fd.write(predicted_doc.tsv())





# Scorer.py

In [28]:
import argparse
import json
import os
from collections import defaultdict
from functools import reduce
from webanno_tsv import webanno_tsv_read_file, Document, Annotation
from typing import List, Union
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

LABELS = [
    'CONFERENCE',
    'DATASET',
    'EVALMETRIC',
    'LICENSE',
    'ONTOLOGY',
    'PROGLANG',
    'PROJECT',
    'PUBLICATION',
    'SOFTWARE',
    'WORKSHOP'
]

def to_char_bio(src_path: str, ref_path: str) -> List[List[str]]:
    ref_doc = webanno_tsv_read_file(ref_path)
    # Parse the WebAnno TSV file
    doc = webanno_tsv_read_file(src_path)
    # Initialize a list to store character-level BIO tags
    bio_tags_list = []
    for target_label in LABELS:
        bio_tags = ['#'] * len(ref_doc.text)  # Default to '#' for all characters
        # Pick interested sentences and default them to 'O'
        for annotation in ref_doc.annotations:
            label = annotation.label
            if label != target_label:
                continue
            sentences = ref_doc.annotation_sentences(annotation)
            for sentence in sentences:
                tokens = ref_doc.sentence_tokens(sentence)
                start_char, end_char = tokens[0].start, tokens[-1].end
                bio_tags[start_char:end_char] = ['O'] * (end_char-start_char)

        for annotation in doc.annotations:
            label = annotation.label
            if label != target_label:
                continue

            start_token, end_token = annotation.tokens[0], annotation.tokens[-1]
            start_char = start_token.start
            end_char = end_token.end
            # Sanity check
            if ref_doc.text[start_char:end_char] != annotation.text:
                msg = f"ERROR: src: {src_path}, annotated '{annotation.text}', text: '{ref_doc.text[start_char:end_char]}'"
                print(msg)

            if 'I-' in bio_tags[start_char]:
                # Overlapping, it's annotated by another annotations, we connect them as one annotations
                pass
            else:
                if bio_tags[start_char] != '#':
                    # Assign BIO tags to characters in the entity span
                    bio_tags[start_char] = f'B-{label}'  # Beginning of the entity

            for i in range(start_char + 1, end_char):
                if bio_tags[i] != '#':
                    bio_tags[i] = f'I-{label}'  # Inside the entity

        # Remove unannotated sentences from bio list.
        bio_tags = [x for x in filter(lambda x: x != '#', bio_tags)]
        bio_tags_list.append(bio_tags)

    return bio_tags_list

In [29]:
def flatten(lst):
    return reduce(lambda x, y: x + y, lst)

In [30]:
import os 

print(os.getcwd())
ref_dir = '../results/Meta-Llama-3-8B-Instruct/test_unlabeled/'
pred_dir = '../results/Meta-Llama-3-8B-Instruct/prompt-0/'
score_dir = '../results/scores/'

os.makedirs(pred_dir, exist_ok=True)
os.makedirs(score_dir, exist_ok=True)

ref_file_names = sorted([fp for fp in os.listdir(ref_dir) if os.path.isfile(f'{ref_dir}/{fp}') and fp.endswith('.tsv')])

if len(ref_file_names) == 0:
    raise Exception("ERROR: No reference files found, configuration error?")

all_ref_bio_tags_list = []
for ref_file_name in ref_file_names:
    src_path = os.path.join(ref_dir, ref_file_name)
    ref_path = src_path
    all_ref_bio_tags_list.append(to_char_bio(src_path, ref_path))

pred_file_names = sorted([fp for fp in os.listdir(pred_dir) if os.path.isfile(f'{pred_dir}/{fp}') and fp.endswith('.tsv')])
all_pred_bio_tags_list = []
for idx, ref_file_name in enumerate(ref_file_names):
    try:
        src_path = os.path.join(pred_dir, ref_file_name)
        ref_path = os.path.join(ref_dir, ref_file_name)
        all_pred_bio_tags_list.append(to_char_bio(src_path, ref_path))
    except FileNotFoundError:
        nbr_labels = len(all_ref_bio_tags_list[idx])
        assert nbr_labels == len(LABELS), "ERROR: reference tags doesn't have ${len(LABELS)} labels."
        pred = []
        for label_idx in range(nbr_labels):
            pred.append(['O'] * len(all_ref_bio_tags_list[idx][label_idx]))

        print(f"WARN: {ref_file_name} is missing, fill 'O' list as default prediction")
        all_pred_bio_tags_list.append(pred)
# Sanity checking
for idx, (ref_list, pred_list) in enumerate(zip(all_ref_bio_tags_list, all_pred_bio_tags_list)):
    for label_idx, (ref, pred) in enumerate(zip(ref_list, pred_list)):
        assert len(ref) == len(pred), f'ERROR: {ref_file_names[idx]}, label: {LABELS[label_idx]}, reference length: {len(ref)}, prediction length: {len(pred)}'

scores = {}
################################################################################
# Consider whole dataset
################################################################################
ref_bio_tags_list = flatten(flatten(all_ref_bio_tags_list))
pred_bio_tags_list = flatten(flatten(all_pred_bio_tags_list))

accuracy = accuracy_score(ref_bio_tags_list, pred_bio_tags_list)
scores['overall_accuracy'] = accuracy
average = 'macro'
ref_bio_tags_list = flatten(flatten(all_ref_bio_tags_list))
pred_bio_tags_list = flatten(flatten(all_pred_bio_tags_list))

f1 = f1_score(ref_bio_tags_list, pred_bio_tags_list, average=average)
precision = precision_score(ref_bio_tags_list, pred_bio_tags_list, average=average)
recall = recall_score(ref_bio_tags_list, pred_bio_tags_list, average=average)
scores[f"overall_{average}_precision"] = precision
scores[f"overall_{average}_recall"] = recall
scores[f"overall_{average}_f1"] = f1


################################################################################
# For each class
################################################################################
label_to_ref_bio_tags_list = defaultdict(list)
label_to_pred_bio_tags_list = defaultdict(list)
for ref_bio_tags_list, pred_bio_tags_list in zip(all_ref_bio_tags_list, all_pred_bio_tags_list):
    if len(ref_bio_tags_list) != len(LABELS):
        print('ERROR: ref bio tags list')
    if len(pred_bio_tags_list) != len(LABELS):
        print('ERROR: pred bio tags list')

    for label, ref_bio_tags, pred_bio_tags in zip(LABELS, ref_bio_tags_list, pred_bio_tags_list):
        label_to_ref_bio_tags_list[label].extend(ref_bio_tags)
        label_to_pred_bio_tags_list[label].extend(pred_bio_tags)
        if len(label_to_ref_bio_tags_list[label]) != len(label_to_pred_bio_tags_list[label]):
            print('ERROR: label_to_ref_pred_bio_tags')


for label in label_to_ref_bio_tags_list.keys():
    ref_bio_tags_list = label_to_ref_bio_tags_list[label]
    pred_bio_tags_list = label_to_pred_bio_tags_list[label]
    accuracy = accuracy_score(ref_bio_tags_list, pred_bio_tags_list)
    f1 = f1_score(ref_bio_tags_list, pred_bio_tags_list, average=average)
    precision = precision_score(ref_bio_tags_list, pred_bio_tags_list, average=average)
    recall = recall_score(ref_bio_tags_list, pred_bio_tags_list, average=average)
    scores[f"{label}_{average}_precision"] = precision
    scores[f"{label}_{average}_recall"] = recall
    scores[f"{label}_{average}_f1"] = f1

print("Scores:\n", json.dumps(scores, indent=2))

with open(os.path.join(score_dir, 'Meta-Llama-3-8B-Instruct-scores.json'), 'w') as fd:
    json.dump(scores, fd, indent=2)

d:\KIT2025\GitHub\readme2kg-exp\src
WARN: 231sm_Low_Resource_KBP_master_README.md.tsv is missing, fill 'O' list as default prediction
WARN: ARM-software_keyword-transformer_master_README.md.tsv is missing, fill 'O' list as default prediction
WARN: Cardio-AI_3d-mri-domain-adaptation_master_README.md.tsv is missing, fill 'O' list as default prediction
WARN: ChopinSharp_ref-nms_main_README.md.tsv is missing, fill 'O' list as default prediction
WARN: James-Durant_fisher-information_main_README.md.tsv is missing, fill 'O' list as default prediction
WARN: MELALab_nela-gt-2019_master_README.md.tsv is missing, fill 'O' list as default prediction
WARN: allenai_aspire_main_README.md.tsv is missing, fill 'O' list as default prediction
WARN: alpiges_LinConGauss_master_README.md.tsv is missing, fill 'O' list as default prediction
WARN: anonymous-submission-22_dejavu_master_README.md.tsv is missing, fill 'O' list as default prediction
WARN: aspiaspace_earthpt_main_README.md.tsv is missing, fill 'O' 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Scores:
 {
  "overall_accuracy": 0.9559659090909091,
  "overall_macro_precision": 0.19119318181818182,
  "overall_macro_recall": 0.2,
  "overall_macro_f1": 0.1954974582425563,
  "CONFERENCE_macro_precision": NaN,
  "CONFERENCE_macro_recall": NaN,
  "CONFERENCE_macro_f1": NaN,
  "DATASET_macro_precision": 0.3258739213105163,
  "DATASET_macro_recall": 0.3333333333333333,
  "DATASET_macro_f1": 0.3295614229716737,
  "EVALMETRIC_macro_precision": NaN,
  "EVALMETRIC_macro_recall": NaN,
  "EVALMETRIC_macro_f1": NaN,
  "LICENSE_macro_precision": NaN,
  "LICENSE_macro_recall": NaN,
  "LICENSE_macro_f1": NaN,
  "ONTOLOGY_macro_precision": NaN,
  "ONTOLOGY_macro_recall": NaN,
  "ONTOLOGY_macro_f1": NaN,
  "PROGLANG_macro_precision": NaN,
  "PROGLANG_macro_recall": NaN,
  "PROGLANG_macro_f1": NaN,
  "PROJECT_macro_precision": NaN,
  "PROJECT_macro_recall": NaN,
  "PROJECT_macro_f1": NaN,
  "PUBLICATION_macro_precision": 0.3144163875289874,
  "PUBLICATION_macro_recall": 0.3333333333333333,
  "PUBLI

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
