In [12]:
import re
import sys
sys.path.append('./readme2kg-exp/src/')
import os
import random
from collections import defaultdict
from termcolor import colored
from functools import partial, reduce
import operator as op
import hashlib
import multiprocessing as mp
import logging

from predictor import BasePredictor, LABELS
from webanno_tsv import webanno_tsv_read_file, Document, Annotation, Token
import utils
import cleaner

In [2]:
phase = 'test_unlabeled'
base_path = f'./readme2kg-exp/data/{phase}'
file_names = [fp for fp in os.listdir(base_path) if os.path.isfile(os.path.join(base_path, fp)) and fp.endswith('.tsv')]
model_name = 'Meta-Llama-3-8B-Instruct'
output_folder = f'./readme2kg-exp/results/{model_name}/{phase}'
os.makedirs(output_folder, exist_ok=True)

In [19]:
prompt_id = 0
prompt_template_path = f'./readme2kg-exp/config/deepseek-chat-prompt-0.txt'
if os.path.isfile(prompt_template_path):
    with open(prompt_template_path, 'r') as fd:
        prompt_template = fd.read()
else:
    prompt_template = ''

print(prompt_template)

**Task:**
You are tasked with performing Named Entity Recognition (NER) on the given text. Follow the guidelines strictly to identify and classify entities into their respective categories. Annotate the entities directly in the original text using XML-style tags. Only return the annotated text in Markdown format—no explanations, introductions, or extra text.


**Guidelines:**

1. **Entity Classes:**
   - **CONFERENCE**: Conference events.
     *Definition*:
     A formal meeting or gathering focused on a particular field of study or topic.
     *Example*:
     `<CONFERENCE>International Semantic Web Conference 2019</CONFERENCE>`
     `<CONFERENCE>ISWC 2019</CONFERENCE>`
     `<CONFERENCE>CVPR2023</CONFERENCE> workshop`

   - **DATASET**: Structured collections of data.
     *Definition*:
     A structured collection of data, organized typically for a specific goal such as analysis, research, or reference.
     *Example*:
     `<DATASET>Maules Creek</DATASET>`
     `Download the <DATASE

# Load Mistral model

In [8]:
import torch
torch.cuda.is_available()
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [22]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

model.generation_config.pad_token_id = tokenizer.pad_token_id

sentence_text = """# DejaVu ## Table of Contents =================    * [Code](#code)     * [Install Requirements](#install-requirements)     * [Usage](#usage)     * [Example](#example)   * [Datasets](#datasets)   * [Deployment and Failure Injection Scripts of Train-Ticket](#deployment-and-failure-injection-scripts-of-train-ticket)   * [Citation](#citation)   * [Supplementary details](#supplementary-details)    ## Paper A preprint version: https://arxiv.org/abs/2207.09021 ## Code ### Install 1."""
prompt = prompt_template.replace('{input_text}', sentence_text)
# original code
#prompt = prompt_template.replace('{input_text}', sentence.text)

messages = [
    {"role": "system", "content": "You are a helpful NER annotator."},
    {"role": "user", "content": prompt},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
    
outputs = model.generate(
    input_ids,
    max_new_tokens=255,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id
)
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

    * [Code](#code)
     * [Install Requirements](#install-requirements)
     * [Usage](#usage)
     * [Example](#example)
   * [Datasets](#datasets)
     `<DATASET>DejaVu</DATASET>`
   * [Deployment and Failure Injection Scripts of Train-Ticket](#deployment-and-failure-injection-scripts-of-train-ticket)
   * [Citation](#citation)
   * [Supplementary details](#supplementary-details)
    ## Paper
     A preprint version: https://arxiv.org/abs/<PUBLICATION>2207.09021</PUBLICATION>


In [29]:
def do_prediction(sentence, tokens, sid_path):
    try:
        print(f"Process-{os.getpid()} processing {colored(sentence.text, 'red')} ...")
        prompt = prompt_template.replace('{input_text}', sentence.text)

        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)
        
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]
            
        outputs = model.generate(
            input_ids,
            max_new_tokens=255,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )
        response = outputs[0][input_ids.shape[-1]:]
        result = tokenizer.decode(response, skip_special_tokens=True)
        
        #print(f"Process-{os.getpid()} predict {colored(sentence.text, 'cyan')} successfully")
        with open(sid_path, 'w') as file:
            file.write(result)
    except Exception as ex:
        logging.error(f'[do_prediction] got exception: {ex}')

In [27]:
def extract_annotation_labels_if_possible(predicted_text):
    label_to_text_list = defaultdict(list)
    acc_adjusted_pos = 0
    for label in LABELS:
        regex = f'<{label}>(.*?)</{label}>'
        matches = re.finditer(regex, predicted_text, flags=re.IGNORECASE | re.DOTALL)
        for m in matches:
            adjusted_pos = len(label) + 2
            label_to_text_list[label].append({
                'text': m.group(1),
                'start': m.start(1) - adjusted_pos - acc_adjusted_pos,
                'end': m.end(1) - adjusted_pos - acc_adjusted_pos,
            })
            acc_adjusted_pos += adjusted_pos * 2 + 1
    return label_to_text_list



def post_process(predicted_text, tokens):
    cleaned_text = cleaner.Cleaner(predicted_text).clean()
    label_to_text_list = extract_annotation_labels_if_possible(cleaned_text)
    return label_to_text_list

In [28]:
def predict(sentence, tokens):
    path = f'./readme2kg-exp/results/{model_name}/prompt-{prompt_id}/zzz_{file_name}' # NOTE: prefix zzz for directory sorting, non-sense
    os.makedirs(path, exist_ok=True)
    sid = hashlib.sha256(sentence.text.encode()).hexdigest()[:8]
    #if not os.path.isfile(f'{path}/{sid}.txt'):   # original code
    if os.path.isdir(f'{path}'):
        do_prediction(sentence, tokens, f'{path}/{sid}.txt')

    with open(f'{path}/{sid}.txt', 'r') as fd:
        predicted_text = fd.read()

    label_to_text_list = post_process(predicted_text, tokens)
    # NOTE: sanity checking
    for label, text_list in label_to_text_list.items():
        for text in text_list:
            if text['text'] != sentence.text[text['start']:text['end']]:
                prompt = prompt_template.replace('{input_text}', sentence.text)
                #logging.warning(f"BUG? The predicted text is not exact the same as the original text. \n\nPrompt: {prompt}\nOriginal: {colored(sentence.text, 'green')}\nGenerated: {colored(text['text'], 'red')}\n--------------------------------------------------------------------------------")

    span_tokens_to_label_list = []
    for label, text_list in label_to_text_list.items():
        for text in text_list:
            span_tokens_to_label_list.append({
                'span_tokens': utils.make_span_tokens(tokens, text['start'], text['end']),
                'label': label
            })
    return span_tokens_to_label_list


In [31]:
def call_serial(doc: Document):
    annotations = []
    for sent in doc.sentences:
        tokens = doc.sentence_tokens(sent)
        span_tokens_to_label_list = predict(sentence=sent, tokens=tokens)
        
        # create the annotation instances
        for span_tokens_to_label in span_tokens_to_label_list:
            span_tokens = span_tokens_to_label['span_tokens']
            label = span_tokens_to_label['label']
            if span_tokens is None:
                continue

            annotation = utils.make_annotation(tokens=span_tokens, label=label)
            annotations.append(annotation)

    result = utils.replace_webanno_annotations(doc, annotations=annotations)
    return result

In [32]:
for file_name in file_names:
    file_path = os.path.join(base_path, file_name)
    ref_doc = webanno_tsv_read_file(file_path)
    predicted_doc = call_serial(ref_doc)
    # Verify
    if ref_doc.text != predicted_doc.text:
        #logging.warning('content changed')
        pass
    if len(ref_doc.sentences) == len(predicted_doc.sentences):
        #logging.warning('sentences changed')
        pass
    if len(ref_doc.tokens) == len(predicted_doc.tokens):
        #logging.warning('tokens changed')
        pass
    for s1, s2 in zip(ref_doc.sentences, predicted_doc.sentences):
        if s1 == s2:
            #logging.warning(f'sentence changed, \n{s1}\n{s2}')
            pass

    for t1, t2 in zip(ref_doc.tokens, predicted_doc.tokens):
        if t1 == t2:
            #logging.warning(f'token changed: \n{t1}\n{t2}')
            pass

    logging.warning(f"Predicted {len(predicted_doc.annotations)} annotations")
    prediction_path = os.path.join(output_folder, file_name)
    with open(prediction_path, 'w') as fd:
        fd.write(predicted_doc.tsv())

Process-1079647 processing [31mAll the software requirements are already pre-installed in the Docker image below.[0m ...
Process-1079647 processing [31mThe requirements are also listed in `requirements.txt` and `requirements-dev.txt`.[0m ...
Process-1079647 processing [31mNote that `DGL 0.8` is not released yet when I did this work, so I installed `DGL 0.8` manually from the source code.[0m ...
Process-1079647 processing [31mPyTorch version should be equal to or greater than 1.11.0.[0m ...
Process-1079647 processing [31m```bash    docker pull lizytalk/dejavu    ``` 2.[0m ...
Process-1079647 processing [31mPull the code from GitHub    ```bash    git pull https://github.com/NetManAIOps/DejaVu.git DejaVu    ``` 3.[0m ...
Process-1079647 processing [31mDownload the datasets following the link in the GitHub repo and extract the datasets into `.[0m ...
Process-1079647 processing [31m/DejaVu/data` 4.[0m ...
Process-1079647 processing [31mI use the command `realpath` in the ex



Process-1079647 processing [31m[![0m ...
Process-1079647 processing [31m[License: MIT](https://img.shields.io/badge/license-MIT-blue.svg?[0m ...
Process-1079647 processing [31mstyle=flat-square)](https://opensource.org/licenses/MIT) [![0m ...
Process-1079647 processing [31m[DOI](https://zenodo.org/badge/DOI/10.48550/arXiv.2404.13971.svg)](https://doi.org/10.48550/arXiv.2404.13971)  # ![0m ...
Process-1079647 processing [31m[HamilToniQ_logo](.[0m ...
Process-1079647 processing [31m/figures/HamilToniQ_logo.png)HamilToniQ: An Open-Source Benchmark Toolkit for Quantum Computers  Table of Contents:  1.[0m ...
Process-1079647 processing [31m[Introduction](#introduction) 2.[0m ...
Process-1079647 processing [31m[Quick Start](#quickstart) 3.[0m ...
Process-1079647 processing [31m[H-Scores](#hscores) 4.[0m ...
Process-1079647 processing [31m[Architecture](#architecture) 5.[0m ...
Process-1079647 processing [31m[How to cite](#cite)  <a name="introduction"></a>  ## Introducti



Process-1079647 processing [31m# `LinConGauss` ### _Integrals and samples of Gaussians under linear domain constraints_  ![0m ...
Process-1079647 processing [31m[Multilevel Splitting](https://repository-images.githubusercontent.com/243241472/797da100-5891-11ea-857f-0cca52af9239 "Multilevel Splitting")   ## Setup Clone the repository and run `setup.py` ```bash git clone https://github.com/alpiges/LinConGauss.git ~/LinConGauss cd ~/LinConGauss python setup.py install ```  ## Usage For usage, please refer to the tutorials in the `notebook` section.  ## How to cite If you are using `LinConGauss` for your research, consider citing the [paper](https://arxiv.org/abs/1910.09328)  ``` @inproceedings{GessnerKH2020,     title     = {Integrals over Gaussians under Linear Domain Constraints},     author    = {Alexandra Gessner and Oindrila Kanjilal and Philipp Hennig},     booktitle = {Proceedings of Machine Learning Research},     publisher = {PMLR},     year      = {2020},     url       = {htt



Process-1079647 processing [31m# 3D CMR-Domain-Adaptation  This repo contains code to train a deep learning model for **Unsupervised Domain Adaptation (UDA)** of 3D cardiac magnetic resonance (CMR) cine images to **transform from axial to short-axis orientation**.[0m ...
Process-1079647 processing [31mThe task associated to the domain adaptation is to perform a **segmentation task via a pre-trained fixed network**, and the results are leveraged to guide the transformation process (rigid transform via spatial transformer networks).[0m ...
Process-1079647 processing [31mThe trained model is able to transform an axial (AX) CMR into the patient specific short-axis (SAX) direction.[0m ...
Process-1079647 processing [31mThe model learns from paired AX/SAX CMR image pairs and a pre-trained SAX segmentation model.[0m ...
Process-1079647 processing [31mThe following gif exemplary visualizes the learning progress of this model.[0m ...
Process-1079647 processing [31mSlices along z-dire



Process-1079647 processing [31m<p align="center">     <img src="assets/emoji.png" alt="earthPT" width="150"/> </p>  # EarthPT  <p align="center">     <img src="assets/timeseries.png" alt="prediction" width="600"/> </p>  A simple repository for training time series large observation models.[0m ...
Process-1079647 processing [31mThis repository began its life as Andrej Karpathy's [nanoGPT](https://github.com/karpathy/nanoGPT), and has been altered so that it is usable for time series data.[0m ...
Process-1079647 processing [31m`train.py` reproduces [EarthPT-700M](https://arxiv.org/abs/2309.07207) when trained on 14B time series 'tokens' of ClearSky EO data within the TL UK National Grid tile.[0m ...
Process-1079647 processing [31mWhen run, `train.py` takes ~5 days to achieve Chinchilla 🐭  completion on a single 8xA100 40GB node.[0m ...
Process-1079647 processing [31mWithin `train.py` you will find a ~300-line boilerplate training loop and within `model.py` you will find a ~300-l



Process-1079647 processing [31m### Aspire Repository accompanying paper for modeling fine grained similarity between documents:   **Title**: "Multi-Vector Models with Textual Guidance for Fine-Grained Scientific Document Similarity"  **Authors**: Sheshera Mysore, Arman Cohan, Tom Hope  **Abstract**: We present a new scientific document similarity model based on matching fine-grained aspects of texts.[0m ...
Process-1079647 processing [31mTo train our model, we exploit a naturally-occurring source of supervision: sentences in the full-text of papers that cite multiple papers together (co-citations).[0m ...
Process-1079647 processing [31mSuch co-citations not only reflect close paper relatedness, but also provide textual descriptions of how the co-cited papers are related.[0m ...
Process-1079647 processing [31mThis novel form of textual supervision is used for learning to match aspects across papers.[0m ...
Process-1079647 processing [31mWe develop multi-vector representations w



Process-1079647 processing [31m# A Survey on the Role of Crowds in Combating Online Misinformation: Annotators, Evaluators, and Creators  A curated list of papers on "A Survey on the Role of Crowds in Combating Online Misinformation: Annotators, Evaluators, and Creators" ([Paper link](https://arxiv.org/abs/2310.02095))   ## Citation ```  @article{he2023survey,   title={A Survey on the Role of Crowds in Combating Online Misinformation: Annotators, Evaluators, and Creators},   author={He, Bing and Hu, Yibo and Lee, Yeon-Chang and Oh, Soyoung and Verma, Gaurav and Kumar, Srijan},   journal={arXiv preprint arXiv:2310.02095},   year={2023} }  ```   Online misinformation poses a global risk with significant real-world consequences.[0m ...
Process-1079647 processing [31mTo combat misinformation, current research relies on professionals like journalists and fact-checkers for annotating and debunking false information, while also developing automated machine learning methods for detecting mi



Process-1079647 processing [31m# Improving Deep Metric Learning by Divide and Conquer ## About  PyTorch implementation for the paper _Improving Deep Metric  Learning by Divide and Conquer_ accepted to **TPAMI** (Sep. 2021), which is our follow-up paper of [_Divide and Conquer the Embedding Space for Metric Learning (CVPR 2019)_](https://github.com/CompVis/metric-learning-divide-and-conquer)  **Links**: * arxiv: https://arxiv.org/abs/2109.04003 or * TPAMI early access: https://ieeexplore.ieee.org/document/9540303   ## Requirements  * PyTorch 1.1.0 * Faiss-GPU >= 1.5.0, [Link](https://github.com/facebookresearch/faiss) * albumentations >= 0.4.5, [Link](https://github.com/albumentations-team/albumentations)   ## Usage ### Training:  Training is done by calling `python train.py` and setting the respective params, all of which are listed and explained  in `/experiment/margin_loss_resnet50.py` (the default setup for all our experiments).[0m ...
Process-1079647 processing [31mThe params pr



Process-1079647 processing [31m# TAG-CF: Test-time Aggregation for CF  Source code for the paper **[How Does Message Passing Improve Collaborative Filtering?][0m ...
Process-1079647 processing [31m(https://arxiv.org/abs/2404.08660)** accepted at NeurIPS 2024.[0m ...
Process-1079647 processing [31m>by [Mingxuan Ju](https://scholar.google.com/citations?[0m ...
Process-1079647 processing [31muser=qNoO67AAAAAJ&hl=en&oi=ao), [William Shiao](https://scholar.google.com/citations?[0m ...
Process-1079647 processing [31muser=TIq-P5AAAAAJ&hl=en&oi=ao), [Zhichun Guo](https://scholar.google.com/citations?[0m ...
Process-1079647 processing [31muser=BOFfWR0AAAAJ&hl=en&oi=ao), [Fanny Ye](https://scholar.google.com/citations?[0m ...
Process-1079647 processing [31muser=egjr888AAAAJ&hl=en&oi=ao), [Yozen Liu](https://scholar.google.com/citations?[0m ...
Process-1079647 processing [31muser=i3U2JjEAAAAJ&hl=en&oi=ao), [Neil Shah](https://scholar.google.com/citations?[0m ...
Process-1079647 pr



Process-1079647 processing [31m# [AAAI 2024] NuScenes-QA  Official repository for the AAAI 2024 paper **[NuScenes-QA: A Multi-modal Visual Question Answering Benchmark for Autonomous Driving Scenario](https://arxiv.org/pdf/2305.14836.pdf)**.  ![0m ...
Process-1079647 processing [31m[DataConstruction](docs/data_construction.png)  ## :fire: News  - `2024.11.01`  CenterPoint feature released[0m ...
Process-1079647 processing [31m.[0m ...
Process-1079647 processing [31m- `2024.10.11`  Training and Testing code released. - `2023.12.09`  Our paper is accepted by AAAI 2024![0m ...
Process-1079647 processing [31m- `2023.09.04`  Our NuScenes-QA dataset v1.0 released.  ## :hourglass_flowing_sand: To Do  - [x] Release question & anwswer data - [x] Release visual feature - [x] Release training and testing code  ## :running: Getting Started  ### Data Preparation  We have released our question-answer annotations, please download it from [HERE](https://drive.google.com/drive/folders/1jIkICT2



Process-1079647 processing [31m# ModelSketchBook — Getting Started  [Paper](https://hci.stanford.edu/publications/2023/Lam_ModelSketching_CHI23.pdf) | [DOI](https://doi.org/10.1145/3544548.3581290) |  [Video](https://youtu.be/-zaeXENVTfk) | [Sample NB](https://github.com/StanfordHCI/ModelSketchBook/blob/main/example_nb/23_04_ModelSketchBook_example.ipynb) |  <a target="_blank" href="https://colab.research.google.com/github/StanfordHCI/ModelSketchBook/blob/main/example_nb/23_04_ModelSketchBook_example.ipynb">   <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a>  <p align="center"> <img src=".[0m ...
Process-1079647 processing [31m/docs/media/ModelSketchBook.png" alt="ModelSketchBook logo" width="75%"> </p>  **ModelSketchBook** is a Python package introduced as part of an ACM CHI 2023 paper:  **Model Sketching: Centering Concepts in Early-Stage Machine Learning Model Design**.[0m ...
Process-1079647 processing [31m*Michelle S.[0m ...
Proc



Process-1079647 processing [31m# Keyword Transformer: A Self-Attention Model for Keyword Spotting  <img src="kwt.png" alt="drawing" width="200"/>  This is the official repository for the paper [Keyword Transformer: A Self-Attention Model for Keyword Spotting](https://arxiv.org/abs/2104.00769), presented at Interspeech 2021.[0m ...
Process-1079647 processing [31mConsider citing our paper if you find this work useful.  ``` @inproceedings{berg21_interspeech,   author={Axel Berg and Mark O’Connor and Miguel Tairum Cruz},   title={{Keyword Transformer: A Self-Attention Model for Keyword Spotting}},   year=2021,   booktitle={Proc.[0m ...
Process-1079647 processing [31mInterspeech 2021},   pages={4249--4253},   doi={10.21437/Interspeech.2021-1286} } ```  ## Setup  ### Download Google Speech Commands  There are two versions of the dataset, V1 and V2.[0m ...
Process-1079647 processing [31mTo download and extract dataset V2, run:  ```shell wget https://storage.googleapis.com/download.tens



Process-1079647 processing [31m# Low_Resource_KBP knowledge graph population in low resource conditions   The file "*Few-Shot_ED.json.zip*" is the ***FewEvent*** dataset for the paper accepted by WSDM 2020 ***["Meta-Learning with Dynamic-Memory-Based Prototypical Network for Few-Shot Event Detection"](https://arxiv.org/abs/1910.11621)***   ## Source of Raw Data * We first scale up the number of event types in existing datasets, including the [ACE-2005 corpus](http://projects.ldc.upenn.edu/ace/), and [TAC-KBP-2017 Event Track Data](https://tac.nist.gov/2017/KBP/Event/index.html)[0m ...
Process-1079647 processing [31m.[0m ...
Process-1079647 processing [31m* We then import and extend some new event types based on an [automatically-labeled event data](https://github.com/acl2017submission/event-data), from Freebase and Wikipedia, constrained to specific domains such as music, film, sports, education, etc.  ## Data Structure In "*Few-Shot_ED.json.zip*"，the key is "*event type label*", 



Process-1079647 processing [31m# Code search  This project contains the code to reproduce the experiments in the paper [Neural Code Search Revisited: Enhancing Code Snippet Retrieval through Natural Language Intent](https://arxiv.org/abs/2008.12193).[0m ...
Process-1079647 processing [31mIt implements retrieval systems for annotated code snippets: pairs of a code snippet and a short natural language description.[0m ...
Process-1079647 processing [31mOur pretrained models and datasets are hosted on Zenodo (https://zenodo.org/record/4001602).[0m ...
Process-1079647 processing [31mThe models and datasets will be downloaded automatically when calling `load_model`, `load_snippet_collection`, etc.[0m ...
Process-1079647 processing [31m(see the code examples below).[0m ...
Process-1079647 processing [31mIn addition, the project also implements some *code-only* retrieval models (BM25, NCS, UNIF) for snippet collections that do not come with descriptions.[0m ...
Process-1079647 proc



Process-1079647 processing [31m# NetCov NetCov is an open-source tool that can be used with [Batfish](https://github.com/batfish/batfish) to analyze test coverage for network configurations.[0m ...
Process-1079647 processing [31mGiven a set of Batfish queries, it analyzes which lines of configurations has/has not been covered.[0m ...
Process-1079647 processing [31mThe result can be used to assess the rigorousness of the test suite and help discover the blind spots.[0m ...
Process-1079647 processing [31mPlease refer to our [paper](https://www.usenix.org/conference/nsdi23/presentation/xu) for technical details.[0m ...
Process-1079647 processing [31mNetCov is written in Python and can be used in concert with [pybatfish](https://pybatfish.readthedocs.io/en/latest/notebooks/interacting.html), Batfish's Python API.[0m ...
Process-1079647 processing [31m[<img src="screenshot_demo_video.png"  width="500">](https://youtube.com/video/FcBD2LhxqOQ)  ## Features  NetCov supports coverage



Process-1079647 processing [31m# EntQA  This repo provides the code for our ICLR 2022 paper [EntQA: Entitly Linking as Question Answering](https://arxiv.org/pdf/2110.02369.pdf)  ## Setup  ``` conda create --name entqa python=3.8 conda activate entqa pip install -r requirements.txt conda install -c pytorch faiss-gpu cudatoolkit=11.0  ```  ## Download data & preprocess All the preprocessed data can be downloaded [here](https://drive.google.com/drive/folders/1DQvfjKOuOoUE3YcYrg2GIvODaOEZXMdH?[0m ...
Process-1079647 processing [31musp=sharing), you can skip following preprocess steps.[0m ...
Process-1079647 processing [31mOr preprocess by yourself:  1.[0m ...
Process-1079647 processing [31mDownload KILT wikipedia knowledge base [here](https://github.com/facebookresearch/KILT) and put it under a kb directory like /raw_kb/  \ 2.[0m ...
Process-1079647 processing [31mDownload BLINK pretrained retriever model [here](https://github.com/facebookresearch/BLINK)  \ 3.[0m ...
Process-1079



Process-1079647 processing [31m# Lightweight-Face-Detector-Pruning  Pruning Lightweight Face Detectors **EXTD** and **EResFD** using NNI's `FPGMPruner` and `L1NormPruner`.[0m ...
Process-1079647 processing [31mRepository updated in April 2024, for correction / completion of the paper-related materials, and for releasing scripts that facilitate the Android deployment of pruned EResFD models.  ## Project Structure  The repository is organized into 4 folders:  - `EXTD_Pytorch-master/`: Contains code and resources specific to the EXTD model. - `EResFD-main/`: Contains code and resources for the EResFD model. - `Pruned_Models/`: A collection of pre-pruned model weights (`.pth` files) for both EXTD and EResFD.[0m ...
Process-1079647 processing [31mThe pruned models that are evaluated in Tables 1 and 2 of our paper are provided, i.e. each of the EXTD, EResFD Face Detectors is pruned using one of the FPGM, L1 pruning techniques, for target pruning rates equal to 10%, 20%, 30%, 40% and 50%



Process-1079647 processing [31m# UniHD at TSAR-2022 Shared Task: Is Compute All We Need for Lexical Simplification?[0m ...
Process-1079647 processing [31mThis repository contains the modifications made by our team for the (winning) entry in the English shared task category.[0m ...
Process-1079647 processing [31mWe further include modifications made for Spanish and Portuguese, obtaining SotA in those languages as well.[0m ...
Process-1079647 processing [31m**Find the arXiv version of our paper here: https://arxiv.org/abs/2301.01764**  In case you find these results useful, please consider citing our work in addition to the shared task paper (see below).  ``` @article{aumiller-gertz-2023-unihd, author = {Aumiller, Dennis and Gertz, Michael}, title = {{UniHD at TSAR-2022 Shared Task: Is Compute All We Need for Lexical Simplification?}}[0m ...
Process-1079647 processing [31m, journal = {CoRR}, volume = {abs/2301.01764}, eprinttype = {arXiv}, eprint = {2301.01764}, url = {https://a



Process-1079647 processing [31m# Open Datasheets  The Open Datasheets framework is a simple, standardized no-code way to document datasets.[0m ...
Process-1079647 processing [31mIt is inspired by the concept of [datasheets for datasets](https://arxiv.org/abs/1803.09010) and [Understanding Machine Learning Practitioners' Data Documentation Perceptions, Needs, Challenges, and Desiderata](https://dl.acm.org/doi/10.1145/3555760) research papers.[0m ...
Process-1079647 processing [31mThe framework is designed to be both machine-readable and human-readable, serving as a tool for dataset creators to document their datasets and for dataset consumers to understand the datasets they are using.[0m ...
Process-1079647 processing [31mBy integrating directly with GitHub, the framework allows you to create, edit, and export your Open Datasheets directly to GitHub.[0m ...
Process-1079647 processing [31mIt leverages the widely used [Data Package](https://specs.frictionlessdata.io/data-package/



Process-1079647 processing [31m# fisher-information ## About the Project **For the most up-to-date work using this framework, and if you are interested in applying the framework to experimental design problems of your own, see the [experimental-design](https://github.com/James-Durant/experimental-design) repository**.[0m ...
Process-1079647 processing [31mThis repository contains the [code](/fisher-information), [data](/fisher-information/data) and [results](/fisher-information/results) for a framework for determining the maximum information gain and optimising experimental design in neutron reflectometry experiments using the Fisher information (FI).[0m ...
Process-1079647 processing [31mIn neutron reflectometry experiments, the FI can be analytically calculated and used to provide sub-second predictions of parameter uncertainties.[0m ...
Process-1079647 processing [31mThese uncertainties can influence real-time decisions about measurement angle, measurement time, contrast choi



Process-1079647 processing [31m# story-distiller  [![0m ...
Process-1079647 processing [31m[Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)  This project attempts to embed a story into a music playlist by sorting the playlist (i.e., sequencing it) so that the order of the music follows a narrative arc.[0m ...
Process-1079647 processing [31mThe music tracks are fitted to a fixed narrative template based on the output of a machine learning model, which itself distills each track down to its narrative essence.[0m ...
Process-1079647 processing [31mFor more information on narrative essence and how it generalizes to other forms of media, see *On the Distillation of Stories for Transferring Narrative Arcs in Collections of Independent Media* by Dylan R.[0m ...
Process-1079647 processing [31mAshley, Vincent Herrmann, Zachary Friggstad, and Jürgen Schmidhuber.   ## Installation  This project is implemented in [Python](https



Process-1079647 processing [31m# MLLM-Bench MLLM-Bench: Evaluating Multimodal LLMs with Per-sample Criteria <center>  ![0m ...
Process-1079647 processing [31m[Python 3.9+](https://img.shields.io/badge/Python-3.9+-lightblue) ![0m ...
Process-1079647 processing [31m[Pytorch 2.0](https://img.shields.io/badge/PyTorch-2.0+-lightblue) ![0m ...
Process-1079647 processing [31m[transformers](https://img.shields.io/badge/transformers-4.36.0.dev0%2B-lightblue) ![0m ...
Process-1079647 processing [31m[accelerate](https://img.shields.io/badge/accelerate-0.22+-lightblue) </center>  <p align="center">    📃  <a href="https://arxiv.org/abs/2311.13951" target="_blank">Paper</a> • 🌐  <a href="https://mllm-bench.llmzoo.com/" target="_blank">Website</a> • 🤗  <a href="huggingface.com" target="_blank">HuggingFace</a>    <p align="center"> <img src=".[0m ...
Process-1079647 processing [31m/image.png" alt="Data Composition" width="550" height="550">   ## 🌈  Update * **[2024.4.27]** V3 data, benchmar



Process-1079647 processing [31m# Sparse-Depth-Completion  This repo contains the implementation of our paper [Sparse and Noisy LiDAR Completion with RGB Guidance and Uncertainty](https://arxiv.org/abs/1902.05356) by [Wouter Van Gansbeke](https://github.com/wvangansbeke), Davy Neven, Bert De Brabandere and Luc Van Gool.[0m ...
Process-1079647 processing [31mIf you find this interesting or relevant to your work, consider citing:  ``` @inproceedings{wvangansbeke_depth_2019,     author={Van Gansbeke, Wouter and Neven, Davy and De Brabandere, Bert and Van Gool, Luc},     booktitle={2019 16th International Conference on Machine Vision Applications (MVA)},     title={Sparse and Noisy LiDAR Completion with RGB Guidance and Uncertainty},     year={2019},     pages={1-6},     organization={IEEE} } ```  ## License  This software is released under a creative commons license which allows for personal and research use only.[0m ...
Process-1079647 processing [31mFor a commercial license please c



Process-1079647 processing [31m# Scaffolding Learning Regime (SLR)  [![0m ...
Process-1079647 processing [31m[paper](https://img.shields.io/badge/paper-52b69a?[0m ...
Process-1079647 processing [31mstyle=for-the-badge&logo=arxiv&logoColor=white)](https://arxiv.org/abs/2206.13263) [![0m ...
Process-1079647 processing [31m[weights](https://img.shields.io/badge/weights-34a0a4?[0m ...
Process-1079647 processing [31mstyle=for-the-badge&logo=DocuSign&logoColor=white)](#pretrained-models) [![0m ...
Process-1079647 processing [31m[presentation](https://img.shields.io/badge/presentation-168aad?[0m ...
Process-1079647 processing [31mstyle=for-the-badge&logo=youtube&logoColor=white)](https://www.youtube.com/watch?[0m ...
Process-1079647 processing [31mv=F4sLbbMsoHw) [![0m ...
Process-1079647 processing [31m[cite](https://img.shields.io/badge/bibtex-1a759f?[0m ...
Process-1079647 processing [31mstyle=for-the-badge&logo=overleaf&logoColor=white)](#cite)   A PyTorch implementation



Process-1079647 processing [31m# PrivacyFL: A simulator for privacy-preserving and secure federated learning  This repository contains the source code for running a privacy perserving federated learning simulator.[0m ...
Process-1079647 processing [31mThe source code is currently set up for the configuration of three clients performing secure and differentially private federated learning using logistic regresion on the MNIST dataset.[0m ...
Process-1079647 processing [31mThis library, however, is meant to be modified so as to simulate your own secure federated machine learning configuration.[0m ...
Process-1079647 processing [31mWe hope that this simulation can help users decide whether it is beneficial for them to participate in differentially-private federated learning for a given differentially private algorithm.  ## UPDATE : Paper accepted at the 29TH ACM INTERNATIONAL CONFERENCE ON INFORMATION AND KNOWLEDGE MANAGEMENT  Paper and Video Link : https://dl.acm.org/doi/10.1145/3



Process-1079647 processing [31m [0m ...
Process-1079647 processing [31m# VN-EGNN: E(3)-Equivariant Graph Neural Networks with Virtual Nodes Enhance Protein Binding Site Identification  [![0m ...
Process-1079647 processing [31m[Open in HuggingFace](https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm.svg)](https://huggingface.co/spaces/ml-jku/vnegnn) [![0m ...
Process-1079647 processing [31m[](https://img.shields.io/badge/paper-arxiv2310.06763-red?[0m ...
Process-1079647 processing [31mstyle=plastic&logo=GitBook)](https://arxiv.org/abs/2404.07194) [![0m ...
Process-1079647 processing [31m[](https://img.shields.io/badge/model-pink?[0m ...
Process-1079647 processing [31mstyle=plastic&logo=themodelsresource)](https://huggingface.co/fses91/VNEGNN-MODEL) [![0m ...
Process-1079647 processing [31m[](https://img.shields.io/badge/project_page-blue?[0m ...
Process-1079647 processing [31mstyle=plastic&logo=internetcomputer)](https://linktr.ee/vnegnn)  #



Process-1079647 processing [31m# Awesome Neural Tree Papers <img class="emoji" alt=":art:" height="30" width="30" src="tree.png"> Selected papers and possible corresponding codes in our review paper **"A Survey of Neural Trees" [[arXiv Version]](https://arxiv.org/abs/2209.03415)**  *If you find there is a missed paper or a possible mistake in our survey, please feel free to email me or pull a request here.[0m ...
Process-1079647 processing [31mI am more than glad to receive your advice.[0m ...
Process-1079647 processing [31mThanks![0m ...
Process-1079647 processing [31m*  ## Introduction Neural trees (NTs) refer to a school of methods that combine neural networks (NNs) and decision trees (DTs), for which we present a comprehensive review in this survey.[0m ...
Process-1079647 processing [31mOur keynote is to identify how these approaches enhance the model interpretability and suggest possible solutions to the remaining challenges.[0m ...
Process-1079647 processing [31mBeside



Process-1079647 processing [31m# Air-Writing **A CNN Based Framework for Unistroke Numeral Recognition in Air-Writing.**  *Accepted in The IEEE International Conference on Frontiers in Handwriting Recognition (ICFHR) 2018.*  **[[arXiv]](https://arxiv.org/abs/2303.07989)** <img align='right' height='100' src='https://github.com/prasunroy/air-writing/blob/master/assets/logo.png' />  ![0m ...
Process-1079647 processing [31m[badge](https://github.com/prasunroy/air-writing/blob/master/assets/badge_1.svg) ![0m ...
Process-1079647 processing [31m[badge](https://github.com/prasunroy/air-writing/blob/master/assets/badge_2.svg)  ## Installation #### Step 1: Install [Anaconda](https://www.anaconda.com/download/) distribution of python 2.7+ or 3.5+ (recommended) #### Step 2: Update Anaconda ``` conda update conda conda update anaconda ``` #### Step 3: Install dependencies ``` conda install theano pip install keras numpy opencv-python pyqt5 ``` >To switch backend from "tensorflow" (default) to



Process-1079647 processing [31m# megaman: Manifold Learning for Millions of Points  <img src="https://raw.githubusercontent.com/mmp2/megaman/master/doc/images/word2vec_rmetric_plot_no_digits.png" height=200><img src="https://raw.githubusercontent.com/mmp2/megaman/master/doc/images/spectra_D4000.png" height=200><img src="https://raw.githubusercontent.com/mmp2/megaman/master/doc/images/spectra_Halpha.png" height=200>  [![0m ...
Process-1079647 processing [31m[Anaconda-Server Badge](https://anaconda.org/conda-forge/megaman/badges/downloads.svg)](https://anaconda.org/conda-forge/megaman) [![0m ...
Process-1079647 processing [31m[build status](http://img.shields.io/travis/mmp2/megaman/master.svg?[0m ...
Process-1079647 processing [31mstyle=flat)](https://travis-ci.org/mmp2/megaman) [![0m ...
Process-1079647 processing [31m[version status](http://img.shields.io/pypi/v/megaman.svg?[0m ...
Process-1079647 processing [31mstyle=flat)](https://pypi.python.org/pypi/megaman) [![0m ...
P



Process-1079647 processing [31m# Words as Gatekeepers  License: [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/)  **Authors**: Li Lucy, Jesse Dodge, David Bamman, Katherine A.[0m ...
Process-1079647 processing [31mKeith  **[Paper](https://arxiv.org/abs/2212.09676)**, **[Blog post](https://blog.allenai.org/words-as-gatekeepers-measuring-discipline-specific-terms-and-meanings-in-scholarly-publications-718dc56d08a5)**  <p align="center"> <img src="image.png" width="75%" > </p>  **Abstract**: Scholarly text is often laden with jargon, or specialized language that can facilitate efficient in-group communication within fields but hinder understanding for out-groups.[0m ...
Process-1079647 processing [31mIn this work, we develop and validate an interpretable approach for measuring scholarly jargon from text.[0m ...
Process-1079647 processing [31mExpanding the scope of prior work which focuses on word types, we use word sense induction to also identify words that are wide



Process-1079647 processing [31m# Systematic-Generalization-via-Meaningful-Learning This repository is for the paper [Revisit Systematic Generalization via Meaningful Learning](https://aclanthology.org/2022.blackboxnlp-1.6).[0m ...
Process-1079647 processing [31m*In Proceedings of the Fifth BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP*, pages 62–79, Abu Dhabi, United Arab Emirates (Hybrid).[0m ...
Process-1079647 processing [31mAssociation for Computational Linguistics.[0m ...
Process-1079647 processing [31m[[arXiv](https://arxiv.org/abs/2003.06658)] [[Poster](https://www.shininglab.ai/assets/posters/Revisit%20Systematic%20Generalization%20via%20Meaningful%20Learning.pdf)]  ## Directory + **main/config.py** - Configurations + **main/res** - Resources including model check points, datasets, experiment records, and results + **main/src** - Source code including model structures and utility functions ``` Systematic-Generalization-via-Meaningful-Learning



Process-1079647 processing [31m# NELA-GT-2019  This repository contain examples of how to use the NELA-GT-2019 data set with Python 3.[0m ...
Process-1079647 processing [31m__Download the dataset from here__: https://dataverse.harvard.edu/dataset.xhtml?[0m ...
Process-1079647 processing [31mpersistentId=doi:10.7910/DVN/O7FWPO  __For more details about this dataset, check the paper__: https://arxiv.org/abs/2003.08444  If you use this dataset in your work, please cite us as follows: <br> ``` @misc{     gruppi2020nelagt2019,     title={NELA-GT-2019: A Large Multi-Labelled News Dataset for The Study of Misinformation in News Articles},     author={Maurício Gruppi and Benjamin D.[0m ...
Process-1079647 processing [31mHorne and Sibel Adalı},     year={2020},     eprint={2003.08444},     archivePrefix={arXiv},     primaryClass={cs.CY} } ``` ## Data  Metadata|| ---|--- Dataset name|`NELA-GT-2019` Formats|`Sqlite3`,`JSON` No. of articles|`1118821` No. of sources|`261` Collection period|`



Process-1079647 processing [31m## Ontological knowledge infusion in embedding-Large Language Models  The code of this repository implements a novel approach to improve an embedding-Large Language  Model (embedding-LLM) of interest by infusing the knowledge formalized by a reference ontology: ontological  knowledge infusion aims at boosting the ability of the considered embedding-LLM to effectively model  the knowledge domain described by the infused ontology.[0m ...
Process-1079647 processing [31mThe ontological knowledge infusion approach is described into details by the article  [Towards Ontology-Enhanced Representation Learning for Large Language Models](https://arxiv.org/abs/2405.20527).[0m ...
Process-1079647 processing [31mHereinafter we describe the procedure to follow to infuse the disease knowledge formalized by the  [disease ontology MONDO](https://mondo.monarchinitiative.org/) in four widespread embedding-LLMs  and evaluate the ontology-enhanced embedding-LLMs against t



Process-1079647 processing [31m  [0m ...
Process-1079647 processing [31m![0m ...
Process-1079647 processing [31m[Version](https://badge.fury.io/py/karateclub.svg?[0m ...
Process-1079647 processing [31mstyle=plastic)  ![0m ...
Process-1079647 processing [31m[License](https://img.shields.io/github/license/benedekrozemberczki/karateclub.svg) [![0m ...
Process-1079647 processing [31m[repo size](https://img.shields.io/github/repo-size/benedekrozemberczki/karateclub.svg)](https://github.com/benedekrozemberczki/karateclub/archive/master.zip)  [![0m ...
Process-1079647 processing [31m[Arxiv](https://img.shields.io/badge/ArXiv-2003.04819-orange.svg)](https://arxiv.org/abs/2003.04819) [![0m ...
Process-1079647 processing [31m[build badge](https://github.com/benedekrozemberczki/karateclub/workflows/CI/badge.svg)](https://github.com/benedekrozemberczki/karateclub/actions?[0m ...
Process-1079647 processing [31mquery=workflow%3ACI)  [![0m ...
Process-1079647 processing [31m[covera



Process-1079647 processing [31m# Ref-NMS Official codebase for AAAI 2021 paper ["Ref-NMS: Breaking Proposal Bottlenecks in Two-Stage Referring Expression Grounding"](https://arxiv.org/abs/2009.01449).  ## Prerequisites The following dependencies should be enough.[0m ...
Process-1079647 processing [31mSee [environment.yml](environment.yml) for complete environment settings. - python 3.7.6 - pytorch 1.1.0 - torchvision 0.3.0 - tensorboard 2.1.0 - spacy 2.2.3  ## Data Preparation Follow instructions in `data/README.md` to setup `data` directory.[0m ...
Process-1079647 processing [31mRun following script to setup `cache` directory: ``` sh scripts/prepare_data.sh ``` This should generate following files under `cache` directory: - vocabulary file: `std_vocab_<dataset>_<split_by>.txt` - selected GloVe feature: `std_glove_<dataset>_<split_by>.npy` - referring expression database: `std_refdb_<dataset>_<split_by>.json` - critical objects database: `std_ctxdb_<dataset>_<split_by>.json`   ## 



Process-1079647 processing [31m [0m ...
Process-1079647 processing [31m####################################################################  Source code and datasets of Para-DPMM model for single cell transcriptomic clustering to reproduce results in paper "[Parallel Clustering of Single Cell Transcriptomic Data with Split-Merge Sampling on Dirichlet Process Mixtures](https://arxiv.org/pdf/1812.10048.pdf)", Author: Tiehang Duan; José P.[0m ...
Process-1079647 processing [31mPinto; Xiaohui Xie;   # 1.[0m ...
Process-1079647 processing [31mData Preparation:  In the datasets folder, we included the mat files that are used in the paper's experiment part.[0m ...
Process-1079647 processing [31mYou can also prepare your own data following the procedures below.  ## 1.1  Download raw datasets from 10X genomics website (https://support.10xgenomics.com/single-cell-gene-expression/datasets) and store the files in the datasets folder;  ## 1.2  Follow the comments in "data_preparation.m" to



Process-1079647 processing [31m## Fully Convolutional Instance-aware Semantic Segmentation  The major contributors of this repository include [Haozhi Qi](https://github.com/Oh233), [Yi Li](https://github.com/liyi14), [Guodong Zhang](https://github.com/gd-zhang), [Haochen Zhang](https://github.com/Braininvat), [Jifeng Dai](https://github.com/daijifeng001), and [Yichen Wei](https://github.com/YichenWei).  ### Introduction  **FCIS** is a fully convolutional end-to-end solution for instance segmentation, which won the first place in COCO segmentation challenge 2016.[0m ...
Process-1079647 processing [31mFCIS is initially described in a [CVPR 2017 spotlight paper](https://arxiv.org/abs/1611.07709).[0m ...
Process-1079647 processing [31mIt is worth noticing that: * FCIS provides a simple, fast and accurate framework for instance segmentation. * Different from [MNC](https://github.com/daijifeng001/MNC), FCIS performs instance mask estimation and categorization jointly and simultanously, 



Process-1079647 processing [31m# BOOKSUM: A Collection of Datasets for Long-form Narrative Summarization Authors: [Wojciech Kryściński](https://twitter.com/iam_wkr), [Nazneen Rajani](https://twitter.com/nazneenrajani), [Divyansh Agarwal](https://twitter.com/jigsaw2212), [Caiming Xiong](https://twitter.com/caimingxiong), [Dragomir Radev](http://www.cs.yale.edu/homes/radev/)  ## Introduction The majority of available text summarization datasets include short-form source documents that lack long-range causal and temporal dependencies, and often contain strong layout and stylistic biases.[0m ...
Process-1079647 processing [31mWhile relevant, such datasets will offer limited challenges for future generations of text summarization systems.[0m ...
Process-1079647 processing [31mWe address these issues by introducing BookSum, a collection of datasets for long-form narrative summarization.[0m ...
Process-1079647 processing [31mOur dataset covers source documents from the literature domai



Process-1079647 processing [31m# MeanSum: A Model for Unsupervised Neural Multi-Document Abstractive Summarization  Corresponding paper, accepted to ICML 2019: [https://arxiv.org/abs/1810.05739](https://arxiv.org/abs/1810.05739).  ## Requirements  Main requirements: - python 3 - torch 0.4.0  Rest of python packages in ```requirements.txt```.[0m ...
Process-1079647 processing [31mTested in Docker, image = ```pytorch/pytorch:0.4_cuda9_cudnn7```.  ## General setup   Execute inside ```scripts/```:  ##### Create directories that aren't part of the Git repo (checkpoints/, outputs/):  ``` bash setup_dirs.sh ```  ##### Install python packages:  ``` bash install_python_pkgs.sh ```  ##### The default parameters for Tensorboard(x?)[0m ...
Process-1079647 processing [31mcause texts from writer.add_text() to not show up.[0m ...
Process-1079647 processing [31mUpdate by:  ``` python update_tensorboard.py ```    ## Downloading data and pretrained models  ### Data  1.[0m ...
Process-1079647 pro



Process-1079647 processing [31m# Impact of Leakage on Data Harmonization in Machine Learning Pipelines in Class Imbalance Across Sites  ## About  The Forschungszentrum Jülich Machine Learning Library  It is currently being developed and maintained at the [Applied Machine Learning](https://www.fz-juelich.de/en/inm/inm-7/research-groups/applied-machine-learning-aml) group at [Forschungszentrum Juelich](https://www.fz-juelich.de/en), Germany.   ## Overview  **PrettYharmonize** is a Python package developed to address data leakage in the harmonization of biomedical datasets with site-specific variability, particularly under scenarios where class balance differs across data collection sites.[0m ...
Process-1079647 processing [31mTraditional harmonization methods like ComBat, while widely used, often struggle with data leakage, leading to compromised model performance.[0m ...
Process-1079647 processing [31mPrettYharmonize introduces a novel approach that leverages "pretending" target la



# Scorer.py

In [17]:
import argparse
import json
import os
from collections import defaultdict
from functools import reduce
from webanno_tsv import webanno_tsv_read_file, Document, Annotation
from typing import List, Union
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

LABELS = [
    'CONFERENCE',
    'DATASET',
    'EVALMETRIC',
    'LICENSE',
    'ONTOLOGY',
    'PROGLANG',
    'PROJECT',
    'PUBLICATION',
    'SOFTWARE',
    'WORKSHOP'
]

def to_char_bio(src_path: str, ref_path: str) -> List[List[str]]:
    ref_doc = webanno_tsv_read_file(ref_path)
    # Parse the WebAnno TSV file
    doc = webanno_tsv_read_file(src_path)
    # Initialize a list to store character-level BIO tags
    bio_tags_list = []
    for target_label in LABELS:
        bio_tags = ['#'] * len(ref_doc.text)  # Default to '#' for all characters
        # Pick interested sentences and default them to 'O'
        for annotation in ref_doc.annotations:
            label = annotation.label
            if label != target_label:
                continue
            sentences = ref_doc.annotation_sentences(annotation)
            for sentence in sentences:
                tokens = ref_doc.sentence_tokens(sentence)
                start_char, end_char = tokens[0].start, tokens[-1].end
                bio_tags[start_char:end_char] = ['O'] * (end_char-start_char)

        for annotation in doc.annotations:
            label = annotation.label
            if label != target_label:
                continue

            start_token, end_token = annotation.tokens[0], annotation.tokens[-1]
            start_char = start_token.start
            end_char = end_token.end
            # Sanity check
            if ref_doc.text[start_char:end_char] != annotation.text:
                msg = f"ERROR: src: {src_path}, annotated '{annotation.text}', text: '{ref_doc.text[start_char:end_char]}'"
                print(msg)

            if 'I-' in bio_tags[start_char]:
                # Overlapping, it's annotated by another annotations, we connect them as one annotations
                pass
            else:
                if bio_tags[start_char] != '#':
                    # Assign BIO tags to characters in the entity span
                    bio_tags[start_char] = f'B-{label}'  # Beginning of the entity

            for i in range(start_char + 1, end_char):
                if bio_tags[i] != '#':
                    bio_tags[i] = f'I-{label}'  # Inside the entity

        # Remove unannotated sentences from bio list.
        bio_tags = [x for x in filter(lambda x: x != '#', bio_tags)]
        bio_tags_list.append(bio_tags)

    return bio_tags_list

In [15]:
def flatten(lst):
    return reduce(lambda x, y: x + y, lst)

In [18]:
import os 

print(os.getcwd())
ref_dir = '../results/Meta-Llama-3-8B-Instruct/test_unlabeled/'
pred_dir = '../results/Meta-Llama-3-8B-Instruct/prompt-0/'
score_dir = '../results/scores/'

os.makedirs(pred_dir, exist_ok=True)
os.makedirs(score_dir, exist_ok=True)

ref_file_names = sorted([fp for fp in os.listdir(ref_dir) if os.path.isfile(f'{ref_dir}/{fp}') and fp.endswith('.tsv')])

if len(ref_file_names) == 0:
    raise Exception("ERROR: No reference files found, configuration error?")

all_ref_bio_tags_list = []
for ref_file_name in ref_file_names:
    src_path = os.path.join(ref_dir, ref_file_name)
    ref_path = src_path
    all_ref_bio_tags_list.append(to_char_bio(src_path, ref_path))

pred_file_names = sorted([fp for fp in os.listdir(pred_dir) if os.path.isfile(f'{pred_dir}/{fp}') and fp.endswith('.tsv')])
all_pred_bio_tags_list = []
for idx, ref_file_name in enumerate(ref_file_names):
    try:
        src_path = os.path.join(pred_dir, ref_file_name)
        ref_path = os.path.join(ref_dir, ref_file_name)
        all_pred_bio_tags_list.append(to_char_bio(src_path, ref_path))
    except FileNotFoundError:
        nbr_labels = len(all_ref_bio_tags_list[idx])
        assert nbr_labels == len(LABELS), "ERROR: reference tags doesn't have ${len(LABELS)} labels."
        pred = []
        for label_idx in range(nbr_labels):
            pred.append(['O'] * len(all_ref_bio_tags_list[idx][label_idx]))

        print(f"WARN: {ref_file_name} is missing, fill 'O' list as default prediction")
        all_pred_bio_tags_list.append(pred)
# Sanity checking
for idx, (ref_list, pred_list) in enumerate(zip(all_ref_bio_tags_list, all_pred_bio_tags_list)):
    for label_idx, (ref, pred) in enumerate(zip(ref_list, pred_list)):
        assert len(ref) == len(pred), f'ERROR: {ref_file_names[idx]}, label: {LABELS[label_idx]}, reference length: {len(ref)}, prediction length: {len(pred)}'

scores = {}
################################################################################
# Consider whole dataset
################################################################################
ref_bio_tags_list = flatten(flatten(all_ref_bio_tags_list))
pred_bio_tags_list = flatten(flatten(all_pred_bio_tags_list))

accuracy = accuracy_score(ref_bio_tags_list, pred_bio_tags_list)
scores['overall_accuracy'] = accuracy
average = 'macro'
ref_bio_tags_list = flatten(flatten(all_ref_bio_tags_list))
pred_bio_tags_list = flatten(flatten(all_pred_bio_tags_list))

f1 = f1_score(ref_bio_tags_list, pred_bio_tags_list, average=average)
precision = precision_score(ref_bio_tags_list, pred_bio_tags_list, average=average)
recall = recall_score(ref_bio_tags_list, pred_bio_tags_list, average=average)
scores[f"overall_{average}_precision"] = precision
scores[f"overall_{average}_recall"] = recall
scores[f"overall_{average}_f1"] = f1


################################################################################
# For each class
################################################################################
label_to_ref_bio_tags_list = defaultdict(list)
label_to_pred_bio_tags_list = defaultdict(list)
for ref_bio_tags_list, pred_bio_tags_list in zip(all_ref_bio_tags_list, all_pred_bio_tags_list):
    if len(ref_bio_tags_list) != len(LABELS):
        print('ERROR: ref bio tags list')
    if len(pred_bio_tags_list) != len(LABELS):
        print('ERROR: pred bio tags list')

    for label, ref_bio_tags, pred_bio_tags in zip(LABELS, ref_bio_tags_list, pred_bio_tags_list):
        label_to_ref_bio_tags_list[label].extend(ref_bio_tags)
        label_to_pred_bio_tags_list[label].extend(pred_bio_tags)
        if len(label_to_ref_bio_tags_list[label]) != len(label_to_pred_bio_tags_list[label]):
            print('ERROR: label_to_ref_pred_bio_tags')


for label in label_to_ref_bio_tags_list.keys():
    ref_bio_tags_list = label_to_ref_bio_tags_list[label]
    pred_bio_tags_list = label_to_pred_bio_tags_list[label]
    accuracy = accuracy_score(ref_bio_tags_list, pred_bio_tags_list)
    f1 = f1_score(ref_bio_tags_list, pred_bio_tags_list, average=average)
    precision = precision_score(ref_bio_tags_list, pred_bio_tags_list, average=average)
    recall = recall_score(ref_bio_tags_list, pred_bio_tags_list, average=average)
    scores[f"{label}_{average}_precision"] = precision
    scores[f"{label}_{average}_recall"] = recall
    scores[f"{label}_{average}_f1"] = f1

print("Scores:\n", json.dumps(scores, indent=2))

with open(os.path.join(score_dir, 'Meta-Llama-3-8B-Instruct-scores.json'), 'w') as fd:
    json.dump(scores, fd, indent=2)

/home/ann/fiz-ddb/notebook/readme2kg-exp/src
WARN: 231sm_Low_Resource_KBP_master_README.md.tsv is missing, fill 'O' list as default prediction
WARN: ARM-software_keyword-transformer_master_README.md.tsv is missing, fill 'O' list as default prediction
WARN: Cardio-AI_3d-mri-domain-adaptation_master_README.md.tsv is missing, fill 'O' list as default prediction
WARN: ChopinSharp_ref-nms_main_README.md.tsv is missing, fill 'O' list as default prediction
WARN: James-Durant_fisher-information_main_README.md.tsv is missing, fill 'O' list as default prediction
WARN: MELALab_nela-gt-2019_master_README.md.tsv is missing, fill 'O' list as default prediction
WARN: allenai_aspire_main_README.md.tsv is missing, fill 'O' list as default prediction
WARN: alpiges_LinConGauss_master_README.md.tsv is missing, fill 'O' list as default prediction
WARN: anonymous-submission-22_dejavu_master_README.md.tsv is missing, fill 'O' list as default prediction
WARN: aspiaspace_earthpt_main_README.md.tsv is missing, 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Scores:
 {
  "overall_accuracy": 0.9564671580140925,
  "overall_macro_precision": 0.1912934316028185,
  "overall_macro_recall": 0.2,
  "overall_macro_f1": 0.19554985200670627,
  "CONFERENCE_macro_precision": NaN,
  "CONFERENCE_macro_recall": NaN,
  "CONFERENCE_macro_f1": NaN,
  "DATASET_macro_precision": 0.3265321047242943,
  "DATASET_macro_recall": 0.3333333333333333,
  "DATASET_macro_f1": 0.329897668920167,
  "EVALMETRIC_macro_precision": NaN,
  "EVALMETRIC_macro_recall": NaN,
  "EVALMETRIC_macro_f1": NaN,
  "LICENSE_macro_precision": NaN,
  "LICENSE_macro_recall": NaN,
  "LICENSE_macro_f1": NaN,
  "ONTOLOGY_macro_precision": NaN,
  "ONTOLOGY_macro_recall": NaN,
  "ONTOLOGY_macro_f1": NaN,
  "PROGLANG_macro_precision": NaN,
  "PROGLANG_macro_recall": NaN,
  "PROGLANG_macro_f1": NaN,
  "PROJECT_macro_precision": NaN,
  "PROJECT_macro_recall": NaN,
  "PROJECT_macro_f1": NaN,
  "PUBLICATION_macro_precision": 0.3143107801600548,
  "PUBLICATION_macro_recall": 0.3333333333333333,
  "PUBLIC

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
