In [1]:
from operator import index
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset
# Map each split name to its DBLP pairwise CSV file, then load both splits in one call.
data_files = dict(
    train="../data/dblp/pairwise_dataset_train.csv",
    test="../data/dblp/pairwise_dataset_test.csv",
)
dataset = load_dataset("csv", data_files=data_files)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset['train']

Dataset({
    features: ['id', 'title', 'abstract', 'soft_labels_l1', 'relevant_children_l1', 'reasoning_l1', 'soft_labels_l2', 'relevant_children_l2', 'reasoning_l2', 'soft_labels_l3', 'relevant_children_l3', 'reasoning_l3'],
    num_rows: 1000
})

In [4]:
# Keep only the identifier and text columns in both splits.
columns_to_select = ['id', 'title', 'abstract']
for split in ('train', 'test'):
    dataset[split] = dataset[split].select_columns(columns_to_select)

In [5]:
def dblp_format(example):
    """Render a paper record as an indented text block: bold title, then abstract.

    Args:
        example: Mapping with 'title' and 'abstract' entries.

    Returns:
        A string that starts with a newline, places the title (wrapped in
        ``**``) and the abstract on separate four-space-indented lines, and
        ends with a newline followed by four spaces.
    """
    indent = "    "
    heading = f"**{example['title']}**"
    body = f"{example['abstract']}"
    return "\n" + indent + heading + "\n" + indent + body + "\n" + indent

print(dblp_format(dataset['train'][0]))


    **Human actions recognition from streamed Motion Capture**
    This paper introduces a new method for streamed action recognition using Motion Capture (MoCap) data. First, the histograms of action poses, extracted from MoCap data, are computed according to Hausdorf distance. Then, using a dynamic programming algorithm and an incremental histogram computation, our proposed solution recognizes actions in real time from streams of poses. The comparison of histograms for recognition was achieved using Bhattacharyya distance. Furthermore, the learning phase has remained very efficient with respect to both time and complexity. We have shown the effectiveness of our solution by testing it on large datasets, obtained from animation databases. In particular, we were able to achieve excellent recognition rates that have outperformed the existing methods.
    


In [18]:
import json
from src.taxonomy import Taxonomy

path_to_hierarchy = "../data/dblp/acm_ccs_hierarchy.json"
path_to_description = "../data/dblp/label_description.json"

def generate_taxonomy_from_json(path_to_hierarchy, path_to_description):
    """Build a Taxonomy from a nested hierarchy JSON file and a label-description JSON file.

    Args:
        path_to_hierarchy: Path to a JSON file holding a list of nodes; each
            node is a dict with a "label" and, optionally, a "children" list
            of the same shape.
        path_to_description: Path to a JSON file mapping labels to descriptions.

    Returns:
        A Taxonomy with one node added per hierarchy entry, visited
        depth-first. A label without a description entry uses the label
        itself as its description.
    """
    # JSON interchange files are UTF-8; do not depend on the platform default.
    with open(path_to_hierarchy, encoding="utf-8") as json_file:
        hierarchy = json.load(json_file)

    with open(path_to_description, encoding="utf-8") as json_file:
        description = json.load(json_file)

    taxonomy = Taxonomy()

    def create_subtree(nodes, parent_path):
        for node in nodes:
            label = node["label"]
            # Build a fresh list per node so add_node never receives a list
            # that is mutated afterwards while siblings are being visited.
            node_path = parent_path + [label]
            taxonomy.add_node(node_path, description.get(label, label))
            # Leaves may omit "children" or set it to null; treat both as empty.
            create_subtree(node.get("children") or [], node_path)

    create_subtree(hierarchy, [])
    return taxonomy

taxonomy = generate_taxonomy_from_json(path_to_hierarchy, path_to_description)


In [19]:
taxonomy.print_tree()

- root (Depth: 0): Root of taxonomy
    - General and reference (Depth: 1): Categorizes broad computing topics, including foundational concepts, educational materials, and reference works.
        - Document types (Depth: 2): Classifies formats like journals, theses, and conference papers used in academic and technical communication.
            - Surveys and overviews (Depth: 3): Summarizes existing research on a topic to provide context, trends, and gaps in a field.
            - Reference works (Depth: 3): Compiles authoritative collections of information, such as dictionaries, encyclopedias, and handbooks.
            - General conference proceedings (Depth: 3): Aggregates peer-reviewed papers presented at academic or industry conferences.
            - Biographies (Depth: 3): Documents the lives and contributions of individuals influential in computing history or research.
            - General literature (Depth: 3): Includes books, essays, and critiques on computing topics beyond

In [22]:
%reload_ext autoreload

In [None]:
# sample tag one case one level
from src.tagging.n_level_tagging import  choose_intents, tag_n_level
response_final = await tag_n_level(dblp_format(dataset['train'][0]), taxonomy, model='gpt-4.1-nano')

In [77]:
import asyncio
from aiolimiter import AsyncLimiter
from openai import RateLimitError
from backoff import on_exception, expo
from src.taxonomy import Taxonomy
# Throttle settings read by the tagging workers in this cell.
MAX_CONCURRENT = 5  # cap on workers inside the semaphore at the same time
sem = asyncio.Semaphore(MAX_CONCURRENT)
limiter = AsyncLimiter(60, 60)  # 60 calls per 60 seconds

@on_exception(expo, RateLimitError, max_time=60)
async def safe_tag(example, taxonomy: Taxonomy, model: str = "gpt-4o"):
    """Tag one dataset example, retrying with exponential backoff on RateLimitError.

    Relies on `dblp_format` and `tag_n_level` being available in the notebook
    namespace.

    Args:
        example: Dataset row; its 'id' is read here and the row is formatted
            with `dblp_format`.
        taxonomy: Taxonomy passed through to `tag_n_level`.
        model: Model name passed through to `tag_n_level`. Defaults to
            "gpt-4o", the value that used to be hard-coded.

    Returns:
        Dict with the example's 'id', the original 'example', and the tagger's
        'response'.
    """
    fmt = dblp_format(example)
    # Named example_id (not id) so the builtin id() is not shadowed.
    example_id = example["id"]
    response = await tag_n_level(fmt, taxonomy, model=model)
    return {"id": example_id, "example": example, "response": response}

async def get_tags(dataset, samples, taxonomy: Taxonomy, partition="train"):
    """Tag selected rows of one dataset partition concurrently.

    Args:
        dataset: Mapping from partition name to an indexable collection of examples.
        samples: Either an int N (rows 0..N-1 are tagged) or an iterable of row indices.
        taxonomy: Taxonomy forwarded to `safe_tag`.
        partition: Key of the partition to read from `dataset`.

    Returns:
        List of `safe_tag` results, in the order of the selected indices.
    """
    if isinstance(samples, int):
        indices = range(samples)
    else:
        indices = samples

    async def throttled_tag(row):
        # Take a concurrency slot and rate-limiter capacity before tagging.
        async with sem, limiter:
            return await safe_tag(row, taxonomy)

    pending = [
        asyncio.create_task(throttled_tag(dataset[partition][i]))
        for i in indices
    ]
    return await asyncio.gather(*pending)


In [89]:
# Tag the first two train rows (range(2)) and the first test row (range(1)).
train_tags = await get_tags(dataset, range(2), taxonomy, partition="train")
test_tags = await get_tags(dataset, range(1), taxonomy, partition = "test")

In [93]:
import os

tagged_files_train = './tagged/train.json'
tagged_files_test = './tagged/test.json'

# Persist the raw tagging results. The output directory is created first so the
# cell also works when ./tagged does not exist yet; open(..., 'w') would
# otherwise raise FileNotFoundError.
for tagged_path, tags in (
    (tagged_files_train, train_tags),
    (tagged_files_test, test_tags),
):
    os.makedirs(os.path.dirname(tagged_path), exist_ok=True)
    with open(tagged_path, 'w') as f:
        json.dump(tags, f)





In [96]:
import json

def normalize_soft_targets(candidates):
    """Normalize candidate confidences so that they sum to 1.

    Args:
        candidates: List of dicts, each with 'label' and 'confidence' and
            optionally 'rationale'.

    Returns:
        A list with one dict per candidate, in input order, holding 'label',
        the normalized 'confidence', and 'rationale' (None when absent).
        When the confidences sum to zero, every candidate gets the same
        confidence 1 / len(candidates). An empty or missing input yields an
        empty list. The return shape is the same in every branch.
    """
    if not candidates:
        return []

    total = sum(c['confidence'] for c in candidates)
    uniform = 1.0 / len(candidates)
    return [
        {
            "label": c['label'],
            # A zero total cannot be divided by; fall back to a uniform distribution.
            "confidence": c['confidence'] / total if total != 0 else uniform,
            "rationale": c.get('rationale'),
        }
        for c in candidates
    ]


def build_input(title, abstract, parent_path):
    """
    Assemble the model input text: a TITLE line, an ABSTRACT line and, when
    parent_path is non-empty, a PREVIOUS_LABELS line with the path joined by ' > '.
    """
    text = f"TITLE: {title}\nABSTRACT: {abstract}"
    if not parent_path:
        return text
    return text + "\n" + f"PREVIOUS_LABELS: {' > '.join(parent_path)}"


def flatten_example(record):
    """
    Turn one nested tagging record into a list of per-level rows.

    `record` carries the source paper under 'example' and the nested tagger
    output under 'response'. Starting at the top-level response and following
    'children' downwards, one row is emitted per level with the keys
    paper_id, level, input, label, rationale, soft_targets and parent_path.
    The walk stops at the first level whose 'children' is missing or empty,
    is not a dict, or has no 'prediction'.
    """
    paper = record['example']
    node = record['response']
    title = paper.get('title', '')
    abstract = paper.get('abstract', '')

    rows = []
    path = []   # labels predicted at the levels above the current one
    depth = 1
    while True:
        predicted = node.get('prediction')
        options = node.get('candidates', [])
        why = node.get('rationale')
        targets = normalize_soft_targets(options)
        model_input = build_input(title, abstract, path)
        rows.append({
            'paper_id': paper.get('id'),
            'level': depth,
            'input': model_input,
            'label': predicted,
            'rationale': why,
            'soft_targets': targets,
            'parent_path': path.copy()
        })

        child = node.get('children')
        descend = bool(child) and isinstance(child, dict) and child.get('prediction') is not None
        if not descend:
            return rows

        # Move one level down; the label just emitted becomes part of the path.
        node = child
        path = path + [predicted]
        depth += 1


def flatten_records(data):
    """
    Flatten every nested record in `data` (e.g. the result of json.load) and
    concatenate the per-level rows into one list, preserving record order.
    """
    return [row for rec in data for row in flatten_example(rec)]

# Example usage in a notebook:
#
# import json
#
# with open('nested.json') as f:
#     data = json.load(f)
#
# flat = flatten_records(data)
#
# # write to JSONL
# with open('nested_flattened.jsonl', 'w') as out:
#     for rec in flat:
#         out.write(json.dumps(rec) + '\n')


In [97]:
import json

def flatten_examples(record_file):
    """Flatten a nested tagging-results JSON file and write the result next to it.

    Reads `record_file`, flattens its records with `flatten_records`, and
    writes them to a file whose name is `record_file` with a trailing '.json'
    replaced by '_flat.json' (e.g. './tagged/train.json' ->
    './tagged/train_flat.json'). Both paths are printed.

    Args:
        record_file: Path to a JSON file containing a list of nested records.
    """
    print("Record file:", record_file)
    # Use a context manager so the input handle is closed deterministically.
    with open(record_file) as f:
        records = json.load(f)

    # Strip only a trailing '.json'; splitting on the first '.json' occurrence
    # would truncate paths that contain '.json' earlier (e.g. in a directory name).
    suffix = '.json'
    stem = record_file[:-len(suffix)] if record_file.endswith(suffix) else record_file
    output_file_name = stem + '_flat.json'
    print(output_file_name)

    flattened = flatten_records(records)
    with open(output_file_name, 'w') as f:
        json.dump(flattened, f)

# Flatten both tagged splits; each call writes a *_flat.json file next to its input.
flatten_examples(tagged_files_train)
flatten_examples(tagged_files_test)


Record file: ./tagged/train.json
./tagged/train_flat.json
Record file: ./tagged/test.json
./tagged/test_flat.json


In [99]:
from src.tagging.evaluation import calculate_top_k_accuracy

# Paths and loaded data live under different names so that each name refers
# to one kind of thing (previously train_flat/test_flat were first a path
# string and then the parsed JSON).
train_flat_path = './tagged/train_flat.json'
with open(train_flat_path, 'r') as f:
    train_flat = json.load(f)

test_flat_path = './tagged/test_flat.json'
with open(test_flat_path, 'r') as f:
    test_flat = json.load(f)

# NOTE(review): this compares rows tagged from the train split with rows tagged
# from the test split, i.e. different papers -- confirm this pairing is intended.
calculate_top_k_accuracy(train_flat, test_flat)


0.0

In [112]:
# student model tagging
import asyncio
from aiolimiter import AsyncLimiter
from openai import RateLimitError
from backoff import on_exception, expo
from src.taxonomy import Taxonomy
from src.tagging.n_level_tagging_simple import  tag_n_level as simple_tagger

# Throttle settings for the simple-tagger workers.
# NOTE: this rebinds the notebook-level `sem` / `limiter` names that `get_tags` also reads.
MAX_CONCURRENT = 5  # cap on workers inside the semaphore at the same time
sem = asyncio.Semaphore(MAX_CONCURRENT)
limiter = AsyncLimiter(60, 60)  # 60 calls per 60 seconds

@on_exception(expo, RateLimitError, max_time=60)
async def safe_tag_simple(example, taxonomy: Taxonomy, model: str = "gpt-4o"):
    """Tag one example with the simple tagger, retrying with exponential backoff on RateLimitError.

    Relies on `dblp_format` and `simple_tagger` being available in the
    notebook namespace.

    Args:
        example: Dataset row; its 'id' is read here and the row is formatted
            with `dblp_format`.
        taxonomy: Taxonomy passed through to `simple_tagger`.
        model: Model name passed through to `simple_tagger`. Defaults to
            "gpt-4o", the value that used to be hard-coded.

    Returns:
        Dict with the example's 'id', the original 'example', and the tagger's
        'response'.
    """
    fmt = dblp_format(example)
    # Named example_id (not id) so the builtin id() is not shadowed.
    example_id = example["id"]
    response = await simple_tagger(fmt, taxonomy, model=model)
    return {"id": example_id, "example": example, "response": response}

async def get_tags_simple(dataset, samples, taxonomy: Taxonomy, partition="train"):
    """Tag selected rows of one dataset partition concurrently with the simple tagger.

    Args:
        dataset: Mapping from partition name to an indexable collection of examples.
        samples: Either an int N (rows 0..N-1 are tagged) or an iterable of row indices.
        taxonomy: Taxonomy forwarded to `safe_tag_simple`.
        partition: Key of the partition to read from `dataset`.

    Returns:
        List of `safe_tag_simple` results, in the order of the selected indices.
    """
    if isinstance(samples, int):
        indices = range(samples)
    else:
        indices = samples

    async def throttled_tag(row):
        # Take a concurrency slot and rate-limiter capacity before tagging.
        async with sem, limiter:
            return await safe_tag_simple(row, taxonomy)

    pending = [
        asyncio.create_task(throttled_tag(dataset[partition][i]))
        for i in indices
    ]
    return await asyncio.gather(*pending)


In [113]:
# Tag the same rows as before (first two train rows, first test row) with the simple tagger.
train_tags_simple = await get_tags_simple(dataset, range(2), taxonomy, partition="train")
test_tags_simple = await get_tags_simple(dataset, range(1), taxonomy, partition = "test")

In [114]:
import os

tagged_files_train_simple = './tagged/train_simple.json'
tagged_files_test_simple = './tagged/test_simple.json'

# Persist the simple-tagger results. The output directory is created first so
# the cell also works when ./tagged does not exist yet; open(..., 'w') would
# otherwise raise FileNotFoundError.
for tagged_path, tags in (
    (tagged_files_train_simple, train_tags_simple),
    (tagged_files_test_simple, test_tags_simple),
):
    os.makedirs(os.path.dirname(tagged_path), exist_ok=True)
    with open(tagged_path, 'w') as f:
        json.dump(tags, f)



In [115]:
# Flatten the simple-tagger outputs; each call writes a *_flat.json file next to its input.
flatten_examples(tagged_files_train_simple)
flatten_examples(tagged_files_test_simple)

Record file: ./tagged/train_simple.json
./tagged/train_simple_flat.json
Record file: ./tagged/test_simple.json
./tagged/test_simple_flat.json


In [122]:
# Reload the flattened train-split rows produced by both taggers.
train_flat_file = './tagged/train_flat.json'
train_simple_flat_file = './tagged/train_simple_flat.json'

with open(train_flat_file) as flat_fh:
    train_flat = json.load(flat_fh)

with open(train_simple_flat_file) as simple_fh:
    train_simple_flat = json.load(simple_fh)




0.0

In [126]:

from src.tagging.evaluation import calculate_top_k_accuracy
calculate_top_k_accuracy(train_simple_flat, train_flat, k=1)

0.6666666666666666