## Read Data and Extract DOI Links

In [1]:
import os

# vLLM V1 does not currently accept logits processor so we need to disable it
# https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html#deprecated-features
os.environ["VLLM_USE_V1"] = "0"

import re
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
import pickle
import vllm
import torch

# Step 1: Read all PDFs and convert to text
# During a competition rerun the test PDFs are read; otherwise the training
# PDFs are used so the notebook can be scored against train_labels.csv.
pdf_directory = "/kaggle/input/make-data-count-finding-data-references/test/PDF" \
                if os.getenv('KAGGLE_IS_COMPETITION_RERUN') \
                else "/kaggle/input/make-data-count-finding-data-references/train/PDF"
chunks = []  # (article_id, context snippet) pairs, one per DOI-prefix hit
text_span_len = 100  # characters of context kept on each side of a match start
re_doi = re.compile(r"10\.\d{4}")  # DOI prefix: "10." followed by four digits

# List the directory once; tqdm infers the total from the list length.
pdf_filenames = os.listdir(pdf_directory)

for filename in tqdm(pdf_filenames):
    if not filename.endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_directory, filename)

    # Extract article_id from filename
    article_id = filename.split(".pdf")[0]

    # Concatenate lowercased page text, stopping at the first page that
    # mentions "references" and keeping only what precedes that word.
    text = ""
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_text = page.get_text().lower()
            if 'references' in page_text:
                text += page_text.split("references")[0]
                break
            text += page_text

    # No flags argument here: a compiled pattern's finditer() takes
    # (string, pos, endpos), so passing re.IGNORECASE would be read as a start
    # offset and skip the beginning of the text. The pattern contains no
    # letters, so case-insensitivity is not needed anyway.
    for match in re_doi.finditer(text):
        # Skip prefixes that appear in the article's own id (its own DOI).
        if match.group() in article_id:
            continue
        window_start = max(0, match.start() - text_span_len)
        window_end = match.start() + text_span_len
        chunks.append((article_id, text[window_start:window_end]))


len(chunks)

INFO 06-17 13:28:32 [__init__.py:244] Automatically detected platform cuda.


2025-06-17 13:28:35.633154: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750166916.003128      41 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750166916.112570      41 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


  0%|          | 0/524 [00:00<?, ?it/s]

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: uns

981

## Load LLM

In [2]:
model_path = "/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1"

# Engine settings collected in one mapping so they can be read (and tuned)
# separately from the constructor call.
engine_options = {
    "quantization": 'awq',
    # Use every GPU visible to this process for tensor parallelism.
    "tensor_parallel_size": torch.cuda.device_count(),
    "gpu_memory_utilization": 0.9,
    "trust_remote_code": True,
    "dtype": "half",
    "enforce_eager": True,
    "max_model_len": 2048,
    "disable_log_stats": True,
    "enable_prefix_caching": True,
}

llm = vllm.LLM(model_path, **engine_options)

# The tokenizer is needed later to render chat templates into prompt strings.
tokenizer = llm.get_tokenizer()

INFO 06-17 13:30:49 [config.py:823] This model supports multiple tasks: {'score', 'reward', 'classify', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 06-17 13:30:51 [config.py:1946] Defaulting to use mp for distributed inference
INFO 06-17 13:30:51 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.1) with config: model='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', speculative_config=None, tokenizer='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='xgrammar', disable_fallback=False, disable_any_whitespace=False, disable_additional_prope

2025-06-17 13:30:58.980735: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750167059.007461      93 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750167059.015413      93 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[1;36m(VllmWorkerProcess pid=93)[0;0m INFO 06-17 13:31:06 [multiproc_worker_utils.py:226] Worker ready; awaiting tasks
[1;36m(VllmWorkerProcess pid=93)[0;0m INFO 06-17 13:31:07 [cuda.py:275] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
[1;36m(VllmWorkerProcess pid=93)[0;0m INFO 06-17 13:31:07 [cuda.py:324] Using XFormers backend.


[W617 13:31:18.529540426 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W617 13:31:18.934192165 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W617 13:31:28.540485811 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


INFO 06-17 13:31:38 [utils.py:1126] Found nccl from library libnccl.so.2
[1;36m(VllmWorkerProcess pid=93)[0;0m INFO 06-17 13:31:38 [utils.py:1126] Found nccl from library libnccl.so.2
[1;36m(VllmWorkerProcess pid=93)[0;0m INFO 06-17 13:31:38 [pynccl.py:70] vLLM is using nccl==2.26.2
INFO 06-17 13:31:38 [pynccl.py:70] vLLM is using nccl==2.26.2


[W617 13:31:38.551049619 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


INFO 06-17 13:31:39 [custom_all_reduce_utils.py:208] generating GPU P2P access cache in /root/.cache/vllm/gpu_p2p_access_cache_for_0,1.json
INFO 06-17 13:32:14 [custom_all_reduce_utils.py:246] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1.json
[1;36m(VllmWorkerProcess pid=93)[0;0m INFO 06-17 13:32:14 [custom_all_reduce_utils.py:246] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1.json
INFO 06-17 13:32:14 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_47af6ca7'), local_subscribe_addr='ipc:///tmp/0507811b-2cc7-43a5-a180-0d2ae41afdc5', remote_subscribe_addr=None, remote_addr_ipv6=False)
INFO 06-17 13:32:14 [parallel_state.py:1065] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
[1;36m(VllmWorkerProcess pid=93)[0;0m INFO 06-17 13:32:14 [parallel_state.py:1065] rank 1 in world size 2 is assigned as DP rank

Loading safetensors checkpoint shards:   0% Completed | 0/5 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=93)[0;0m INFO 06-17 13:35:46 [default_loader.py:272] Loading weights took 211.67 seconds
INFO 06-17 13:35:46 [default_loader.py:272] Loading weights took 211.90 seconds
[1;36m(VllmWorkerProcess pid=93)[0;0m INFO 06-17 13:35:47 [model_runner.py:1203] Model loading took 9.0935 GiB and 212.018612 seconds
INFO 06-17 13:35:47 [model_runner.py:1203] Model loading took 9.0935 GiB and 212.250264 seconds
[1;36m(VllmWorkerProcess pid=93)[0;0m INFO 06-17 13:36:00 [worker.py:294] Memory profiling takes 12.41 seconds
[1;36m(VllmWorkerProcess pid=93)[0;0m INFO 06-17 13:36:00 [worker.py:294] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.90) = 13.27GiB
[1;36m(VllmWorkerProcess pid=93)[0;0m INFO 06-17 13:36:00 [worker.py:294] model weights take 9.09GiB; non_torch_memory takes 0.11GiB; PyTorch activation peak memory takes 0.44GiB; the rest of the memory reserved for KV Cache is 3.63GiB.
INFO 06-17 13:36:01 [worker.py:29

## Ask LLM to extract DOI links

In [3]:
# System prompt for the extraction pass: find the DOI in the snippet and
# rewrite it as a https://doi.org/... URL.
SYS_PROMPT = """
You are given a piece of academic text. Your task is to identify the single DOI citation string, if present.
Then normalize it into its full URL format: https://doi.org/...
"""

# One prompt per snippet, in the same order as `chunks`.
prompts = []
for article_id, academic_text in chunks:
    messages = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": academic_text}
    ]

    # The assistant turn is pre-filled up to "https://doi.org" so the model
    # only has to continue with the remainder of the URL.
    prompt = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    ) + "Here is the normalized URL: https://doi.org"
    prompts.append(prompt)

outputs = llm.generate(
    prompts,
    vllm.SamplingParams(
        seed=0,
        skip_special_tokens=True,
        max_tokens=32,
        temperature=0
    ),
    use_tqdm=True
)
responses = [output.outputs[0].text for output in outputs]

# Rebuild the full URL from the pre-filled prefix plus the first line of the
# continuation. Surrounding whitespace is stripped because the URL is later
# compared by exact string match and stray spaces would break that match.
doi_urls = [
    "https://doi.org" + response.split("\n")[0].strip()
    for response in responses
]

Adding requests:   0%|          | 0/981 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/981 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



## Ask LLM to classify DOI links
Use the `MultipleChoiceLogitsProcessor` from logits-processor-zoo to force the LLM to choose one of the classes.

In [4]:
# System prompt for the classification pass: label the DOI in the snippet as
# A (Primary), B (Secondary) or C (None).
SYS_PROMPT = """
You are given a piece of academic text. Your task is to identify the single DOI citation string, if present.
Classify the data associated with that DOI as:
A)Primary: if the data was generated specifically for this study.
B)Secondary: if the data was reused or derived from prior work.
C)None: if the DOI is part of the References section of a paper, does not refer to research data or is unrelated.

Respond with one of A, B or C.
"""

# One prompt per snippet, in the same order as `chunks`.
prompts = []
for article_id, academic_text in chunks:
    messages = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": academic_text}
    ]

    prompt = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    prompts.append(prompt)

mclp = MultipleChoiceLogitsProcessor(tokenizer,
                                     choices=["A", "B", "C"])


# A single token is generated per prompt; the decision is taken from the
# returned log-probabilities rather than from the sampled token itself.
outputs = llm.generate(
    prompts,
    vllm.SamplingParams(
        seed=0,
        skip_special_tokens=True,
        max_tokens=1,
        logits_processors=[mclp],
        logprobs=len(mclp.choices)

    ),
    use_tqdm=True
)


# For each prompt, map decoded token -> logprob for the first generated position.
logprobs = []
for output in outputs:
    first_token_logprobs = output.outputs[0].logprobs[0]
    logprobs.append({lp.decoded_token: lp.logprob for lp in first_token_logprobs.values()})

# reindex() guarantees all three columns exist (plain column selection raises
# KeyError when a letter never shows up, including when there are no prompts).
# Missing entries become -inf so np.argmax can never select a NaN slot.
logit_matrix = (
    pd.DataFrame(logprobs)
    .reindex(columns=["A", "B", "C"])
    .astype(float)
    .fillna(-np.inf)
    .to_numpy()
)

# Column order A/B/C lines up with these labels; None rows are dropped later.
choices = ["Primary", "Secondary", None]
answers = []
for row in logit_matrix:
    pick = int(np.argmax(row))
    # If none of A/B/C was returned for this prompt there is nothing to
    # compare, so treat the snippet as unclassified instead of defaulting to A.
    answers.append(choices[pick] if np.isfinite(row[pick]) else None)

Adding requests:   0%|          | 0/981 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/981 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

## Prepare Submission

In [5]:
# Build one row per snippet, then clean it up in a single chain:
#   - lowercase the DOI URL,
#   - drop rows whose predicted type is None,
#   - sort descending and keep the first row per (article_id, dataset_id).
sub_df = (
    pd.DataFrame({
        "article_id": [article_id for article_id, _ in chunks],
        "dataset_id": doi_urls,
        "type": answers,
    })
    .assign(dataset_id=lambda frame: frame["dataset_id"].str.lower())
    .dropna(subset=["type"])
    .sort_values(by=["article_id", "dataset_id", "type"], ascending=False)
    .drop_duplicates(subset=['article_id', 'dataset_id'], keep="first")
    .reset_index(drop=True)
)

# Sequential row identifier, written as the first column of the CSV.
sub_df['row_id'] = range(len(sub_df))

submission_columns = ["row_id", "article_id", "dataset_id", "type"]
sub_df.to_csv("submission.csv", index=False, columns=submission_columns)

# Displayed as the cell output: how many rows of each type were kept.
sub_df["type"].value_counts()

type
Primary      196
Secondary     29
Name: count, dtype: int64

## Evaluate validation score

In [6]:
def f1_score(tp, fp, fn):
    """Return the F1 score computed from confusion counts.

    Args:
        tp: number of true positives.
        fp: number of false positives.
        fn: number of false negatives.

    Returns:
        2*tp / (2*tp + fp + fn), or 0.0 when that denominator is zero.
    """
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        # Nothing predicted and nothing expected: define the score as 0.0.
        return 0.0
    return 2 * tp / denominator
    
    
# Local validation only: skipped when KAGGLE_IS_COMPETITION_RERUN is set.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    pred_df = pd.read_csv("submission.csv")

    # Labels marked 'Missing' are excluded from scoring.
    label_df = pd.read_csv("/kaggle/input/make-data-count-finding-data-references/train_labels.csv")
    label_df = label_df.loc[label_df['type'].ne('Missing')].reset_index(drop=True)

    # A hit is a prediction that matches a label on article, dataset and type.
    hits_df = pd.merge(label_df, pred_df, on=["article_id", "dataset_id", "type"])

    tp = len(hits_df)
    fp = len(pred_df) - tp   # predictions with no matching label
    fn = len(label_df) - tp  # labels with no matching prediction

    print("TP:", tp)
    print("FP:", fp)
    print("FN:", fn)
    print("F1 Score:", round(f1_score(tp, fp, fn), 3))

TP: 97
FP: 128
FN: 622
F1 Score: 0.206
