## add
- post_validate.py : remove false positives (FP)
- predict.py : use an LLM to predict primary or secondary

In [1]:
! uv pip uninstall --system 'tensorflow'
! uv pip install --system --no-index --find-links='/kaggle/input/latest-mdc-whls/whls' 'pymupdf' 'vllm' 'triton' 'logits-processor-zoo' 'numpy<2'
! mkdir -p /tmp/src

[2mUsing Python 3.11.13 environment at: /usr[0m
[2mUninstalled [1m1 package[0m [2min 2.53s[0m[0m
 [31m-[39m [1mtensorflow[0m[2m==2.18.0[0m
[2mUsing Python 3.11.13 environment at: /usr[0m
[2K[2mResolved [1m157 packages[0m [2min 233ms[0m[0m
[2K[2mPrepared [1m52 packages[0m [2min 24.14s[0m[0m
[2mUninstalled [1m14 packages[0m [2min 149ms[0m[0m
[2K[2mInstalled [1m52 packages[0m [2min 128ms[0m[0m
 [32m+[39m [1mairportsdata[0m[2m==20250622[0m
 [32m+[39m [1mastor[0m[2m==0.8.1[0m
 [32m+[39m [1mblake3[0m[2m==1.0.5[0m
 [32m+[39m [1mcompressed-tensors[0m[2m==0.9.3[0m
 [32m+[39m [1mdepyf[0m[2m==0.18.0[0m
 [32m+[39m [1mdiskcache[0m[2m==5.6.3[0m
 [32m+[39m [1mfastapi-cli[0m[2m==0.0.7[0m
 [32m+[39m [1mgguf[0m[2m==0.17.1[0m
 [32m+[39m [1mhttptools[0m[2m==0.6.4[0m
 [31m-[39m [1mimportlib-metadata[0m[2m==8.7.0[0m
 [32m+[39m [1mimportlib-metadata[0m[2m==8.0.0[0m
 [32m+[39m [1m

In [2]:
%%writefile /tmp/src/helpers.py
import logging, os, kagglehub, inspect
from pathlib import Path
import polars as pl

# True when any environment variable name contains 'KAGGLE'.
IS_KAGGLE_ENV = sum(['KAGGLE' in k for k in os.environ]) > 0
# True when KAGGLE_IS_COMPETITION_RERUN is set to a non-empty value.
IS_KAGGLE_SUBMISSION = bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))
# Competition data root: the fixed Kaggle input path on a submission rerun,
# otherwise whatever kagglehub.competition_download() returns.
COMP_DIR = Path(('/kaggle/input/make-data-count-finding-data-references' if IS_KAGGLE_SUBMISSION else kagglehub.competition_download('make-data-count-finding-data-references')))
# PDFs come from the 'test' split on a submission rerun and from 'train' otherwise.
PDF_DIR = COMP_DIR / ('test' if IS_KAGGLE_SUBMISSION else 'train') / 'PDF'
WORKING_DIR = Path(('/kaggle/working/' if IS_KAGGLE_ENV else '.working/'))

# URL prefix used to recognise and build DOI-style dataset ids.
DOI_LINK = 'https://doi.org/'

# Log level: forced to WARNING on a submission rerun, otherwise taken from
# LOG_LEVEL (default DEBUG).
DEFAULT_LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG").upper() if not IS_KAGGLE_SUBMISSION else "WARNING"
LOG_FILE_PATH = os.getenv("LOG_FILE", "logs/project.log")
LOG_DIR = Path(LOG_FILE_PATH).parent

# Import-time side effect: the log directory is created as soon as this module loads.
LOG_DIR.mkdir(parents=True, exist_ok=True)

LOG_FORMAT = "%(levelname)s %(asctime)s  [%(filename)s:%(lineno)d - %(funcName)s()] %(message)s"
LOG_DATEFMT = "%Y-%m-%d %H:%M:%S"

def get_logger(name=None):
    """Return a logger that writes to the console and to LOG_FILE_PATH.

    When ``name`` is omitted, the ``__name__`` of the calling module is used,
    falling back to ``"__main__"`` if the caller's frame is unavailable.
    Handlers are attached only the first time a given logger is requested.
    """
    if name is None:
        caller = getattr(inspect.currentframe(), "f_back", None)
        if caller is None:
            name = "__main__"
        else:
            name = caller.f_globals.get("__name__", "__main__")

    logger = logging.getLogger(name)
    if logger.handlers:
        # Already configured by an earlier call; do not add duplicate handlers.
        return logger

    logger.setLevel(DEFAULT_LOG_LEVEL)
    formatter = logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_DATEFMT)
    for handler in (logging.StreamHandler(), logging.FileHandler(LOG_FILE_PATH)):
        handler.setLevel(DEFAULT_LOG_LEVEL)
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    # Keep records from also being emitted by ancestor loggers.
    logger.propagate = False
    return logger

def is_doi_link(name: str) -> pl.Expr:
    """Build a boolean expression for column ``name``.

    True where the value starts with DOI_LINK and does not match the
    regex ``/dl\\.``.
    """
    column = pl.col(name)
    has_doi_prefix = column.str.starts_with(DOI_LINK)
    has_dl_segment = column.str.contains(r"/dl\.")
    return has_doi_prefix & ~has_dl_segment

def string_normalization(name: str) -> pl.Expr:
    """Build the text clean-up expression for column ``name``.

    Steps, in order: NFKC-normalize, drop every non-ASCII character, then
    rewrite zenodo record URLs into a space-padded ``10.5281/zenodo.<id>`` DOI.
    """
    nfkc = pl.col(name).str.normalize("NFKC")
    ascii_only = nfkc.str.replace_all(r"[^\p{Ascii}]", '')
    return ascii_only.str.replace_all(
        r"https?://zenodo\.org/record/(\d+)", r" 10.5281/zenodo.$1 "
    )

def get_df(parse_dir: str):
    """Load every ``*.txt`` file in ``parse_dir`` into a normalized DataFrame.

    Args:
        parse_dir: Directory containing one text file per article; the file
            stem becomes ``article_id``.

    Returns:
        A DataFrame with string columns ``article_id`` and ``text``, where
        ``text`` has been passed through ``string_normalization``.
    """
    # Read explicitly as UTF-8: parse.py writes these files with
    # encoding='utf-8', and the locale default is not guaranteed to match.
    records = [
        {'article_id': txt_file.stem, 'text': txt_file.read_text(encoding='utf-8')}
        for txt_file in Path(parse_dir).glob('*.txt')
    ]
    # An explicit schema keeps both columns present when no files are found,
    # so the normalization step below does not fail on a missing 'text' column.
    frame = pl.DataFrame(records, schema={'article_id': pl.String, 'text': pl.String})
    return frame.with_columns(string_normalization('text').alias('text'))

def assume_type(df: pl.DataFrame) -> pl.DataFrame:
    """Add a ``type`` column derived from ``dataset_id``.

    Rows whose ``dataset_id`` satisfies ``is_doi_link`` or starts with
    ``SAMN`` are labelled ``Primary``; every other row is ``Secondary``.
    """
    looks_primary = is_doi_link('dataset_id') | pl.col('dataset_id').str.starts_with('SAMN')
    label = pl.when(looks_primary).then(pl.lit('Primary')).otherwise(pl.lit('Secondary'))
    return df.with_columns(label.alias('type'))

def score(df, gt, on, tag='all'):
    """Format an F1 summary of predictions ``df`` against ground truth ``gt``.

    True positives are the rows of ``gt.join(df, on=on)``; false positives and
    false negatives are the remaining rows of ``df`` and ``gt`` respectively.
    Returns ``"<tag> - f1: <f1> [tp/fp/fn]"``; F1 is 0.0 when all counts are 0.
    """
    tp = gt.join(df, on=on).height
    fp, fn = df.height - tp, gt.height - tp
    denominator = 2 * tp + fp + fn
    if denominator != 0:
        f1 = 2 * tp / denominator
    else:
        f1 = 0.0
    return f"{tag} - f1: {f1:.4f} [{tp}/{fp}/{fn}]"

def evaluate(df, on=['article_id', 'dataset_id']):
    """Score ``df`` against train_labels.csv (rows of type 'Missing' excluded).

    Returns a 3-tuple of ``score`` strings: all rows, rows whose
    ``dataset_id`` satisfies ``is_doi_link`` ('doi'), and the rest ('acc').
    """
    labels = pl.read_csv(COMP_DIR/'train_labels.csv')
    gt = labels.filter(pl.col('type')!='Missing')
    doi_mask = is_doi_link('dataset_id')

    overall = score(df, gt, on)
    doi_only = score(df.filter(doi_mask), gt.filter(doi_mask), on, 'doi')
    acc_only = score(df.filter(~doi_mask), gt.filter(~doi_mask), on, 'acc')
    return (overall, doi_only, acc_only)

Writing /tmp/src/helpers.py


In [3]:
%%writefile /tmp/src/parse.py
import argparse
from pathlib import Path
import pymupdf
from helpers import get_logger, PDF_DIR

# Module-level logger; get_logger() is called without an explicit name here.
l = get_logger()

def pdf_to_txt(output_dir: Path):
    """Extract the text of every PDF in PDF_DIR into ``output_dir``.

    One ``<stem>.txt`` file (UTF-8) is written per PDF. PDFs whose text file
    already exists in ``output_dir`` are skipped, so the function can be
    re-run to resume an interrupted conversion.

    Args:
        output_dir: Directory for the text files; created if missing.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    pdf_files = list(PDF_DIR.glob("*.pdf")) + list(PDF_DIR.glob("*.PDF"))
    existing_txt_files = {f.stem for f in output_dir.glob("*.txt")}
    for pdf_file in pdf_files:
        if pdf_file.stem in existing_txt_files:
            continue
        txt_file = output_dir / f"{pdf_file.stem}.txt"
        try:
            with pymupdf.open(pdf_file) as doc:
                # join() avoids rebuilding the string once per page.
                text = "".join(page.get_text() for page in doc)
            txt_file.write_text(text, encoding='utf-8')
        except Exception:
            # Best effort: one unreadable PDF must not abort the batch, but the
            # failure is recorded instead of being silently dropped.
            l.exception("failed to convert %s", pdf_file)
        else:
            # Avoid converting the same stem twice if both glob patterns
            # matched the same file.
            existing_txt_files.add(pdf_file.stem)

def main():
    """CLI entry point: read the ``output_dir`` argument and convert the PDFs."""
    cli = argparse.ArgumentParser()
    cli.add_argument('output_dir', type=Path, help='Directory to save text files')
    pdf_to_txt(cli.parse_args().output_dir)


if __name__ == "__main__":
    main()

Writing /tmp/src/parse.py


In [4]:
%%writefile /tmp/src/check_parse.py
import polars as pl
from pathlib import Path
from helpers import *

# Module-level logger used by main() below.
l=get_logger()

def gt_dataset_id_normalization(name:str) -> pl.Expr:
    """Build a lower-cased comparison key for the id column ``name``.

    Values satisfying ``is_doi_link`` are reduced to the part after DOI_LINK;
    all other values are kept whole. The result is lower-cased in both cases.
    """
    column = pl.col(name)
    doi_tail = column.str.split(DOI_LINK).list.last()
    key = pl.when(is_doi_link(name)).then(doi_tail).otherwise(column)
    return key.str.to_lowercase()

def main():
    """Log how many labelled dataset ids cannot be found in the parsed text.

    Skipped entirely on a submission rerun. Otherwise the parsed training text
    is stripped of all whitespace and lower-cased, each ground-truth id is
    normalized the same way, and ids that do not occur literally in their
    article's text are counted.
    """
    if IS_KAGGLE_SUBMISSION:
        l.debug('skipping check_parse for submission')
        return
    df = (
        get_df('/tmp/train_parse')
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        .with_columns(pl.col('text').str.replace_all(r'\s+', '').str.to_lowercase().alias('text'))
    )

    gt = (
        pl.read_csv(COMP_DIR/'train_labels.csv')
        .filter(pl.col('article_id').is_in(df['article_id']))
        .filter(pl.col('type')!='Missing')
        .with_columns(gt_dataset_id_normalization('dataset_id').alias('norm_id'))
    )

    misses = (
        gt.join(df, on='article_id')
        .with_columns(hit=pl.col('text').str.contains(pl.col('norm_id'), literal=True))
        .filter(~pl.col('hit'))
        .height
    )
    l.info(f"pymupdf misses: {misses} dataset_ids")

if __name__=='__main__': main()

Writing /tmp/src/check_parse.py


In [5]:
%%writefile /tmp/src/getid.py
import re
import polars as pl
from typing import Optional, Tuple

from helpers import *
# --- Constants & Regex ---

# URL prefix prepended to normalized DOIs. Re-declared after `from helpers import *`;
# helpers defines DOI_LINK with the same value.
DOI_LINK = "https://doi.org/"

# DOI per Crossref style; tolerate whitespace & trailing punctuation
DOI_RE = r"10\s*\.\s*\d{4,9}\s*/\s*\S+"
DOI_CLEAN_TRAIL = r"[^A-Za-z0-9/:._-]+$"   # trim trailing punctuation
# Bare repository prefixes; tidy_extraction drops ids that equal DOI_LINK + one of these.
DOI_BAD_PREFIXES = ("10.5061/dryad", "10.5281/zenodo", "10.6073/pasta")

# Reference headers (exclude Acknowledgments as it often precedes refs)
REF_HEADER_TERMS = [
    r"REFERENCES", r"REFERENCE", r"BIBLIOGRAPHY",
    r"LITERATURE\s+CITED", r"WORKS\s+CITED", r"CITED\s+WORKS",
    r"REFERENCES\s+AND\s+NOTES", r"NOTES\s+AND\s+REFERENCES"
]
# Pre-compiled patterns:
#   ref_header_patterns     - one case-insensitive, word-bounded pattern per header term
#   citation_pattern        - a leading numbered-citation marker such as "[12]", "(12)", "12." or "12)"
#   first_citation_patterns - the same marker shapes, restricted to the number 1
#   doi / doi_trail         - compiled forms of DOI_RE and DOI_CLEAN_TRAIL
COMPILED_PATTERNS = {
    "ref_header_patterns": [re.compile(rf"\b({t})\b[:\s]*", re.I) for t in REF_HEADER_TERMS],
    "citation_pattern": re.compile(r"^\s*(\[\d+\]|\(\d+\)|\d+\.\s|\d+\)\s|\d+(?=\s|$))\s*"),
    "first_citation_patterns": [
        re.compile(r"^\s*\[1\]\s*"), re.compile(r"^\s*\(1\)\s*"),
        re.compile(r"^\s*1\.\s*"),   re.compile(r"^\s*1\)\s*"),
        re.compile(r"^\s*1(?=\s|$)")
    ],
    "doi": re.compile(DOI_RE, re.I),
    "doi_trail": re.compile(DOI_CLEAN_TRAIL),
}
def find_last_reference_header(text: str, header_patterns: list[re.Pattern]) -> Optional[int]:
    """Return the start offset of the last header match in ``text``.

    Args:
        text: Document text to scan.
        header_patterns: Compiled patterns; every match of every pattern is
            considered.

    Returns:
        The largest ``match.start()`` across all patterns, or ``None`` when
        nothing matches (including when ``header_patterns`` is empty).
    """
    starts = (m.start() for pat in header_patterns for m in pat.finditer(text))
    return max(starts, default=None)
def _extract_normalize_doi(df: pl.DataFrame, col: str, article_col: str = "article_id") -> pl.DataFrame:
    """Extract DOI-like strings from column ``col`` and normalize them.

    Each DOI_RE match has its whitespace removed, trailing punctuation
    stripped (DOI_CLEAN_TRAIL) and is lower-cased, then prefixed with DOI_LINK.

    Args:
        df: Frame holding ``article_col`` and the text column ``col``.
        col: Name of the text column to scan.
        article_col: Name of the article id column to group by.

    Returns:
        One row per (``article_col``, ``dataset_id``) with a ``match`` list
        column holding the raw matched strings that produced that id.
    """
    return (
        df.with_columns(
            pl.col(col)
              # extract_all always treats its pattern as a regex and accepts no
              # `literal` keyword, so only the pattern is passed.
              .str.extract_all(DOI_RE)
              .alias("match")
        )
        .explode("match")
        .drop_nulls("match")
        .with_columns(
            pl.col("match")
              .str.replace_all(r"\s+", "")
              .str.replace(DOI_CLEAN_TRAIL, "")   # strip trailing punctuation
              .str.to_lowercase()
              .alias("dataset_id")
        )
        .group_by(article_col, "dataset_id")
        .agg(pl.col("match"))
        .with_columns((DOI_LINK + pl.col("dataset_id")).alias("dataset_id"))
    )
def doi_gbif_ids(df: pl.DataFrame) -> pl.DataFrame:
    """Find DOIs that appear shortly after "GBIF Occurrence" in the ``ref`` column.

    A window of up to 140 characters following the key phrase is taken from
    each occurrence, DOI_RE matches inside those windows are normalized the
    same way as other DOIs, and the result is grouped per
    (``article_id``, ``dataset_id``) with the raw matches in ``match``.
    """
    gbif_re = r"GBIF\s+Occurrence.{0,140}"  # tune span if needed

    # One row per window of text following the key phrase.
    windows = (
        df.select("article_id", "ref")
          .with_columns(pl.col("ref").str.extract_all(gbif_re).alias("gbif_windows"))
          .explode("gbif_windows")
          .drop_nulls("gbif_windows")
    )

    # One row per DOI-like match inside a window.
    matches = (
        windows.with_columns(pl.col("gbif_windows").str.extract_all(DOI_RE).alias("match"))
               .explode("match")
               .drop_nulls("match")
    )

    normalized = (
        pl.col("match")
          .str.replace_all(r"\s+", "")
          .str.replace(DOI_CLEAN_TRAIL, "")
          .str.to_lowercase()
    )
    return (
        matches.with_columns((DOI_LINK + normalized).alias("dataset_id"))
               .group_by("article_id", "dataset_id")
               .agg(pl.col("match"))
    )
def tidy_extraction(df: pl.DataFrame) -> pl.DataFrame:
    """Collect and de-duplicate candidate dataset ids for every article.

    Reads the ``article_id``, ``body``, ``ref`` and ``text`` columns of ``df``.
    Candidates come from four sources: DOIs in ``body``, DOIs in ``ref``,
    DOIs near "GBIF Occurrence" in ``ref``, and accession-like ids matched by
    REGEX_IDS in ``text``.

    Returns:
        One row per (``article_id``, ``dataset_id``) with a ``match`` list
        column of the unique raw strings that produced the id, after the
        self-reference, figshare and bare-prefix filters below.
    """
    # Full URLs of the bare repository prefixes; ids equal to these are dropped at the end.
    bad_ids = [f"{DOI_LINK}{e}" for e in DOI_BAD_PREFIXES]

    doi_body = _extract_normalize_doi(df, "body")
    doi_ref  = _extract_normalize_doi(df, "ref")
    doi_gbif = doi_gbif_ids(df)

    # Accession-like identifiers, matched case-insensitively on word boundaries.
    REGEX_IDS = (
        r"(?i)\b(?:"
        r"CHEMBL\d+|E-GEOD-\d+|E-PROT-\d+|E-MTAB-\d+|E-MEXP-\d+|EMPIAR-\d+|"
        r"ENSBTAG\d+|ENSOARG\d+|"
        r"EPI_ISL_\d{5,}|EPI\d{6,7}|"
        r"HPA\d+|CP\d{6}|IPR\d{6}|PF\d{5}|BX\d{6}|KX\d{6}|K0\d{4}|CAB\d{6}|"
        r"NC_\d{6}\.\d{1}|NM_\d{9}|"
        r"PRJNA\d+|PRJEB\d+|PRJDB\d+|PXD\d+|SAMN\d+|"
        r"GSE\d+|GSM\d+|GPL\d+|"
        r"PDB\s?[1-9][A-Z0-9]{3}|HMDB\d+|"
        r"dryad\.[^\s\"<>]+|pasta\/[^\s\"<>]+|"
        r"(?:SR[PRX]|STH|ERR|DRR|DRX|DRP|ERP|ERX)\d+|"
        r"CVCL_[A-Z0-9]{4}|"
        r"[1-5]\.(?:10|20|30|40|50|60|70|80|90)\.\d{2,4}\.\d{2,4}"
        r")"
    )

    acc_df = (
        df.with_columns(pl.col("text").str.extract_all(REGEX_IDS).alias("match"))
          .explode("match")
          .drop_nulls("match")
          .with_columns(
              pl.col("match")
                .str.replace_all(r"\s+", "")
                .str.replace(DOI_CLEAN_TRAIL, "")
                .str.replace(r"(?i)^PDB", "")  # drop the leading "PDB" so only the 4-character code remains
                .alias("dataset_id")
          )
          .group_by("article_id", "dataset_id")
          .agg(pl.col("match"))
          # "dryad.xxx" fragments become full Dryad DOI URLs (prefix 10.5061/).
          .with_columns(
              pl.when(pl.col("dataset_id").str.starts_with("dryad."))
                .then(DOI_LINK + "10.5061/" + pl.col("dataset_id"))
                .otherwise(pl.col("dataset_id"))
                .alias("dataset_id")
          )
          # "pasta/xxx" fragments become full DOI URLs (prefix 10.6073/).
          .with_columns(
              pl.when(pl.col("dataset_id").str.starts_with("pasta/"))
                .then(DOI_LINK + "10.6073/" + pl.col("dataset_id"))
                .otherwise(pl.col("dataset_id"))
                .alias("dataset_id")
          )
    )

    out = pl.concat([doi_body, doi_ref, doi_gbif, acc_df], how="vertical")

    # Self-reference filter. Both sides are regex-escaped so they are compared as
    # plain text. A candidate is dropped when the article id (with "_" replaced
    # by "/") contains the candidate's DOI tail, or when the candidate contains
    # that article id. Also dropped: any id containing "figshare", and ids that
    # are exactly one of bad_ids.
    # NOTE(review): the intent appears to be removing the article's own DOI - confirm.
    out = (
        out.unique(["article_id", "dataset_id"])
           .with_columns(
               pl.col("article_id").str.replace("_", "/").alias("article_id_slash"),
               pl.col("dataset_id").str.split(DOI_LINK).list.last().alias("doi_tail"),
           )
           .filter(~pl.col("article_id_slash").str.contains(pl.col("doi_tail").str.escape_regex()))
           .filter(~pl.col("dataset_id").str.contains(pl.col("article_id_slash").str.escape_regex()))
           .filter(~pl.col("dataset_id").str.contains("figshare", literal=True))
           .filter(~pl.col("dataset_id").is_in(bad_ids))
           .with_columns(pl.col("match").list.unique())
           .drop(["article_id_slash", "doi_tail"])
    )
    return out
def get_context_window(text: str, substring: str, window: int = 100) -> str:
    """Return ``substring`` plus up to ``window`` characters on each side.

    The first exact occurrence is used; if there is none, a case-insensitive
    search is tried. When the substring is not found at all, the first
    ``window`` characters of ``text`` are returned instead.
    """
    pos = text.find(substring)
    if pos < 0:
        # Exact search failed: retry ignoring case.
        pos = text.lower().find(substring.lower())
    if pos < 0:
        return text[:window]  # graceful fallback

    lo = max(0, pos - window)
    hi = min(len(text), pos + len(substring) + window)
    return text[lo:hi]

def get_window_df(text_df: pl.DataFrame, ids_df: pl.DataFrame) -> pl.DataFrame:
    """Attach a context window to every extracted id.

    ``ids_df`` is joined to ``text_df`` on ``article_id``; for each row the
    first entry of ``match`` (or "" when the list is empty) is located in
    ``text`` with ``get_context_window``. Returns the columns ``article_id``,
    ``dataset_id`` and ``window``.
    """
    joined = ids_df.join(text_df, on="article_id")

    # pick the first match safely
    has_match = pl.col("match").list.len() > 0
    first_match = (
        pl.when(has_match)
          .then(pl.col("match").list.first())
          .otherwise(pl.lit(""))
    )
    joined = joined.with_columns(first_match.alias("first_match"))

    windows = []
    for text, needle in joined.select("text", "first_match").rows():
        windows.append(get_context_window(text, needle))

    with_windows = joined.with_columns(pl.Series("window", windows))
    return with_windows.select("article_id", "dataset_id", "window")
def write_the_match(text_df: pl.DataFrame, id_df: pl.DataFrame) -> None:
    """Write each id with its first raw match and full article text to parquet.

    ``id_df`` is joined to ``text_df`` on ``article_id`` and the result is
    saved to /tmp/context_data.parquet with the columns ``article_id``,
    ``dataset_id``, ``match`` (first list entry, or "" when empty) and ``text``.
    """
    joined = id_df.join(text_df, on="article_id")
    wanted = joined.select("article_id","dataset_id","match","text")
    records = [
        {
            "article_id": art_id,
            "dataset_id": dataset_id,
            "match": match_ids[0] if match_ids else "",
            "text": text,
        }
        for art_id, dataset_id, match_ids, text in wanted.rows()
    ]
    pl.DataFrame(records).write_parquet("/tmp/context_data.parquet")


Writing /tmp/src/getid.py


In [6]:
%%writefile /tmp/src/llm_validate.py
import polars as pl
import os

from helpers import *

# Module-level logger; used below to report evaluation results.
l = get_logger()

SYS_PROMPT_CLASSIFY_DOI = """
1. Priority Rules (highest → lowest)
1.1 Always classify as A (Data) if:
DOI prefix matches a known data repository:

Dryad: 10.5061

Zenodo: 10.5281

Dl: 10.15468

ICPSR: 10.3886

USGS data: 10.5066

Mendeley Data: 10.17632

Dataverse: 10.7910/DVN

OpenNeuro: 10.18112/openneuro.

PANGAEA: 10.1594/PANGAEA.


2. Classify as B (Literature) if:
DOI prefix belongs to a publisher (e.g., 10.1038, 10.1007, 10.1126, 10.1016, 10.1101, 10.1021, 10.1145, 10.1177, 10.1093, 10.1080, 10.1111, etc.).

Context indicates a journal article, book, conference paper, preprint, protocol, or method paper, without any repository/data storage signal.

Mentions only “supplementary material” or “supplementary information” without a repository.

3. Ambiguous cases
No repository prefix and no clear context → default to B.


4. Output
Only output:

A → data repository / dataset

B → literature / non-data resource

Few-shot examples

“Raw images are stored on Figshare (DOI 10.6084/m9.figshare.1234567).” → A

“Sequence reads available under BioProject accession PRJNA765432.” → A

“As described in Nature Methods (DOI 10.1038/s41592-020-0793-2).” → B

“See Supplementary Data at Zenodo (10.5281/zenodo.987654).” → A

“Method details published in J. Proteome Res. DOI: 10.1021/acs.jproteome.0c00845.” → B

“Data uploaded to Dryad (10.5061/dryad.x1y2z3).” → A

“Referenced paper: DOI 10.1101/2020.01.01.123456 (bioRxiv preprint).” → B

“Metabolomics data in MetaboLights MTBLS1234.” → A

“The MRI scans are deposited at OpenNeuro (DOI 10.18112/openneuro.ds000001.v1.0.0).” → A

“Protein structure described in Science (DOI 10.1126/science.abc1234).” → B
""".strip()

def build_df():
    """Load the extracted ids and split them into DOI and non-DOI rows.

    Non-DOI rows (``article_id``, ``dataset_id``) are written to
    /tmp/accid_sub.csv as a side effect; the DOI rows are returned.
    """
    extracted = pl.read_parquet('/tmp/extracted.parquet')
    doi_mask = is_doi_link('dataset_id')
    accession_rows = extracted.filter(~doi_mask).select('article_id', 'dataset_id')
    accession_rows.write_csv('/tmp/accid_sub.csv')
    return extracted.filter(doi_mask)

def build_prompt(tokenizer, df):
    """Add a ``prompt`` column holding one chat-formatted prompt per row.

    Each prompt pairs SYS_PROMPT_CLASSIFY_DOI (system) with the row's
    ``window`` text (user), rendered through the tokenizer's chat template.
    """
    def render(window_text):
        messages = [{'role':'system','content': SYS_PROMPT_CLASSIFY_DOI}, {'role':'user', 'content': window_text}]
        return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

    prompts = [render(text) for _doi, text in df.select('dataset_id', 'window').rows()]
    return df.with_columns(pl.Series('prompt', prompts))

if __name__=='__main__':
    # Script flow: classify every extracted DOI as data (A) or literature (B)
    # with the LLM, keep only the A rows, re-attach the accession ids, assign
    # Primary/Secondary with assume_type(), and write the submission file.

    # Set before vllm is imported.
    os.environ["VLLM_USE_V1"] = "0"
    import vllm
    from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
    model_path = "/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1"
    llm = vllm.LLM(model_path, quantization='awq', tensor_parallel_size=2, gpu_memory_utilization=0.9, trust_remote_code=True, dtype="half", enforce_eager=True, max_model_len=2048, disable_log_stats=True, disable_custom_all_reduce=True, enable_prefix_caching=True, task='generate')
    tokenizer = llm.get_tokenizer()
    df = build_df()
    df = build_prompt(tokenizer, df)
    prompts = df['prompt'].to_list()
    mclp = MultipleChoiceLogitsProcessor(tokenizer, choices=["A", "B"])
    outputs = llm.generate(prompts, vllm.SamplingParams(seed=777, temperature=0.2, skip_special_tokens=True, max_tokens=1, logits_processors=[mclp], logprobs=len(mclp.choices)), use_tqdm=True)
    # For each prompt, map decoded token -> logprob for the single generated
    # position, then take the most likely token as the answer.
    logprobs = [{lp.decoded_token: lp.logprob for lp in list(lps)} for lps in [output.outputs[0].logprobs[0].values() for output in outputs]]
    choices = [max(d, key=d.get) for d in logprobs]
    types = {'A': True, 'B': False}
    choices = [types[c] for c in choices]
    df = df.with_columns(pl.Series('type', choices))
    # Only DOIs classified as data are kept.
    df.filter(pl.col('type')).select('article_id', 'dataset_id').write_csv('/tmp/doi_sub.csv')
    df = pl.concat([pl.read_csv('/tmp/doi_sub.csv'), pl.read_csv('/tmp/accid_sub.csv')])
    df = assume_type(df)
    df.select(['article_id', 'dataset_id', 'type']).with_row_index(name='row_id').write_csv('/kaggle/working/submission.csv')
    if not IS_KAGGLE_SUBMISSION:
        results = evaluate(df)
        for r in results: l.info(r)
        results = evaluate(df, on=['article_id', 'dataset_id', 'type'])
        for r in results: l.info(r)

    # Drop the model references before clearing the CUDA cache.
    # Only an already-unbound name can make `del` fail.
    try:
        del llm, tokenizer
    except NameError:
        pass

    import gc, torch
    gc.collect()
    torch.cuda.empty_cache()

Writing /tmp/src/llm_validate.py


In [7]:
%%writefile /tmp/src/post_filter.py
import polars as pl
from helpers import *

"""
Fourth essence: Post-filter to cut FP DOIs that look like literature.
- Read /kaggle/working/submission.csv (output of llm_validate.py)
- Join with /tmp/extracted.parquet to get context window
- Drop DOI rows that (1) start with typical publisher prefixes AND (2) have no data-ish words nearby
- Keep accessions untouched
"""

# Module-level logger used by main().
l = get_logger()

# DOI prefixes tested by is_paper_prefix(); main() keeps a matching DOI row
# only when its context window also matches CONTEXT_RE.
# NOTE(review): this list includes 10.5061 and 10.5281, which getid.py maps to
# dryad and zenodo. The name and the description above say "publisher"
# prefixes - confirm whether the list or the filter direction is what was intended.
PAPER_PREFIXES = [
    "10.5061","10.5281","10.17632","10.1594","10.15468","10.17882","10.7937","10.7910","10.6073",
    "10.3886","10.3334","10.4121","10.5066","10.5067","10.18150","10.25377","10.25387","10.23642","10.24381","10.22033"
]

# Case-insensitive, word-bounded "data-ish" vocabulary searched for in the context window.
CONTEXT_RE = r"(?i)\b(data(?:set)?|repository|archive|deposited|available|supplementary|raw(?:\s+data)?|uploaded|hosted|stored|accession)\b"

def is_paper_prefix(col: str = "dataset_id") -> pl.Expr:
    """Build an expression that is true when ``col`` starts with DOI_LINK
    followed by any entry of PAPER_PREFIXES."""
    prefix_checks = [
        pl.col(col).str.starts_with(f"{DOI_LINK}{prefix}")
        for prefix in PAPER_PREFIXES
    ]
    combined = pl.lit(False)
    for check in prefix_checks:
        combined = combined | check
    return combined

def main():
    """Filter DOI rows of the current submission and rewrite it in place.

    A DOI row is kept when it does not match ``is_paper_prefix`` or when its
    context window matches CONTEXT_RE; non-DOI rows are kept unchanged.
    NOTE(review): the prefixes tested by is_paper_prefix include 10.5061 and
    10.5281 - confirm that dropping those without data-like context is intended.
    """
    submission_path = "/kaggle/working/submission.csv"
    sub = pl.read_csv(submission_path)

    # A previous step may have added row_id; remove it so the concat below lines up.
    if "row_id" in sub.columns:
        sub = sub.drop("row_id")

    output_cols = ("article_id", "dataset_id", "type")
    doi_mask = is_doi_link("dataset_id")

    # Context window per (article, id); a DOI row without a window gets null here.
    windows = pl.read_parquet("/tmp/extracted.parquet").select("article_id", "dataset_id", "window")
    doi_rows = sub.filter(doi_mask).join(windows, on=["article_id", "dataset_id"], how="left")
    acc_rows = sub.filter(~doi_mask)

    has_data_context = pl.col("window").fill_null("").str.contains(CONTEXT_RE)
    keep = ~is_paper_prefix("dataset_id") | has_data_context

    kept_doi = doi_rows.filter(keep).select(*output_cols)
    final = pl.concat([kept_doi, acc_rows.select(*output_cols)])

    # Re-evaluate against the training labels when running outside a submission.
    if not IS_KAGGLE_SUBMISSION:
        for line in evaluate(final):
            l.info(line)
        for line in evaluate(final, on=["article_id", "dataset_id", "type"]):
            l.info(line)

    final.with_row_index("row_id").write_csv(submission_path)

if __name__ == "__main__":
    main()

Writing /tmp/src/post_filter.py


In [8]:
%%writefile /tmp/src/post_validate.py

from helpers import *
import polars as pl
import os


# Module-level logger; used below to report evaluation results.
l = get_logger()


PROMPT_CLASSIFY_CITATION_TYPE = '''
# Role & Task
You are an expert data citation analyst. Your task is to classify a given citation from a scientific paper into one of two categories: **A** (Data) or **B** (Not Data). Base your decision strictly on the provided abstract and the context of the citation.

## Instructions
1.  **Read the provided abstract** to understand the research context.
2.  **Analyze the citation context** for key linguistic cues.
3.  **Classify the citation** as either **A** or **B** based on the definitions below.
4.  **Output only a single letter: A or B.** Do not output any other text, explanation, or formatting.

## Category Definitions

### **Category A: DATA**
The citation points to a dataset. This includes:
*   **Primary Data:** Raw or processed data that the current study's authors collected, generated, or created.
*   **Secondary Data:** Data that was originally produced by other researchers but is being *used as a dataset* in the current study.
*   **Key Phrases:** "data are available at", "we collected", "we measured", "data were obtained from", "dataset", "downloaded from", "deposited in", repository names (e.g., GenBank, Zenodo, Figshare, TCIA).

### **Category B: NOT DATA**
The citation points to a traditional scholarly publication or other non-data resource. This includes:
*   Journal articles, books, conference proceedings, preprints, protocols, methods papers.
*   **Key Phrases:** "as described in", "according to", "previous study", "et al.", "paper", "article", "methodology", "was used for analysis" (without indicating data access).
*   Citations that provide background context or methodological description but do not serve as the source of the data used in the analysis.

## Input Format
You will be provided with the following three pieces of information:
Paper Abstract: {abstract}
Citation: {dataset_id}
Citation Context: {context}

## Critical Thinking Guidelines
*   A DOI or URL can point to either data (A) or a paper (B). The context determines the classification.
*   If the citation is used to describe the *source* of the data for the current study's analysis, it is likely **A**.
*   If the citation is used to provide background, justify a method, or compare results, it is likely **B** (a reference to another paper).
*   When in doubt, rely on the linguistic cues in the "Citation Context".

## Examples for Pattern Recognition

**Example 1 (Classify as A):**
*   Context: "Three out of four cohorts used in this study can be found on The Cancer Imaging Archive (TCIA)24: Canadian benchmark dataset23: https://doi.org/10.7937/K9/TCIA.2017.8oje5q00."
*   **Reasoning:** The text states cohorts are "used in this study" and provides direct repository links. This is a clear case of citing external data for use.
*   **Output:** A

**Example 2 (Classify as B):**
*   Context: "data presented here are available at the SEANOE dataportal: https://doi.org/10.17882/94052 (ZooScan dataset Grandremy et al. 2023c)"
*   **Reasoning:** The phrase "data presented here" indicates this is the authors' own data being deposited, not a citation to an external source they are using. The "(Author et al. Year)" format is a classic literature citation style.
*   **Output:** B

**Example 3 (Classify as A):**
*   Context: "GBIF occurrence data: Vulpes vulpes: https://doi.org/10.15468/dl.wgtneb (28 May 2021)."
*   **Reasoning:** Explicitly names the data source (GBIF) and provides a direct access link/DOI for the specific dataset used.
*   **Output:** A

**Example 4 (Classify as B):**
*   Context: "North American soil NCBI SRA SRP035367 Smith & Peay [36] ITS2-Soil"
*   **Reasoning:** While it mentions a data repository ID (SRP035367), it couples it with a standard literature citation "[36]". The context suggests it is referencing the *paper* by Smith & Peay that describes the data, not directly citing the dataset itself for use.
*   **Output:** B

## Ready for Input
Begin your analysis. Remember: Output only **A** or **B**.
'''

def get_context_window(text: str, substring: str, window: int = 600) -> tuple[str, str]:
    """Return the text surrounding ``substring`` and the start of the document.

    Args:
        text: Full article text.
        substring: Exact string to locate (first occurrence, case-sensitive).
        window: Number of characters to include on each side of the match.

    Returns:
        A ``(context, abstract)`` pair: ``context`` is the match with up to
        ``window`` characters either side, ``abstract`` is the first 1000
        characters of ``text``. When ``substring`` is not found the pair is
        ``("no context", "no abstraction")``.
    """
    idx = text.find(substring)
    if idx == -1:
        return "no context", "no abstraction"
    start = max(idx - window, 0)
    end = min(idx + len(substring) + window, len(text))
    return text[start:end], text[:1000]




def find_context_win(tokenizer,df):
    """Build one classification prompt per row of ``df``.

    ``df`` is inner-joined with /tmp/context_data.parquet on
    (``article_id``, ``dataset_id``) and its ``type`` column is dropped. For
    each remaining row the context and abstract from ``get_context_window``
    are placed in a user message after the PROMPT_CLASSIFY_CITATION_TYPE
    system message. Returns the joined frame with a ``prompt`` column.
    """
    context_df = pl.read_parquet('/tmp/context_data.parquet')
    df = df.join(context_df, on=["article_id","dataset_id"], how="inner").drop("type")
    print(df)

    prompts = []
    rows = df.select(["article_id","dataset_id","text",'match']).rows()
    for _article_id, dataset_id, text, match in rows:
        context, abstract = get_context_window(text, match)
        user_content = f"""
        Paper Abstract: {abstract}
        
        Citation: {dataset_id}

        
        Citation Context: {context}
        """
        chat = [
            {"role": "system", "content": PROMPT_CLASSIFY_CITATION_TYPE},
            {"role": "user", "content": user_content.strip()},
        ]
        rendered = tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
        prompts.append(rendered)

    return df.with_columns(pl.Series("prompt", prompts))

    

if __name__=="__main__":
    # Script flow: re-check every DOI row of the current submission with a
    # context-aware prompt, keep the rows classified as data (A), re-attach the
    # accession ids and rewrite the submission.

    # Set before vllm is imported.
    os.environ["VLLM_USE_V1"] = "0"
    MODEL_PATH = "/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1"
    import vllm
    from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor

    llm = vllm.LLM(
        MODEL_PATH,
        quantization='awq',
        tensor_parallel_size=2,
        gpu_memory_utilization=0.9,
        trust_remote_code=True,
        dtype="half",
        enforce_eager=True,
        max_model_len=16384,
        disable_log_stats=True,
        disable_custom_all_reduce=True,
        enable_prefix_caching=True,
        task='generate')

    tokenizer = llm.get_tokenizer()

    df=pl.read_csv("/kaggle/working/submission.csv")

    if "row_id" in df.columns:
        df = df.drop("row_id")

    # Only DOI rows are re-validated. Accession rows are re-read from
    # /tmp/accid_sub.csv (written by llm_validate.py) further down.
    doi_df = df.filter(is_doi_link("dataset_id"))

    df = find_context_win(tokenizer,doi_df)

    prompts = df['prompt'].to_list()
    # The prompt defines only A and B, and `types` below maps only those two.
    # Offering a third choice "C" would raise KeyError whenever it scored highest.
    mclp = MultipleChoiceLogitsProcessor(tokenizer, choices=["A", "B"])
    outputs = llm.generate(prompts, vllm.SamplingParams(seed=777, temperature=0.7, skip_special_tokens=True, max_tokens=1, logits_processors=[mclp], logprobs=len(mclp.choices)), use_tqdm=True)
    # For each prompt, map decoded token -> logprob for the single generated
    # position, then take the most likely token as the answer.
    logprobs = [{lp.decoded_token: lp.logprob for lp in list(lps)} for lps in [output.outputs[0].logprobs[0].values() for output in outputs]]
    choices = [max(d, key=d.get) for d in logprobs]
    types = {'A': True, 'B': False}
    choices = [types[c] for c in choices]
    df = df.with_columns(pl.Series('type', choices))
    df.filter(pl.col('type')).select('article_id', 'dataset_id').write_csv('/tmp/doi_sub.csv')
    df = pl.concat([pl.read_csv('/tmp/doi_sub.csv'), pl.read_csv('/tmp/accid_sub.csv')])
    df = assume_type(df)
    df.select(['article_id', 'dataset_id', 'type']).with_row_index(name='row_id').write_csv('/kaggle/working/submission.csv')
    if not IS_KAGGLE_SUBMISSION:
        results = evaluate(df)
        for r in results: l.info(r)
        results = evaluate(df, on=['article_id', 'dataset_id', 'type'])
        for r in results: l.info(r)

    # Drop the model references before clearing the CUDA cache.
    # Only an already-unbound name can make `del` fail.
    try:
        del llm, tokenizer
    except NameError:
        pass

    import gc, torch
    gc.collect()
    torch.cuda.empty_cache()

Writing /tmp/src/post_validate.py


In [9]:
%%writefile /tmp/src/predict.py

from helpers import *
import polars as pl
import os


# Module-level logger for this script.
l = get_logger()


PROMPT_CLASSIFY_CITATION_TYPE = '''
# Role & Task
You are an expert data citation analyst. Your task is to classify a given citation from a scientific paper into one of two categories based on the context: **A (Primary Data)** or **B (Secondary Data)**.

## Instructions
1.  **Read the provided abstract** to understand the research context.
2.  **Analyze the citation context** for key linguistic cues.
3.  **Classify the citation** as either **A** or **B** based on the definitions below.
4.  **Output only a single letter: A or B.** Do not output any other text, explanation, or formatting.

## Category Definitions

### **Category A: PRIMARY DATA**
The data was generated, collected, or created by the **authors of the current study**. This is *their* data.
*   **Key Phrases:** "we collected", "we generated", "our data", "data are available at [URL/DOI]", "data have been deposited", "this study presents", "supplementary data".

### **Category B: SECONDARY DATA**
The data was produced by **other researchers** or external sources and is being reused or analyzed by the current study's authors.
*   **Key Phrases:** "data were obtained from", "publicly available data", "previously published data", "retrieved from", "downloaded from", "[Dataset Name] dataset", "database", citing a specific external source.

## Input Format
You will be provided with the following three pieces of information:
Paper Abstract: {abstract}
Citation: {dataset_id}
Citation Context: {context}


## Decision Framework
Answer these questions based on the **Citation Context**:

1.  **Who is the source of the data?**
    *   If the context implies the **authors themselves** are the source (e.g., "we," "our"), classify as **A**.
    *   If the context names an **external source** (e.g., a repository, another study, a database), classify as **B**.

2.  **What is the action being described?**
    *   **A (Primary)** actions: *depositing, making available, presenting* their own data.
    *   **B (Secondary)** actions: *using, obtaining, accessing, downloading, analyzing* existing data from elsewhere.

## Examples for Pattern Recognition

**Example 1 (Classify as B):**
*   Context: "Three out of four cohorts **used in this study** can be found on The Cancer Imaging Archive (TCIA)24: Canadian benchmark dataset23: https://doi.org/10.7937/K9/TCIA.2017.8oje5q00."
*   **Reasoning:** The authors are describing external datasets they **used** (a Secondary action). The source is TCIA, not themselves.
*   **Output:** B

**Example 2 (Classify as A):**
*   Context: "Additional research data **supporting this publication are available** at 10.25377/sussex.21184705."
*   **Reasoning:** The authors are stating the availability of data that **supports their own publication**. The source is implied to be themselves.
*   **Output:** A

**Example 3 (Classify as B):**
*   Context: "GBIF occurrence data: Vulpes vulpes: https://doi.org/10.15468/dl.wgtneb (28 May 2021)."
*   **Reasoning:** The data is explicitly sourced from an external repository (GBIF). The authors are referring to data they reused.
*   **Output:** B

**Example 4 (Classify as A):**
*   Context: "Data referring to Barbieux et al. (2017; https://doi.org/10.17882/49388) are freely available on SEANOE."
*   **Reasoning:** This is a tricky case. The citation format "(Author et al. Year)" suggests a literature reference. However, the phrase "Data referring to" and the direct data DOI indicate the authors are citing **their own previously published dataset** (from a 2017 paper) that is now available. This is their Primary data.
*   **Output:** A

## Ready for Input
Begin your analysis. Remember: Output only **A** or **B**.

'''

def get_context_window(text: str, substring: str, window: int = 600) -> tuple[str, str]:
    """Return the text around the first occurrence of ``substring`` and the head of ``text``.

    Args:
        text: Full document text to search.
        substring: Exact string to locate inside ``text``.
        window: Number of characters kept on each side of the match.

    Returns:
        A ``(context, abstract)`` pair. ``context`` is the match padded with up
        to ``window`` characters on both sides, clamped to the bounds of
        ``text``. ``abstract`` is the first 1000 characters of ``text`` (the
        caller feeds it to the prompt as the paper abstract). When
        ``substring`` does not occur in ``text``, or either argument is
        ``None``, the placeholder pair ``("no context", "no abstraction")`` is
        returned instead.
    """
    # Rows with a missing text or match would otherwise raise inside str.find;
    # treat them the same way as "match not found".
    if text is None or substring is None:
        return "no context", "no abstraction"
    idx = text.find(substring)
    if idx == -1:
        return "no context", "no abstraction"
    start = max(idx - window, 0)
    end = min(idx + len(substring) + window, len(text))
    return text[start:end], text[:1000]




def find_context_win(tokenizer,df):
    """Attach a ready-to-generate chat prompt to every row of ``df``.

    The frame is inner-joined with ``/tmp/context_data.parquet`` on
    ``article_id`` and ``dataset_id``, its ``type`` column is dropped, and the
    joined frame is printed. For each remaining row a chat prompt is rendered
    from the system prompt ``PROMPT_CLASSIFY_CITATION_TYPE`` and a user message
    built from the row's ``dataset_id`` plus the context window and document
    head returned by ``get_context_window(text, match)``.

    Args:
        tokenizer: Object exposing ``apply_chat_template``; used to render the
            message list into a prompt string (``tokenize=False``).
        df: Polars frame with at least ``article_id``, ``dataset_id`` and
            ``type`` columns.

    Returns:
        The joined frame with an extra ``prompt`` column.
    """
    context_df = pl.read_parquet('/tmp/context_data.parquet')
    joined = df.join(context_df, on=["article_id","dataset_id"], how="inner").drop("type")
    print(joined)

    def _render_prompt(dataset_id, text, needle):
        # Build the chat prompt for a single (dataset_id, text, match) row.
        context, abstract = get_context_window(text, needle)
        user_content = f"""
        Paper Abstract: {abstract}
        
        Citation: {dataset_id}

        
        Citation Context: {context}
        """
        chat = [
            {"role": "system", "content": PROMPT_CLASSIFY_CITATION_TYPE},
            {"role": "user", "content": user_content.strip()},
        ]
        return tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)

    rows = joined.select(["article_id","dataset_id","text",'match']).rows()
    # article_id is selected for parity with the join keys but is not needed in the prompt.
    rendered = [_render_prompt(dataset_id, text, needle) for _, dataset_id, text, needle in rows]

    return joined.with_columns(pl.Series("prompt", rendered))

    

if __name__=="__main__":
    # Script entry point: label every DOI row of the current submission as
    # Primary/Secondary with the LLM, label the remaining rows with
    # assume_type(), and overwrite /kaggle/working/submission.csv.

    # Set before vllm is imported below.
    # NOTE(review): presumably selects the legacy V0 engine — confirm against the vLLM docs.
    os.environ["VLLM_USE_V1"] = "0"
    MODEL_PATH = "/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1"
    import vllm
    from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor

    llm = vllm.LLM(
        MODEL_PATH,
        quantization='awq',
        tensor_parallel_size=2,
        gpu_memory_utilization=0.9,
        trust_remote_code=True,
        dtype="half",
        enforce_eager=True,
        max_model_len=16384,
        disable_log_stats=True, 
        disable_custom_all_reduce=True,
        enable_prefix_caching=True,
        task='generate')

    tokenizer = llm.get_tokenizer()

    df = pl.read_csv("/kaggle/working/submission.csv")

    # row_id is regenerated when the final submission is written.
    if "row_id" in df.columns:
        df = df.drop("row_id")

    # DOI rows are classified by the LLM; every other row goes through assume_type().
    doi_df = df.filter(is_doi_link("dataset_id"))
    acc_df = df.filter(~is_doi_link("dataset_id"))

    df = find_context_win(tokenizer, doi_df)

    prompts = df['prompt'].to_list()
    mclp = MultipleChoiceLogitsProcessor(tokenizer, choices=["A", "B"])
    sampling_params = vllm.SamplingParams(
        seed=777,
        temperature=0.8,
        skip_special_tokens=True,
        max_tokens=1,
        logits_processors=[mclp],
        logprobs=len(mclp.choices),
    )
    outputs = llm.generate(prompts, sampling_params, use_tqdm=True)

    # One {decoded_token: logprob} map per prompt, taken from the single generated position.
    logprobs = [
        {lp.decoded_token: lp.logprob for lp in output.outputs[0].logprobs[0].values()}
        for output in outputs
    ]
    # The sampled token itself is never read: the label is the highest-logprob
    # entry among the returned candidates.
    choices = [max(d, key=d.get) for d in logprobs]
    types = {'A':'Primary', 'B':'Secondary'}
    choices = [types[c] for c in choices]

    df = df.with_columns(pl.Series('type', choices))
    df.select('article_id', 'dataset_id','type').write_csv('/tmp/doi_sub.csv')

    acc_df = assume_type(acc_df)
    acc_df.select('article_id','dataset_id','type').write_csv("/tmp/accid_sub.csv")
    # Both halves are re-read from CSV before being concatenated.
    df = pl.concat([pl.read_csv('/tmp/doi_sub.csv'), pl.read_csv('/tmp/accid_sub.csv')])

    df.select(['article_id', 'dataset_id', 'type']).with_row_index(name='row_id').write_csv('/kaggle/working/submission.csv')

    if not IS_KAGGLE_SUBMISSION:
        # Evaluate once with evaluate()'s default `on`, then again including the predicted type.
        results = evaluate(df)
        for r in results: l.info(r) 
        results = evaluate(df, on=['article_id', 'dataset_id', 'type'])
        for r in results: l.info(r)

    # Drop the references to the engine before clearing the CUDA cache. Only a
    # missing name is tolerated here; anything else should surface.
    try:
        del llm, tokenizer
    except NameError:
        pass

    import gc, torch
    gc.collect()
    torch.cuda.empty_cache()

Writing /tmp/src/predict.py


In [10]:
%cd /tmp
!LOG_LEVEL=INFO python src/parse.py /tmp/train_parse
! python src/check_parse.py
! python src/getid.py
! python src/llm_validate.py




/tmp
MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  an

In [11]:
! python src/post_validate.py


INFO 09-08 15:57:54 [__init__.py:239] Automatically detected platform cuda.
INFO 09-08 15:58:08 [config.py:1770] Defaulting to use mp for distributed inference
INFO 09-08 15:58:08 [llm_engine.py:240] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', speculative_config=None, tokenizer='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=16384, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=True, quantization=awq, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_

In [12]:
! python src/predict.py

INFO 09-08 16:00:09 [__init__.py:239] Automatically detected platform cuda.
INFO 09-08 16:00:23 [config.py:1770] Defaulting to use mp for distributed inference
INFO 09-08 16:00:23 [llm_engine.py:240] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', speculative_config=None, tokenizer='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=16384, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=True, quantization=awq, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_

In [13]:
! grep "f1:" /tmp/logs/project.log