In [19]:
import json
from datasets import Dataset
def build_pairs(claims_fp, dense_fp):
    claims = json.load(open(claims_fp, 'r', encoding='utf-8'))
    dense  = json.load(open(dense_fp, 'r', encoding='utf-8'))
    pairs  = []
    for cid, c_info in claims.items():
        text_c = c_info["claim_text"]
        pos_set = set(c_info["evidences"])
        for cand in dense[cid]["ranked_evidences"]:
            pairs.append({
                "cid": cid,
                "evid_id": cand["id"],
                "claim_text": text_c,
                "evid_text": cand["text"],
                "label": int(cand.get("id") in pos_set)
            })
    return pairs

In [20]:
train_pairs = build_pairs("./data/train-claims.json", "./data/train-claims-top20-text.json")
dev_pairs = build_pairs("./data/dev-claims.json", "./data/dev-claims-top20-text.json")

In [None]:
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
import numpy as np
from sklearn.metrics import f1_score

# 1. HF Dataset
train_ds = Dataset.from_list(train_pairs)
dev_ds   = Dataset.from_list(dev_pairs)

# 2. tokenizer + map
MODEL_NAME = "distilbert-base-uncased"
tokenizer  = AutoTokenizer.from_pretrained(MODEL_NAME)
def preprocess(batch):
    enc = tokenizer(
        batch["claim_text"],
        batch["evid_text"],
        truncation='only_second',
        padding='max_length',
        max_length=256
    )
    enc['labels'] = batch['label']
    enc['cid']    = batch['cid']
    return enc
train_ds = train_ds.map(preprocess, batched=True)
dev_ds   = dev_ds.map(preprocess, batched=True)

# 3. model + collator
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2
)
model.gradient_checkpointing_enable()

data_collator = DataCollatorWithPadding(tokenizer)

# 4. TrainingArguments
args = TrainingArguments(
    output_dir="./checkpoints",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    fp16=True,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# 5. compute_metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"f1": f1_score(labels, preds)}

# 6. Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Map:   0%|          | 0/24560 [00:00<?, ? examples/s]

Map:   0%|          | 0/3080 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,0.2088,0.217234,0.0
2,0.1974,0.23608,0.0
3,0.157,0.244454,0.022857


TrainOutput(global_step=9210, training_loss=0.2048562290614125, metrics={'train_runtime': 898.4161, 'train_samples_per_second': 82.011, 'train_steps_per_second': 10.251, 'total_flos': 4880098966487040.0, 'train_loss': 0.2048562290614125, 'epoch': 3.0})

In [22]:
from scipy.special import softmax
from collections import defaultdict

preds = trainer.predict(dev_ds).predictions  # shape (num_examples, 1)
print(preds)
scores = preds.squeeze(-1)

top6_per_claim = defaultdict(list)
for i, pair in enumerate(dev_pairs):
    cid = pair['cid']
    eid = pair['evid_id']
    top6_per_claim[cid].append((eid, scores[i].item()))

output = {
    cid: [eid for eid, _ in sorted(lst, key=lambda x: x[1], reverse=True)[:6]]
    for cid, lst in top6_per_claim.items()
}

with open("dev-claims-top6-dense-distillbert.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2, ensure_ascii=False)

[[ 2.1855469 -2.4472656]
 [ 2.1738281 -2.4277344]
 [ 2.1738281 -2.4277344]
 ...
 [ 2.1445312 -2.3886719]
 [ 2.078125  -2.3066406]
 [ 2.1601562 -2.4101562]]


ValueError: cannot select an axis to squeeze out which has size not equal to one