## Setup, config, paths, model load

In [2]:
# # method3 — Qwen2.5-7B LLM predictions (all subtasks)
# 
# - Uses Qwen/Qwen2.5-7B-Instruct as a multilingual classifier (no fine-tuning).
# - Works directly on the original language (no translation to English).
# - Outputs per-example predictions for TRAIN and DEV for all 3 subtasks.
# - Caches go into: cache/qwen/<LANG>/
# - Optional Qwen-only submissions go into: submissions/qwen/subtask_X/

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["WANDB_DISABLED"] = "true"

import random
import re
import json
import warnings
from pathlib import Path
from typing import List, Dict, Optional

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
)

import transformers
print("PyTorch:", torch.__version__)
print("Transformers:", transformers.__version__)

# ------------------------------------------------------------
# Device selection: GPU strongly recommended for Qwen
# ------------------------------------------------------------
RUN_DEVICE = "gpu"  # "gpu" or "cpu"

if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    torch.backends.cudnn.benchmark = True
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    DEVICE = torch.device("cpu")
    torch.backends.cudnn.enabled = False
    torch.set_num_threads(max(1, os.cpu_count() // 2))
    print("Using CPU")

# ------------------------------------------------------------
# Reproducibility
# ------------------------------------------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

# ------------------------------------------------------------
# High-level config
# ------------------------------------------------------------
# Change LANG when you switch language (eng, ben, hin, etc.)
LANG = "eng"

BASE = "../dev_phase"    # root of organizer data

# Qwen model:
QWEN_MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

# Paths for data (TRAIN has labels, DEV is unlabeled for Codabench)
lang_fname = LANG

T1_TRAIN = f"{BASE}/subtask1/train/{lang_fname}.csv"
T1_DEV   = f"{BASE}/subtask1/dev/{lang_fname}.csv"

T2_TRAIN = f"{BASE}/subtask2/train/{lang_fname}.csv"
T2_DEV   = f"{BASE}/subtask2/dev/{lang_fname}.csv"

T3_TRAIN = f"{BASE}/subtask3/train/{lang_fname}.csv"
T3_DEV   = f"{BASE}/subtask3/dev/{lang_fname}.csv"

# Caches + outputs for Qwen
CACHE_ROOT = Path("cache") / "qwen" / LANG
OUT_ROOT   = Path("outputs") / "qwen" / LANG
SUB_ROOT   = Path("submissions") / "qwen"

for d in [CACHE_ROOT, OUT_ROOT, SUB_ROOT]:
    d.mkdir(parents=True, exist_ok=True)

# Submission subfolders (optional, just to inspect Qwen-only performance)
(SUB_ROOT / "subtask_1").mkdir(parents=True, exist_ok=True)
(SUB_ROOT / "subtask_2").mkdir(parents=True, exist_ok=True)
(SUB_ROOT / "subtask_3").mkdir(parents=True, exist_ok=True)

# Label order for multi-label tasks (same as other methods)
T2_LABELS = ["gender/sexual", "political", "religious", "racial/ethnic", "other"]
T3_LABELS = ["vilification", "extreme_language", "stereotype",
             "invalidation", "lack_of_empathy", "dehumanization"]

print(f"LANG={LANG}")
print("T2_LABELS:", T2_LABELS)
print("T3_LABELS:", T3_LABELS)

# ------------------------------------------------------------
# Load Qwen2.5-7B-Instruct via pipeline
# ------------------------------------------------------------
print("\nLoading Qwen2.5-7B-Instruct... (this may take a moment)")

# Use bfloat16 on GPU if available, else float32 on CPU
dtype = torch.bfloat16 if DEVICE.type == "cuda" else torch.float32

llm_tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL_NAME, use_fast=True)
llm_model = AutoModelForCausalLM.from_pretrained(
    QWEN_MODEL_NAME,
    torch_dtype=dtype,
    device_map="auto" if DEVICE.type == "cuda" else None,
)
llm_model.eval()

llm_pipe = pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=llm_tokenizer,
    # pipeline will handle device via model.device
)

print("Qwen pipeline ready on device:", llm_model.device)


2025-12-08 14:16:48.044941: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-08 14:16:48.058057: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765232208.071634   11074 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765232208.075709   11074 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765232208.088426   11074 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

PyTorch: 2.9.0
Transformers: 4.57.1
Using GPU: NVIDIA H100 80GB HBM3 MIG 2g.20gb
LANG=eng
T2_LABELS: ['gender/sexual', 'political', 'religious', 'racial/ethnic', 'other']
T3_LABELS: ['vilification', 'extreme_language', 'stereotype', 'invalidation', 'lack_of_empathy', 'dehumanization']

Loading Qwen2.5-7B-Instruct... (this may take a moment)


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.48s/it]
Device set to use cuda:0


Qwen pipeline ready on device: cuda:0


## Helpers: prompts, parsers, batch runner

In [3]:
# ## Helpers: prompts, parsers, batch inference

def macro_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average="macro", zero_division=0)

# ------------------------------------------------------------
# Prompt builders
# ------------------------------------------------------------
def build_prompt_t1(text: str, lang: str) -> str:
    """
    Subtask 1: binary polarization.
    Output format: single digit 0 or 1.
    """
    return f"""
You are an expert annotator for online hate and polarization detection.

Language code of the text: {lang}

Task:
Given the following social media post, decide whether it contains hateful, abusive, or strongly polarizing content.

Classes:
0 = NON_HATE / NOT_POLARIZING (neutral, non-hateful content)
1 = HATE / POLARIZING (hateful, abusive, or strongly polarizing content)

Instructions:
- Read the post carefully.
- Decide which class is more appropriate.
- Answer with ONLY ONE DIGIT: 0 or 1.
- Do not include any additional words or explanation.

Post:
{text}

Answer (0 or 1 only):
""".strip()


def build_prompt_t2(text: str, lang: str) -> str:
    """
    Subtask 2: multi-label 5-way.
    Output format: 5 digits (0/1) in fixed order.
    """
    return f"""
You are an expert annotator for hate type classification in online text.

Language code of the text: {lang}

Task:
Given the social media post, decide which hate types are present.
There can be multiple hate types at the same time, or none.

Label order (5 labels):
1) gender/sexual
2) political
3) religious
4) racial/ethnic
5) other

Output format:
Return EXACTLY 5 digits, each 0 or 1, separated by spaces.
- 1 means the hate type is present.
- 0 means the hate type is not present.
For example: "0 1 0 0 1"

Important:
- Return ONLY the 5 digits, nothing else.
- Do NOT write label names or explanations.

Post:
{text}

Answer (5 digits for the 5 labels, in order):
""".strip()


def build_prompt_t3(text: str, lang: str) -> str:
    """
    Subtask 3: multi-label 6-way.
    Output format: 6 digits (0/1) in fixed order.
    """
    return f"""
You are an expert annotator for manifestations of hate in online text.

Language code of the text: {lang}

Task:
Given the social media post, decide which manifestations of hate are present.
There can be multiple manifestations at the same time, or none.

Label order (6 labels):
1) vilification
2) extreme_language
3) stereotype
4) invalidation
5) lack_of_empathy
6) dehumanization

Output format:
Return EXACTLY 6 digits, each 0 or 1, separated by spaces.
- 1 means the manifestation is present.
- 0 means it is not present.
For example: "0 1 0 0 1 0"

Important:
- Return ONLY the 6 digits, nothing else.
- Do NOT write label names or explanations.

Post:
{text}

Answer (6 digits for the 6 labels, in order):
""".strip()

# ------------------------------------------------------------
# Output parsers
# ------------------------------------------------------------
def parse_t1_output(text: str) -> int:
    """
    Extract first occurrence of 0 or 1 from the model output.
    Default to 0 if nothing found.
    """
    if text is None:
        return 0
    match = re.search(r"[01]", str(text))
    if match:
        return int(match.group(0))
    return 0


def parse_digit_vector(text: str, n_labels: int) -> np.ndarray:
    """
    Extract first n_labels digits (0/1) from the model output.
    Pad with zeros if fewer digits, truncate if more.
    """
    if text is None:
        return np.zeros(n_labels, dtype=int)
    digits = re.findall(r"[01]", str(text))
    if len(digits) < n_labels:
        digits = digits + ["0"] * (n_labels - len(digits))
    elif len(digits) > n_labels:
        digits = digits[:n_labels]
    return np.array([int(d) for d in digits], dtype=int)

# ------------------------------------------------------------
# Batch inference helper
# ------------------------------------------------------------
def qwen_generate_batch(prompts: List[str],
                        max_new_tokens: int = 32,
                        batch_size: int = 4) -> List[str]:
    """
    Run Qwen on a list of prompts using the chat-style pipeline.
    Returns list of raw assistant outputs (strings).
    """
    all_outputs = []
    for start in range(0, len(prompts), batch_size):
        batch_prompts = prompts[start:start + batch_size]
        messages_batch = [
            [{"role": "user", "content": p}] for p in batch_prompts
        ]
        outputs = llm_pipe(
            messages_batch,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
        for out in outputs:
            try:
                msg = out["generated_text"][-1]["content"]
            except Exception:
                msg = str(out)
            all_outputs.append(msg)
    assert len(all_outputs) == len(prompts)
    return all_outputs


## Subtask 1 (binary) with Qwen

In [4]:
# ## Subtask 1 — Polarization (binary, Qwen)

# 1. Load TRAIN + DEV
t1_train_df = pd.read_csv(T1_TRAIN)
t1_dev_df   = pd.read_csv(T1_DEV)

required_train_cols_t1 = {"id", "text", "polarization"}
required_dev_cols_t1   = {"id", "text"}
assert required_train_cols_t1.issubset(t1_train_df.columns), \
    f"T1 TRAIN missing: {required_train_cols_t1 - set(t1_train_df.columns)}"
assert required_dev_cols_t1.issubset(t1_dev_df.columns), \
    f"T1 DEV missing: {required_dev_cols_t1 - set(t1_dev_df.columns)}"

t1_train_df["polarization"] = t1_train_df["polarization"].astype(int)

print(f"[T1] TRAIN size: {len(t1_train_df)}")
print(f"[T1] DEV size  : {len(t1_dev_df)}")

# 2. Build prompts
train_prompts_t1 = [
    build_prompt_t1(txt, LANG) for txt in t1_train_df["text"].astype(str).tolist()
]
dev_prompts_t1 = [
    build_prompt_t1(txt, LANG) for txt in t1_dev_df["text"].astype(str).tolist()
]

# 3. Run Qwen on TRAIN
print("\n[T1] Running Qwen on TRAIN...")
train_outputs_t1 = qwen_generate_batch(train_prompts_t1, max_new_tokens=8, batch_size=4)
pred_train_t1 = np.array([parse_t1_output(o) for o in train_outputs_t1], dtype=int)

# 4. Evaluate on TRAIN
y_true_t1 = t1_train_df["polarization"].values
f1_t1 = macro_f1(y_true_t1, pred_train_t1)
print("[T1] Qwen Macro-F1 on TRAIN (hard labels):", f1_t1)

# 5. Run Qwen on DEV
print("\n[T1] Running Qwen on DEV...")
dev_outputs_t1 = qwen_generate_batch(dev_prompts_t1, max_new_tokens=8, batch_size=4)
pred_dev_t1 = np.array([parse_t1_output(o) for o in dev_outputs_t1], dtype=int)

# 6. Save caches for ensemble (treat Qwen prediction as prob 0/1)
cache_t1_train = pd.DataFrame({
    "id":   t1_train_df["id"].astype(str).values,
    "prob_pos": pred_train_t1.astype(float),   # 0.0 or 1.0
    "label":   y_true_t1.astype(int),
})
cache_t1_dev = pd.DataFrame({
    "id":      t1_dev_df["id"].astype(str).values,
    "prob_pos": pred_dev_t1.astype(float),
})

t1_train_cache_path = CACHE_ROOT / "t1_train_probs.csv"
t1_dev_cache_path   = CACHE_ROOT / "t1_dev_probs.csv"
cache_t1_train.to_csv(t1_train_cache_path, index=False)
cache_t1_dev.to_csv(t1_dev_cache_path, index=False)

print("Saved T1 TRAIN cache:", t1_train_cache_path)
print("Saved T1 DEV cache  :", t1_dev_cache_path)

# 7. Optional: Qwen-only submission CSV for Subtask 1
sub1 = pd.DataFrame({
    "id":           t1_dev_df["id"].astype(str).values,
    "polarization": pred_dev_t1.astype(int),
})
sub1_path = SUB_ROOT / "subtask_1" / f"pred_{lang_fname}.csv"
sub1.to_csv(sub1_path, index=False)
print("Saved Qwen-only Subtask 1 submission:", sub1_path)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[T1] TRAIN size: 3222
[T1] DEV size  : 160

[T1] Running Qwen on TRAIN...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[T1] Qwen Macro-F1 on TRAIN (hard labels): 0.38849876636933006

[T1] Running Qwen on DEV...
Saved T1 TRAIN cache: cache/qwen/eng/t1_train_probs.csv
Saved T1 DEV cache  : cache/qwen/eng/t1_dev_probs.csv
Saved Qwen-only Subtask 1 submission: submissions/qwen/subtask_1/pred_eng.csv


## Subtask 2 (multi-label 5) with Qwen

In [5]:
# ## Subtask 2 — Hate type (multi-label 5, Qwen)

# 1. Load TRAIN + DEV
t2_train_df = pd.read_csv(T2_TRAIN)
t2_dev_df   = pd.read_csv(T2_DEV)

required_train_cols_t2 = {"id", "text", *T2_LABELS}
required_dev_cols_t2   = {"id", "text"}
assert required_train_cols_t2.issubset(t2_train_df.columns), \
    f"T2 TRAIN missing: {required_train_cols_t2 - set(t2_train_df.columns)}"
assert required_dev_cols_t2.issubset(t2_dev_df.columns), \
    f"T2 DEV missing: {required_dev_cols_t2 - set(t2_dev_df.columns)}"

Y2_train = t2_train_df[T2_LABELS].values.astype(int)

print(f"[T2] TRAIN size: {len(t2_train_df)}")
print(f"[T2] DEV size  : {len(t2_dev_df)}")

# 2. Build prompts
train_prompts_t2 = [
    build_prompt_t2(txt, LANG) for txt in t2_train_df["text"].astype(str).tolist()
]
dev_prompts_t2 = [
    build_prompt_t2(txt, LANG) for txt in t2_dev_df["text"].astype(str).tolist()
]

# 3. Run Qwen on TRAIN
print("\n[T2] Running Qwen on TRAIN...")
train_outputs_t2 = qwen_generate_batch(train_prompts_t2, max_new_tokens=16, batch_size=4)
pred_train_t2 = np.stack(
    [parse_digit_vector(o, n_labels=len(T2_LABELS)) for o in train_outputs_t2],
    axis=0,
)  # [N,5]

# 4. Evaluate on TRAIN (macro-F1 across labels)
f1_t2 = f1_score(Y2_train, pred_train_t2, average="macro", zero_division=0)
print("[T2] Qwen Macro-F1 on TRAIN (hard multi-label):", f1_t2)

# 5. Run Qwen on DEV
print("\n[T2] Running Qwen on DEV...")
dev_outputs_t2 = qwen_generate_batch(dev_prompts_t2, max_new_tokens=16, batch_size=4)
pred_dev_t2 = np.stack(
    [parse_digit_vector(o, n_labels=len(T2_LABELS)) for o in dev_outputs_t2],
    axis=0,
)  # [N_dev,5]

# 6. Save caches for ensemble (probabilities are just 0/1 from Qwen)
cache_cols_train_t2 = {"id": t2_train_df["id"].astype(str).values}
cache_cols_dev_t2   = {"id": t2_dev_df["id"].astype(str).values}

for j, lab in enumerate(T2_LABELS):
    cache_cols_train_t2[f"prob_{lab}"]  = pred_train_t2[:, j].astype(float)
    cache_cols_train_t2[f"label_{lab}"] = Y2_train[:, j].astype(int)
    cache_cols_dev_t2[f"prob_{lab}"]    = pred_dev_t2[:, j].astype(float)

t2_train_cache = pd.DataFrame(cache_cols_train_t2)
t2_dev_cache   = pd.DataFrame(cache_cols_dev_t2)

t2_train_cache_path = CACHE_ROOT / "t2_train_probs.csv"
t2_dev_cache_path   = CACHE_ROOT / "t2_dev_probs.csv"

t2_train_cache.to_csv(t2_train_cache_path, index=False)
t2_dev_cache.to_csv(t2_dev_cache_path, index=False)

print("Saved T2 TRAIN cache:", t2_train_cache_path)
print("Saved T2 DEV cache  :", t2_dev_cache_path)

# Required header: id,political,racial/ethnic,religious,gender/sexual,other

idx_gender    = T2_LABELS.index("gender/sexual")
idx_political = T2_LABELS.index("political")
idx_religious = T2_LABELS.index("religious")
idx_racial    = T2_LABELS.index("racial/ethnic")
idx_other     = T2_LABELS.index("other")

sub2 = pd.DataFrame({
    "id":            t2_dev_df["id"].astype(str).values,
    "political":     pred_dev_t2[:, idx_political],
    "racial/ethnic": pred_dev_t2[:, idx_racial],
    "religious":     pred_dev_t2[:, idx_religious],
    "gender/sexual": pred_dev_t2[:, idx_gender],
    "other":         pred_dev_t2[:, idx_other],
})
sub2_path = SUB_ROOT / "subtask_2" / f"pred_{lang_fname}.csv"
sub2.to_csv(sub2_path, index=False)
print("Saved Qwen-only Subtask 2 submission:", sub2_path)


[T2] TRAIN size: 3222
[T2] DEV size  : 160

[T2] Running Qwen on TRAIN...
[T2] Qwen Macro-F1 on TRAIN (hard multi-label): 0.05426726469403591

[T2] Running Qwen on DEV...
Saved T2 TRAIN cache: cache/qwen/eng/t2_train_probs.csv
Saved T2 DEV cache  : cache/qwen/eng/t2_dev_probs.csv
Saved Qwen-only Subtask 2 submission: submissions/qwen/subtask_2/pred_eng.csv


## Subtask 3 (multi-label 6) with Qwen

In [6]:
# 1. Load TRAIN + DEV
t3_train_df = pd.read_csv(T3_TRAIN)
t3_dev_df   = pd.read_csv(T3_DEV)

required_train_cols_t3 = {"id", "text", *T3_LABELS}
required_dev_cols_t3   = {"id", "text"}
assert required_train_cols_t3.issubset(t3_train_df.columns), \
    f"T3 TRAIN missing: {required_train_cols_t3 - set(t3_train_df.columns)}"
assert required_dev_cols_t3.issubset(t3_dev_df.columns), \
    f"T3 DEV missing: {required_dev_cols_t3 - set(t3_dev_df.columns)}"

Y3_train = t3_train_df[T3_LABELS].values.astype(int)

print(f"[T3] TRAIN size: {len(t3_train_df)}")
print(f"[T3] DEV size  : {len(t3_dev_df)}")

# 2. Build prompts
train_prompts_t3 = [
    build_prompt_t3(txt, LANG) for txt in t3_train_df["text"].astype(str).tolist()
]
dev_prompts_t3 = [
    build_prompt_t3(txt, LANG) for txt in t3_dev_df["text"].astype(str).tolist()
]

# 3. Run Qwen on TRAIN
print("\n[T3] Running Qwen on TRAIN...")
train_outputs_t3 = qwen_generate_batch(train_prompts_t3, max_new_tokens=16, batch_size=4)
pred_train_t3 = np.stack(
    [parse_digit_vector(o, n_labels=len(T3_LABELS)) for o in train_outputs_t3],
    axis=0,
)  # [N,6]

# 4. Evaluate on TRAIN
f1_t3 = f1_score(Y3_train, pred_train_t3, average="macro", zero_division=0)
print("[T3] Qwen Macro-F1 on TRAIN (hard multi-label):", f1_t3)

# 5. Run Qwen on DEV
print("\n[T3] Running Qwen on DEV...")
dev_outputs_t3 = qwen_generate_batch(dev_prompts_t3, max_new_tokens=16, batch_size=4)
pred_dev_t3 = np.stack(
    [parse_digit_vector(o, n_labels=len(T3_LABELS)) for o in dev_outputs_t3],
    axis=0,
)  # [N_dev,6]

# 6. Save caches for ensemble
cache_cols_train_t3 = {"id": t3_train_df["id"].astype(str).values}
cache_cols_dev_t3   = {"id": t3_dev_df["id"].astype(str).values}

for j, lab in enumerate(T3_LABELS):
    cache_cols_train_t3[f"prob_{lab}"]  = pred_train_t3[:, j].astype(float)
    cache_cols_train_t3[f"label_{lab}"] = Y3_train[:, j].astype(int)
    cache_cols_dev_t3[f"prob_{lab}"]    = pred_dev_t3[:, j].astype(float)

t3_train_cache = pd.DataFrame(cache_cols_train_t3)
t3_dev_cache   = pd.DataFrame(cache_cols_dev_t3)

t3_train_cache_path = CACHE_ROOT / "t3_train_probs.csv"
t3_dev_cache_path   = CACHE_ROOT / "t3_dev_probs.csv"

t3_train_cache.to_csv(t3_train_cache_path, index=False)
t3_dev_cache.to_csv(t3_dev_cache_path, index=False)

print("Saved T3 TRAIN cache:", t3_train_cache_path)
print("Saved T3 DEV cache  :", t3_dev_cache_path)

# 7. Qwen-only submission for Subtask 3
idx_vil      = T3_LABELS.index("vilification")
idx_extreme  = T3_LABELS.index("extreme_language")
idx_stereo   = T3_LABELS.index("stereotype")
idx_invalid  = T3_LABELS.index("invalidation")
idx_lackemp  = T3_LABELS.index("lack_of_empathy")
idx_dehum    = T3_LABELS.index("dehumanization")

sub3 = pd.DataFrame({
    "id":               t3_dev_df["id"].astype(str).values,
    "stereotype":       pred_dev_t3[:, idx_stereo],
    "vilification":     pred_dev_t3[:, idx_vil],
    "dehumanization":   pred_dev_t3[:, idx_dehum],
    "extreme_language": pred_dev_t3[:, idx_extreme],
    "lack_of_empathy":  pred_dev_t3[:, idx_lackemp],
    "invalidation":     pred_dev_t3[:, idx_invalid],
})
sub3_path = SUB_ROOT / "subtask_3" / f"pred_{lang_fname}.csv"
sub3.to_csv(sub3_path, index=False)
print("Saved Qwen-only Subtask 3 submission:", sub3_path)


[T3] TRAIN size: 3222
[T3] DEV size  : 160

[T3] Running Qwen on TRAIN...
[T3] Qwen Macro-F1 on TRAIN (hard multi-label): 0.16516097002295121

[T3] Running Qwen on DEV...
Saved T3 TRAIN cache: cache/qwen/eng/t3_train_probs.csv
Saved T3 DEV cache  : cache/qwen/eng/t3_dev_probs.csv
Saved Qwen-only Subtask 3 submission: submissions/qwen/subtask_3/pred_eng.csv
