## Setup, config, paths, model load

In [None]:
# # method3 — Qwen2.5-7B LLM predictions (few-shot, all subtasks)
#
# - Uses Qwen/Qwen2.5-7B-Instruct as a multilingual **few-shot classifier** (no fine-tuning).
# - Works directly on the original language (no translation).
# - For each subtask, builds a **label-balanced few-shot context** from TRAIN examples:
#   - Subtask 1: binary polarization.
#   - Subtask 2: multi-label (5 hate types).
#   - Subtask 3: multi-label (6 manifestations).
# - Outputs per-example predictions for TRAIN and DEV for all 3 subtasks.
# - Caches go into: `cache/qwen5shots/<LANG>/`
# - Optional Qwen-only submissions go into: `submissions/qwen5shots/subtask_X/`

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["WANDB_DISABLED"] = "true"

import random
import re
import json
import warnings
from pathlib import Path
from typing import List, Dict, Optional

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
)

import transformers
print("PyTorch:", torch.__version__)
print("Transformers:", transformers.__version__)

# ------------------------------------------------------------
# Device selection: GPU strongly recommended for Qwen
# ------------------------------------------------------------
# RUN_DEVICE is the *requested* device; DEVICE is what is actually used
# (falls back to CPU when CUDA is not available).
RUN_DEVICE = "gpu"  # "gpu" or "cpu"

if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    torch.backends.cudnn.benchmark = True
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    DEVICE = torch.device("cpu")
    torch.backends.cudnn.enabled = False
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 1 so the integer division below cannot raise TypeError.
    torch.set_num_threads(max(1, (os.cpu_count() or 1) // 2))
    print("Using CPU")

# ------------------------------------------------------------
# Reproducibility
# ------------------------------------------------------------
# Seed Python's, NumPy's and PyTorch's global RNGs (and all CUDA devices
# when running on GPU) with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

# ------------------------------------------------------------
# High-level config
# ------------------------------------------------------------
# Change LANG when you switch language (eng, ben, hin, etc.)
# LANG is used for the data file names, the cache/output directories and
# the language code written into every prompt.
LANG = "eng"

BASE = "../dev_phase"  # root of organizer data

# Qwen model:
QWEN_MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

# Paths for data (TRAIN has labels, DEV is unlabeled for Codabench)
# lang_fname is the CSV base name; it is currently just an alias of LANG.
lang_fname = LANG

T1_TRAIN = f"{BASE}/subtask1/train/{lang_fname}.csv"
T1_DEV   = f"{BASE}/subtask1/dev/{lang_fname}.csv"

T2_TRAIN = f"{BASE}/subtask2/train/{lang_fname}.csv"
T2_DEV   = f"{BASE}/subtask2/dev/{lang_fname}.csv"

T3_TRAIN = f"{BASE}/subtask3/train/{lang_fname}.csv"
T3_DEV   = f"{BASE}/subtask3/dev/{lang_fname}.csv"

# Caches + outputs for Qwen few-shot
# CACHE_ROOT and OUT_ROOT are per-language; SUB_ROOT is shared and gets one
# sub-directory per subtask.
CACHE_ROOT = Path("cache") / "qwen5shots" / LANG
OUT_ROOT   = Path("outputs") / "qwen5shots" / LANG
SUB_ROOT   = Path("submissions") / "qwen5shots"

# Directories are created eagerly (relative to the current working directory)
# as soon as this cell runs.
for d in [CACHE_ROOT, OUT_ROOT, SUB_ROOT]:
    d.mkdir(parents=True, exist_ok=True)

(SUB_ROOT / "subtask_1").mkdir(parents=True, exist_ok=True)
(SUB_ROOT / "subtask_2").mkdir(parents=True, exist_ok=True)
(SUB_ROOT / "subtask_3").mkdir(parents=True, exist_ok=True)

# Label order for multi-label tasks (same as other methods)
# This order defines the column order of the prediction matrices and of the
# digit vectors requested from the model in the prompts. The submission CSVs
# written by the subtask cells use a different column order.
T2_LABELS = ["gender/sexual", "political", "religious", "racial/ethnic", "other"]
T3_LABELS = [
    "vilification",
    "extreme_language",
    "stereotype",
    "invalidation",
    "lack_of_empathy",
    "dehumanization",
]

print(f"LANG={LANG}")
print("T2_LABELS:", T2_LABELS)
print("T3_LABELS:", T3_LABELS)

# ------------------------------------------------------------
# Load Qwen2.5-7B-Instruct via pipeline
# ------------------------------------------------------------
# Produces three module-level objects used by the rest of the notebook:
# llm_tokenizer, llm_model (in eval mode) and llm_pipe (text-generation).
print("\nLoading Qwen2.5-7B-Instruct... (this may take a moment)")

# bfloat16 weights on GPU, float32 on CPU.
dtype = torch.bfloat16 if DEVICE.type == "cuda" else torch.float32

llm_tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL_NAME, use_fast=True)
# NOTE(review): the logged run (Transformers 4.57.1) warns that `torch_dtype`
# is deprecated in favour of `dtype`; switching would need a minimum
# Transformers version to be agreed first.
llm_model = AutoModelForCausalLM.from_pretrained(
    QWEN_MODEL_NAME,
    torch_dtype=dtype,
    device_map="auto" if DEVICE.type == "cuda" else None,
)
llm_model.eval()

# The pipeline wraps the already-loaded model/tokenizer; no explicit device
# argument is passed.
llm_pipe = pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=llm_tokenizer,
    # pipeline will handle device via model.device
)

print("Qwen pipeline ready on device:", llm_model.device)


2025-12-08 10:50:07.454455: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-08 10:50:19.303035: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765219820.347298 4030100 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765219821.052863 4030100 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765219823.883024 4030100 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

PyTorch: 2.9.0
Transformers: 4.57.1
Using GPU: NVIDIA H100 80GB HBM3 MIG 2g.20gb
LANG=eng
T2_LABELS: ['gender/sexual', 'political', 'religious', 'racial/ethnic', 'other']
T3_LABELS: ['vilification', 'extreme_language', 'stereotype', 'invalidation', 'lack_of_empathy', 'dehumanization']

Loading Qwen2.5-7B-Instruct... (this may take a moment)


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:12<00:00,  3.01s/it]
Device set to use cuda:0


Qwen pipeline ready on device: cuda:0


## Helpers: metrics, few-shot builders, prompts, parsers, batch runner

In [None]:
# ## Helpers: metrics, few-shot contexts, prompts, parsers, batch inference

def macro_f1(y_true, y_pred):
    """Return the macro-averaged F1 of `y_pred` against `y_true`.

    Thin wrapper around `f1_score` with `average="macro"`; `zero_division=0`
    is passed so undefined per-class scores count as 0.
    """
    score = f1_score(
        y_true,
        y_pred,
        average="macro",
        zero_division=0,
    )
    return score


# ------------------------------------------------------------
# Few-shot context builders (label-balanced)
# ------------------------------------------------------------
def build_fewshot_context_t1(df: pd.DataFrame, lang: str,
                             n_pos: int = 3, n_neg: int = 3) -> str:
    """
    Build the few-shot example block for Subtask 1 (binary polarization).

    Args:
        df: labelled data with a "text" column and a "polarization" column
            convertible to int (0/1). The input frame is not modified.
        lang: unused; kept so all context builders share one signature.
        n_pos: number of positive (polarization=1) examples to aim for.
        n_neg: number of negative (polarization=0) examples to aim for.

    Returns:
        A string of numbered "Example k / Post / Label" entries separated by
        blank lines, positives first and then negatives. Fewer examples are
        returned when a class has fewer rows than requested. If neither class
        has any row, up to 6 rows are sampled from `df` instead.

    Sampling uses fixed seeds, so the result is deterministic for a given df.
    """
    df = df.copy()
    df["polarization"] = df["polarization"].astype(int)

    # (label value, requested count, sampling seed); positives come first.
    sampling_plan = ((1, n_pos, 42), (0, n_neg, 43))
    sampled = []
    for value, wanted, seed in sampling_plan:
        pool = df[df["polarization"] == value]
        if len(pool) > 0:
            sampled.append(
                pool.sample(n=min(wanted, len(pool)), random_state=seed)
            )

    if sampled:
        ex_df = pd.concat(sampled, ignore_index=True)
    else:
        # fallback: just sample up to 6 rows
        ex_df = df.sample(n=min(6, len(df)), random_state=42)
    ex_df = ex_df.reset_index(drop=True)

    entries = []
    pairs = zip(ex_df["text"], ex_df["polarization"])
    for number, (raw_text, raw_label) in enumerate(pairs, start=1):
        # Keep each post on a single line so the example layout stays intact.
        txt = str(raw_text).strip().replace("\n", " ")
        entries.append(
            f"Example {number}:\n"
            f"Post: {txt}\n"
            f"Label (0=non-hate, 1=hate/polarizing): {int(raw_label)}\n"
        )

    return "\n".join(entries).strip()


def _build_multilabel_fewshot_context(df: pd.DataFrame, labels: List[str],
                                      max_examples_per_label: int,
                                      n_negatives: int,
                                      fallback_n: int) -> str:
    """
    Shared implementation of the label-balanced few-shot builders.

    For each label (in `labels` order) up to `max_examples_per_label` positive
    rows are picked, preferring "focused" rows where that label is the only
    active one and topping up from multi-label rows only when there are not
    enough focused ones. Then up to `n_negatives` all-zero rows are added.
    A row is never used twice. If nothing was selected, up to `fallback_n`
    random rows are used instead.

    Sampling uses fixed seeds (42 for positives, 123 for negatives), so the
    result is deterministic for a given df. The input frame is not modified.
    """
    df = df.copy()
    for lab in labels:
        df[lab] = df[lab].astype(int)

    # Number of active labels per row; 1 => "focused" row, 0 => negative row.
    n_active = df[labels].sum(axis=1)

    used_indices = set()
    examples = []

    def _take(rows: pd.DataFrame) -> None:
        # Append rows in order, skipping any index that was already selected.
        for idx, row in rows.iterrows():
            if idx in used_indices:
                continue
            used_indices.add(idx)
            examples.append(row)

    # Label-focused positives
    if max_examples_per_label > 0:
        for lab in labels:
            is_pos = df[lab] == 1
            if not is_pos.any():
                continue

            pure = df[is_pos & (n_active == 1)]
            mixed = df[is_pos & (n_active != 1)]

            # Prefer rows where this label is the ONLY 1; otherwise any positive.
            primary = pure if not pure.empty else mixed
            picked = primary.sample(
                n=min(max_examples_per_label, len(primary)),
                random_state=42,
            )

            # Not enough focused rows: top up from multi-label positives.
            shortfall = max_examples_per_label - len(picked)
            if shortfall > 0 and not pure.empty and not mixed.empty:
                extra = mixed.sample(
                    n=min(shortfall, len(mixed)),
                    random_state=42,
                )
                picked = pd.concat([picked, extra])

            _take(picked)

    # Add negative examples (all labels = 0)
    if n_negatives > 0:
        negatives = df[n_active == 0]
        if not negatives.empty:
            _take(
                negatives.sample(
                    n=min(n_negatives, len(negatives)),
                    random_state=123,
                )
            )

    # If for some reason we still have no examples, fallback to random sampling
    if not examples:
        fallback = df.sample(n=min(fallback_n, len(df)), random_state=42)
        examples = [row for _, row in fallback.iterrows()]

    # The header is derived from `labels` so it can never drift from the
    # order of the digits printed next to it.
    header = ", ".join(labels)
    entries = []
    for number, row in enumerate(examples, start=1):
        txt = str(row["text"]).strip().replace("\n", " ")
        vec_str = " ".join(str(int(row[lab])) for lab in labels)
        entries.append(
            f"Example {number}:\n"
            f"Post: {txt}\n"
            f"Labels ({header}): {vec_str}\n"
        )

    return "\n".join(entries).strip()


def build_fewshot_context_t2(df: pd.DataFrame, lang: str,
                             max_examples_per_label: int = 1,
                             n_negatives: int = 2) -> str:
    """
    Build a label-balanced few-shot context for Subtask 2 (5 labels).

    Args:
        df: labelled data with a "text" column and one int-convertible 0/1
            column per entry of T2_LABELS.
        lang: unused; kept so all context builders share one signature.
        max_examples_per_label: positives to pick per label, preferring
            "focused" rows where only that label is 1.
        n_negatives: maximum number of all-zero rows to add.

    Returns:
        Numbered "Example k / Post / Labels" entries separated by blank
        lines; label digits follow T2_LABELS order.
    """
    return _build_multilabel_fewshot_context(
        df,
        T2_LABELS,
        max_examples_per_label=max_examples_per_label,
        n_negatives=n_negatives,
        fallback_n=5,
    )


def build_fewshot_context_t3(df: pd.DataFrame, lang: str,
                             max_examples_per_label: int = 1,
                             n_negatives: int = 2) -> str:
    """
    Build a label-balanced few-shot context for Subtask 3 (6 labels).

    Args:
        df: labelled data with a "text" column and one int-convertible 0/1
            column per entry of T3_LABELS.
        lang: unused; kept so all context builders share one signature.
        max_examples_per_label: positives to pick per label, preferring
            "focused" rows where only that label is 1.
        n_negatives: maximum number of all-zero rows to add.

    Returns:
        Numbered "Example k / Post / Labels" entries separated by blank
        lines; label digits follow T3_LABELS order.
    """
    return _build_multilabel_fewshot_context(
        df,
        T3_LABELS,
        max_examples_per_label=max_examples_per_label,
        n_negatives=n_negatives,
        fallback_n=6,
    )


# ------------------------------------------------------------
# Few-shot prompt builders
# ------------------------------------------------------------
def build_prompt_t1(text: str, lang: str, fewshot_ctx: str) -> str:
    """
    Assemble the Subtask 1 (binary polarization) few-shot prompt.

    The prompt states the task and the two classes, embeds `fewshot_ctx`
    verbatim, then appends `text` as the post to classify. The model is asked
    to answer with a single digit, 0 or 1.
    """
    # The prompt is built line by line; joining with "\n" yields the text
    # with no leading or trailing whitespace.
    prompt_lines = [
        "You are an expert annotator for online hate and polarization detection.",
        "",
        f"Language code of the text: {lang}",
        "",
        "Task:",
        "Given a social media post, decide whether it contains hateful, abusive, or strongly polarizing content.",
        "",
        "Classes:",
        "0 = NON_HATE / NOT_POLARIZING (neutral, non-hateful content)",
        "1 = HATE / POLARIZING (hateful, abusive, or strongly polarizing content)",
        "",
        "Below are some EXAMPLES with their correct labels:",
        "",
        f"{fewshot_ctx}",
        "",
        "Now classify THIS NEW post.",
        "",
        "Instructions:",
        "- Read the post carefully.",
        "- Decide which class is more appropriate.",
        "- Answer with ONLY ONE DIGIT: 0 or 1.",
        "- Do not include any additional words or explanation.",
        "",
        "New post:",
        f"{text}",
        "",
        "Answer (0 or 1 only):",
    ]
    return "\n".join(prompt_lines)


def build_prompt_t2(text: str, lang: str, fewshot_ctx: str) -> str:
    """
    Assemble the Subtask 2 (5-way multi-label hate type) few-shot prompt.

    The prompt lists the five labels in a fixed order, embeds `fewshot_ctx`
    verbatim, then appends `text` as the post to classify. The model is asked
    to answer with exactly 5 space-separated digits (0/1) in that order.
    """
    # The prompt is built line by line; joining with "\n" yields the text
    # with no leading or trailing whitespace.
    prompt_lines = [
        "You are an expert annotator for hate type classification in online text.",
        "",
        f"Language code of the text: {lang}",
        "",
        "Task:",
        "Given a social media post, decide which hate types are present.",
        "There can be multiple hate types at the same time, or none.",
        "",
        "Label order (5 labels):",
        "1) gender/sexual",
        "2) political",
        "3) religious",
        "4) racial/ethnic",
        "5) other",
        "",
        "Below are some EXAMPLES with their correct 5-label vectors:",
        "(Each vector is: gender/sexual, political, religious, racial/ethnic, other)",
        "",
        f"{fewshot_ctx}",
        "",
        "Now classify THIS NEW post.",
        "",
        "Output format:",
        "Return EXACTLY 5 digits, each 0 or 1, separated by spaces.",
        "- 1 means the hate type is present.",
        "- 0 means the hate type is not present.",
        'Example output: "0 1 0 0 1"',
        "",
        "Important:",
        "- Return ONLY the 5 digits, nothing else.",
        "- Do NOT write label names or explanations.",
        "",
        "New post:",
        f"{text}",
        "",
        "Answer (5 digits for the 5 labels, in order):",
    ]
    return "\n".join(prompt_lines)


def build_prompt_t3(text: str, lang: str, fewshot_ctx: str) -> str:
    """
    Assemble the Subtask 3 (6-way multi-label manifestation) few-shot prompt.

    The prompt lists the six labels in a fixed order, embeds `fewshot_ctx`
    verbatim, then appends `text` as the post to classify. The model is asked
    to answer with exactly 6 space-separated digits (0/1) in that order.
    """
    # The prompt is built line by line; joining with "\n" yields the text
    # with no leading or trailing whitespace.
    prompt_lines = [
        "You are an expert annotator for manifestations of hate in online text.",
        "",
        f"Language code of the text: {lang}",
        "",
        "Task:",
        "Given a social media post, decide which manifestations of hate are present.",
        "There can be multiple manifestations at the same time, or none.",
        "",
        "Label order (6 labels):",
        "1) vilification",
        "2) extreme_language",
        "3) stereotype",
        "4) invalidation",
        "5) lack_of_empathy",
        "6) dehumanization",
        "",
        "Below are some EXAMPLES with their correct 6-label vectors:",
        "(Each vector is: vilification, extreme_language, stereotype,",
        " invalidation, lack_of_empathy, dehumanization)",
        "",
        f"{fewshot_ctx}",
        "",
        "Now classify THIS NEW post.",
        "",
        "Output format:",
        "Return EXACTLY 6 digits, each 0 or 1, separated by spaces.",
        "- 1 means the manifestation is present.",
        "- 0 means it is not present.",
        'Example output: "0 1 0 0 1 0"',
        "",
        "Important:",
        "- Return ONLY the 6 digits, nothing else.",
        "- Do NOT write label names or explanations.",
        "",
        "New post:",
        f"{text}",
        "",
        "Answer (6 digits for the 6 labels, in order):",
    ]
    return "\n".join(prompt_lines)


# ------------------------------------------------------------
# Output parsers (safer: look at last non-empty line)
# ------------------------------------------------------------
def parse_t1_output(text: str) -> int:
    """
    Parse a Subtask 1 answer into 0 or 1.

    Only the last non-blank line of `text` is inspected; the first "0" or "1"
    character found on it is returned as an int. Returns 0 when `text` is
    None, contains no non-blank line, or that line has no 0/1 character.
    """
    if text is None:
        return 0

    # Walk all lines and remember the most recent non-blank one.
    final_line = ""
    for raw_line in str(text).splitlines():
        candidate = raw_line.strip()
        if candidate:
            final_line = candidate

    found = re.search(r"[01]", final_line)
    return int(found.group(0)) if found else 0


def parse_digit_vector(text: str, n_labels: int) -> np.ndarray:
    """
    Parse a multi-label answer into a 0/1 integer vector of length n_labels.

    Only the last non-blank line of `text` is inspected. Every "0" or "1"
    character on that line is collected in order; the first `n_labels` of
    them are used and any missing positions are filled with 0.

    Args:
        text: raw model output (None is allowed).
        n_labels: required length of the returned vector.

    Returns:
        Integer numpy array of shape (n_labels,). All zeros when `text` is
        None or has no non-blank line.
    """
    # Start from all zeros so short or missing answers are padded implicitly.
    vector = np.zeros(n_labels, dtype=int)
    if text is None:
        return vector

    lines = [line.strip() for line in str(text).splitlines() if line.strip()]
    if not lines:
        return vector

    digits = re.findall(r"[01]", lines[-1])[:n_labels]
    vector[:len(digits)] = [int(d) for d in digits]
    return vector


# ------------------------------------------------------------
# Batch inference helper
# ------------------------------------------------------------
def _extract_assistant_text(result) -> str:
    """
    Return the generated assistant text from one pipeline result.

    Accepted shapes:
      - a dict with a "generated_text" entry;
      - a (possibly nested) list/tuple whose first element is such a dict,
        which is how the text-generation pipeline reports each item when it
        is called with a list of inputs.
    "generated_text" may be a plain string or a chat-style list of message
    dicts, in which case the content of the last message is returned.

    Raises:
        ValueError: if a result list is empty.
        TypeError: if no "generated_text" entry can be located.
    """
    record = result
    while isinstance(record, (list, tuple)):
        if not record:
            raise ValueError("Empty generation result from the pipeline")
        record = record[0]

    if not isinstance(record, dict) or "generated_text" not in record:
        raise TypeError(
            f"Unexpected pipeline output of type {type(record).__name__}"
        )

    generated = record["generated_text"]
    if isinstance(generated, str):
        return generated
    # Chat-style output: the full conversation, assistant reply last.
    return str(generated[-1]["content"])


def qwen_generate_batch(
    prompts: List[str],
    max_new_tokens: int = 32,
    batch_size: int = 4,
    pipe=None,
) -> List[str]:
    """
    Run the chat-style text-generation pipeline on a list of prompts.

    Each prompt is sent as a single-turn conversation
    (`[{"role": "user", "content": prompt}]`) with greedy decoding
    (`do_sample=False`).

    Args:
        prompts: user prompts, one per example.
        max_new_tokens: generation budget forwarded to the pipeline.
        batch_size: number of prompts handed to the pipeline per call.
        pipe: optional pipeline-like callable; defaults to the module-level
            `llm_pipe`.

    Returns:
        The assistant reply for every prompt, in input order.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
        RuntimeError: if a pipeline call returns a different number of
            results than prompts it was given.
        TypeError / ValueError: if a result has an unrecognised shape.

    NOTE(review): no `batch_size` is forwarded to the pipeline itself, so the
    chunking here only bounds the size of each call; the logged run shows the
    "using the pipelines sequentially on GPU" warning.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    generate = llm_pipe if pipe is None else pipe

    all_outputs = []
    for start in range(0, len(prompts), batch_size):
        batch_prompts = prompts[start:start + batch_size]
        messages_batch = [
            [{"role": "user", "content": p}] for p in batch_prompts
        ]
        outputs = generate(
            messages_batch,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
        if len(outputs) != len(batch_prompts):
            raise RuntimeError(
                f"Pipeline returned {len(outputs)} results "
                f"for {len(batch_prompts)} prompts"
            )
        # Do NOT fall back to str(out) on failure: the stringified result
        # contains the whole prompt (instructions and few-shot digits), so
        # the digit parsers would read labels from the prompt rather than
        # from the model's answer.
        all_outputs.extend(_extract_assistant_text(out) for out in outputs)

    return all_outputs


## Subtask 1 (binary, few-shot Qwen)

In [None]:
# ## Subtask 1 — Polarization (binary, Qwen few-shot)

# 1. Load TRAIN + DEV
t1_train_df = pd.read_csv(T1_TRAIN)
t1_dev_df   = pd.read_csv(T1_DEV)

# Fail fast if the CSVs lack the columns used below (DEV has no label column).
required_train_cols_t1 = {"id", "text", "polarization"}
required_dev_cols_t1   = {"id", "text"}
assert required_train_cols_t1.issubset(t1_train_df.columns), \
    f"T1 TRAIN missing: {required_train_cols_t1 - set(t1_train_df.columns)}"
assert required_dev_cols_t1.issubset(t1_dev_df.columns), \
    f"T1 DEV missing: {required_dev_cols_t1 - set(t1_dev_df.columns)}"

t1_train_df["polarization"] = t1_train_df["polarization"].astype(int)

print(f"[T1] TRAIN size: {len(t1_train_df)}")
print(f"[T1] DEV size  : {len(t1_dev_df)}")

# 2. Build few-shot context from TRAIN
# The same context string is reused for every TRAIN and DEV prompt.
fewshot_ctx_t1 = build_fewshot_context_t1(t1_train_df, LANG, n_pos=3, n_neg=3)
print("\n[T1] Few-shot context (truncated):\n")
print(fewshot_ctx_t1[:800], "...\n")

# 3. Build prompts for TRAIN and DEV
train_prompts_t1 = [
    build_prompt_t1(txt, LANG, fewshot_ctx_t1)
    for txt in t1_train_df["text"].astype(str).tolist()
]
dev_prompts_t1 = [
    build_prompt_t1(txt, LANG, fewshot_ctx_t1)
    for txt in t1_dev_df["text"].astype(str).tolist()
]

# 4. Run Qwen on TRAIN
# A single digit is expected, so the generation budget is kept small.
print("\n[T1] Running Qwen few-shot on TRAIN...")
train_outputs_t1 = qwen_generate_batch(
    train_prompts_t1,
    max_new_tokens=8,
    batch_size=4,
)
pred_train_t1 = np.array([parse_t1_output(o) for o in train_outputs_t1], dtype=int)

# 5. Evaluate on TRAIN
# NOTE: the few-shot examples are drawn from this same TRAIN frame, so those
# rows are scored with their own labels visible in the prompt.
y_true_t1 = t1_train_df["polarization"].values
f1_t1 = macro_f1(y_true_t1, pred_train_t1)
print("[T1] Qwen few-shot Macro-F1 on TRAIN (hard labels):", f1_t1)

# 6. Run Qwen on DEV
print("\n[T1] Running Qwen few-shot on DEV...")
dev_outputs_t1 = qwen_generate_batch(
    dev_prompts_t1,
    max_new_tokens=8,
    batch_size=4,
)
pred_dev_t1 = np.array([parse_t1_output(o) for o in dev_outputs_t1], dtype=int)

# 7. Save caches for ensemble (treat Qwen prediction as prob 0/1)
# TRAIN cache: id, prob_pos, label. DEV cache: id, prob_pos (no labels).
cache_t1_train = pd.DataFrame({
    "id":       t1_train_df["id"].astype(str).values,
    "prob_pos": pred_train_t1.astype(float),  # 0.0 or 1.0
    "label":    y_true_t1.astype(int),
})
cache_t1_dev = pd.DataFrame({
    "id":       t1_dev_df["id"].astype(str).values,
    "prob_pos": pred_dev_t1.astype(float),
})

t1_train_cache_path = CACHE_ROOT / "t1_train_probs.csv"
t1_dev_cache_path   = CACHE_ROOT / "t1_dev_probs.csv"
cache_t1_train.to_csv(t1_train_cache_path, index=False)
cache_t1_dev.to_csv(t1_dev_cache_path, index=False)

print("Saved T1 TRAIN cache:", t1_train_cache_path)
print("Saved T1 DEV cache  :", t1_dev_cache_path)

# 8. Qwen-only submission for DEV: columns id, polarization (hard 0/1).
sub1 = pd.DataFrame({
    "id":           t1_dev_df["id"].astype(str).values,
    "polarization": pred_dev_t1.astype(int),
})
sub1_path = SUB_ROOT / "subtask_1" / f"pred_{lang_fname}.csv"
sub1.to_csv(sub1_path, index=False)
print("Saved Qwen few-shot Subtask 1 submission:", sub1_path)


[T1] TRAIN size: 3222
[T1] DEV size  : 160

[T1] Few-shot context (truncated):

Example 1:
Post: "Ceasefire" lol not even close, its just palestines unconditional surrender theyre repackaging as a ceasefire to make it look better.
Label (0=non-hate, 1=hate/polarizing): 1

Example 2:
Post: So we going with the rigged election line for other countries now too ?
Label (0=non-hate, 1=hate/polarizing): 1

Example 3:
Post: How would you feel about a mass deportation of my sperm?
Label (0=non-hate, 1=hate/polarizing): 1

Example 4:
Post: Theodor Herzl, the father of Zionism speaks
Label (0=non-hate, 1=hate/polarizing): 0

Example 5:
Post: As Joe Biden leaves the White House today, I reflect on his life and legacy.
Label (0=non-hate, 1=hate/polarizing): 0

Example 6:
Post: The 2025 shareholder meeting will be in a few months. Shareholders should definitely exercise their voti ...


[T1] Running Qwen few-shot on TRAIN...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[T1] Qwen few-shot Macro-F1 on TRAIN (hard labels): 0.38849876636933006

[T1] Running Qwen few-shot on DEV...
Saved T1 TRAIN cache: cache/qwen5shots/eng/t1_train_probs.csv
Saved T1 DEV cache  : cache/qwen5shots/eng/t1_dev_probs.csv
Saved Qwen few-shot Subtask 1 submission: submissions/qwen5shots/subtask_1/pred_eng.csv


## Subtask 2 (multi-label 5, few-shot Qwen)

In [None]:
# ## Subtask 2 — Hate type (multi-label 5, Qwen few-shot)

# 1. Load TRAIN + DEV
t2_train_df = pd.read_csv(T2_TRAIN)
t2_dev_df   = pd.read_csv(T2_DEV)

# Fail fast if the CSVs lack the columns used below (DEV has no label columns).
required_train_cols_t2 = {"id", "text", *T2_LABELS}
required_dev_cols_t2   = {"id", "text"}
assert required_train_cols_t2.issubset(t2_train_df.columns), \
    f"T2 TRAIN missing: {required_train_cols_t2 - set(t2_train_df.columns)}"
assert required_dev_cols_t2.issubset(t2_dev_df.columns), \
    f"T2 DEV missing: {required_dev_cols_t2 - set(t2_dev_df.columns)}"

# Gold label matrix, columns in T2_LABELS order.
Y2_train = t2_train_df[T2_LABELS].values.astype(int)

print(f"[T2] TRAIN size: {len(t2_train_df)}")
print(f"[T2] DEV size  : {len(t2_dev_df)}")

# 2. Build few-shot context from TRAIN (label-balanced)
# The same context string is reused for every TRAIN and DEV prompt.
fewshot_ctx_t2 = build_fewshot_context_t2(
    t2_train_df,
    LANG,
    max_examples_per_label=1,
    n_negatives=2,
)
print("\n[T2] Few-shot context (truncated):\n")
print(fewshot_ctx_t2[:800], "...\n")

# 3. Build prompts
train_prompts_t2 = [
    build_prompt_t2(txt, LANG, fewshot_ctx_t2)
    for txt in t2_train_df["text"].astype(str).tolist()
]
dev_prompts_t2 = [
    build_prompt_t2(txt, LANG, fewshot_ctx_t2)
    for txt in t2_dev_df["text"].astype(str).tolist()
]

# 4. Run Qwen on TRAIN
print("\n[T2] Running Qwen few-shot on TRAIN...")
train_outputs_t2 = qwen_generate_batch(
    train_prompts_t2,
    max_new_tokens=16,
    batch_size=4,
)
# One parsed 0/1 vector per example, stacked row-wise.
pred_train_t2 = np.stack(
    [parse_digit_vector(o, n_labels=len(T2_LABELS)) for o in train_outputs_t2],
    axis=0,
)  # [N,5]

# 5. Evaluate on TRAIN (macro-F1 across labels)
# NOTE: the few-shot examples are drawn from this same TRAIN frame, so those
# rows are scored with their own labels visible in the prompt.
f1_t2 = f1_score(Y2_train, pred_train_t2, average="macro", zero_division=0)
print("[T2] Qwen few-shot Macro-F1 on TRAIN (hard multi-label):", f1_t2)

# 6. Run Qwen on DEV
print("\n[T2] Running Qwen few-shot on DEV...")
dev_outputs_t2 = qwen_generate_batch(
    dev_prompts_t2,
    max_new_tokens=16,
    batch_size=4,
)
pred_dev_t2 = np.stack(
    [parse_digit_vector(o, n_labels=len(T2_LABELS)) for o in dev_outputs_t2],
    axis=0,
)  # [N_dev,5]

# 7. Save caches for ensemble (probabilities are just 0/1 from Qwen)
# TRAIN cache: id plus prob_<label> and label_<label> per label.
# DEV cache: id plus prob_<label> per label (no gold labels).
cache_cols_train_t2 = {
    "id": t2_train_df["id"].astype(str).values
}
cache_cols_dev_t2 = {
    "id": t2_dev_df["id"].astype(str).values
}

for j, lab in enumerate(T2_LABELS):
    cache_cols_train_t2[f"prob_{lab}"]  = pred_train_t2[:, j].astype(float)
    cache_cols_train_t2[f"label_{lab}"] = Y2_train[:, j].astype(int)
    cache_cols_dev_t2[f"prob_{lab}"]    = pred_dev_t2[:, j].astype(float)

t2_train_cache = pd.DataFrame(cache_cols_train_t2)
t2_dev_cache   = pd.DataFrame(cache_cols_dev_t2)

t2_train_cache_path = CACHE_ROOT / "t2_train_probs.csv"
t2_dev_cache_path   = CACHE_ROOT / "t2_dev_probs.csv"

t2_train_cache.to_csv(t2_train_cache_path, index=False)
t2_dev_cache.to_csv(t2_dev_cache_path, index=False)

print("Saved T2 TRAIN cache:", t2_train_cache_path)
print("Saved T2 DEV cache  :", t2_dev_cache_path)

# 8. Qwen-only submission for DEV.
# Required header: id,political,racial/ethnic,religious,gender/sexual,other
# This differs from T2_LABELS order, so columns are looked up by name rather
# than by position.

idx_gender    = T2_LABELS.index("gender/sexual")
idx_political = T2_LABELS.index("political")
idx_religious = T2_LABELS.index("religious")
idx_racial    = T2_LABELS.index("racial/ethnic")
idx_other     = T2_LABELS.index("other")

sub2 = pd.DataFrame({
    "id":            t2_dev_df["id"].astype(str).values,
    "political":     pred_dev_t2[:, idx_political],
    "racial/ethnic": pred_dev_t2[:, idx_racial],
    "religious":     pred_dev_t2[:, idx_religious],
    "gender/sexual": pred_dev_t2[:, idx_gender],
    "other":         pred_dev_t2[:, idx_other],
})
sub2_path = SUB_ROOT / "subtask_2" / f"pred_{lang_fname}.csv"
sub2.to_csv(sub2_path, index=False)
print("Saved Qwen few-shot Subtask 2 submission:", sub2_path)


[T2] TRAIN size: 3222
[T2] DEV size  : 160

[T2] Few-shot context (truncated):

Example 1:
Post: CriTiQuE gEnDeR iDeOLoGy harass trans people and their families
Labels (gender/sexual, political, religious, racial/ethnic, other): 1 0 0 0 0

Example 2:
Post: Isnt there supposed to be proPalestine activists on this platform? Dont you want to say "Free Palestine" without getting banned?
Labels (gender/sexual, political, religious, racial/ethnic, other): 0 1 0 0 0

Example 3:
Post: Christian nationalism is just fascism wrapped in a flag standing on a Bible, but all the pages of the Bible are blank.
Labels (gender/sexual, political, religious, racial/ethnic, other): 0 0 1 0 0

Example 4:
Post: I mean he def puts the X in xenophobia.
Labels (gender/sexual, political, religious, racial/ethnic, other): 0 0 0 1 0

Example 5:
Post: Its the NY Post. Theyre literally fake news.
La ...


[T2] Running Qwen few-shot on TRAIN...
[T2] Qwen few-shot Macro-F1 on TRAIN (hard multi-label): 0.127395486510965

## Subtask 3 (multi-label 6, few-shot Qwen)

In [None]:
# ## Subtask 3 — Manifestation (multi-label 6, Qwen few-shot)

# 1. Load TRAIN + DEV
t3_train_df = pd.read_csv(T3_TRAIN)
t3_dev_df   = pd.read_csv(T3_DEV)

# Fail fast if the CSVs lack the columns used below (DEV has no label columns).
required_train_cols_t3 = {"id", "text", *T3_LABELS}
required_dev_cols_t3   = {"id", "text"}
assert required_train_cols_t3.issubset(t3_train_df.columns), \
    f"T3 TRAIN missing: {required_train_cols_t3 - set(t3_train_df.columns)}"
assert required_dev_cols_t3.issubset(t3_dev_df.columns), \
    f"T3 DEV missing: {required_dev_cols_t3 - set(t3_dev_df.columns)}"

# Gold label matrix, columns in T3_LABELS order.
Y3_train = t3_train_df[T3_LABELS].values.astype(int)

print(f"[T3] TRAIN size: {len(t3_train_df)}")
print(f"[T3] DEV size  : {len(t3_dev_df)}")

# 2. Build few-shot context from TRAIN (label-balanced)
# The same context string is reused for every TRAIN and DEV prompt.
fewshot_ctx_t3 = build_fewshot_context_t3(
    t3_train_df,
    LANG,
    max_examples_per_label=1,
    n_negatives=2,
)
print("\n[T3] Few-shot context (truncated):\n")
print(fewshot_ctx_t3[:800], "...\n")

# 3. Build prompts
train_prompts_t3 = [
    build_prompt_t3(txt, LANG, fewshot_ctx_t3)
    for txt in t3_train_df["text"].astype(str).tolist()
]
dev_prompts_t3 = [
    build_prompt_t3(txt, LANG, fewshot_ctx_t3)
    for txt in t3_dev_df["text"].astype(str).tolist()
]

# 4. Run Qwen on TRAIN
print("\n[T3] Running Qwen few-shot on TRAIN...")
train_outputs_t3 = qwen_generate_batch(
    train_prompts_t3,
    max_new_tokens=16,
    batch_size=4,
)
# One parsed 0/1 vector per example, stacked row-wise.
pred_train_t3 = np.stack(
    [parse_digit_vector(o, n_labels=len(T3_LABELS)) for o in train_outputs_t3],
    axis=0,
)  # [N,6]

# 5. Evaluate on TRAIN
# NOTE: the few-shot examples are drawn from this same TRAIN frame, so those
# rows are scored with their own labels visible in the prompt.
f1_t3 = f1_score(Y3_train, pred_train_t3, average="macro", zero_division=0)
print("[T3] Qwen few-shot Macro-F1 on TRAIN (hard multi-label):", f1_t3)

# 6. Run Qwen on DEV
print("\n[T3] Running Qwen few-shot on DEV...")
dev_outputs_t3 = qwen_generate_batch(
    dev_prompts_t3,
    max_new_tokens=16,
    batch_size=4,
)
pred_dev_t3 = np.stack(
    [parse_digit_vector(o, n_labels=len(T3_LABELS)) for o in dev_outputs_t3],
    axis=0,
)  # [N_dev,6]

# 7. Save caches for ensemble
# TRAIN cache: id plus prob_<label> and label_<label> per label.
# DEV cache: id plus prob_<label> per label (no gold labels).
# The "prob" values are the hard 0/1 predictions cast to float.
cache_cols_train_t3 = {
    "id": t3_train_df["id"].astype(str).values
}
cache_cols_dev_t3 = {
    "id": t3_dev_df["id"].astype(str).values
}

for j, lab in enumerate(T3_LABELS):
    cache_cols_train_t3[f"prob_{lab}"]  = pred_train_t3[:, j].astype(float)
    cache_cols_train_t3[f"label_{lab}"] = Y3_train[:, j].astype(int)
    cache_cols_dev_t3[f"prob_{lab}"]    = pred_dev_t3[:, j].astype(float)

t3_train_cache = pd.DataFrame(cache_cols_train_t3)
t3_dev_cache   = pd.DataFrame(cache_cols_dev_t3)

t3_train_cache_path = CACHE_ROOT / "t3_train_probs.csv"
t3_dev_cache_path   = CACHE_ROOT / "t3_dev_probs.csv"

t3_train_cache.to_csv(t3_train_cache_path, index=False)
t3_dev_cache.to_csv(t3_dev_cache_path, index=False)

print("Saved T3 TRAIN cache:", t3_train_cache_path)
print("Saved T3 DEV cache  :", t3_dev_cache_path)

# 8. Qwen-only submission for DEV.
# Required header:
#   id,stereotype,vilification,dehumanization,
#   extreme_language,lack_of_empathy,invalidation
# This differs from T3_LABELS order, so columns are looked up by name rather
# than by position.

idx_vil      = T3_LABELS.index("vilification")
idx_extreme  = T3_LABELS.index("extreme_language")
idx_stereo   = T3_LABELS.index("stereotype")
idx_invalid  = T3_LABELS.index("invalidation")
idx_lackemp  = T3_LABELS.index("lack_of_empathy")
idx_dehum    = T3_LABELS.index("dehumanization")

sub3 = pd.DataFrame({
    "id":               t3_dev_df["id"].astype(str).values,
    "stereotype":       pred_dev_t3[:, idx_stereo],
    "vilification":     pred_dev_t3[:, idx_vil],
    "dehumanization":   pred_dev_t3[:, idx_dehum],
    "extreme_language": pred_dev_t3[:, idx_extreme],
    "lack_of_empathy":  pred_dev_t3[:, idx_lackemp],
    "invalidation":     pred_dev_t3[:, idx_invalid],
})
sub3_path = SUB_ROOT / "subtask_3" / f"pred_{lang_fname}.csv"
sub3.to_csv(sub3_path, index=False)
print("Saved Qwen few-shot Subtask 3 submission:", sub3_path)


[T3] TRAIN size: 3222
[T3] DEV size  : 160

[T3] Few-shot context (truncated):

Example 1:
Post: This is our border security, were waiting for the fat fascist whenever hes ready.
Labels (vilification, extreme_language, stereotype, invalidation, lack_of_empathy, dehumanization): 1 0 0 0 0 0

Example 2:
Post: AMERICANS MUST TAKE TO THE STREETS IN SUPPORT OF ALL MILITARY AID TO THE UKRAINIANS!
Labels (vilification, extreme_language, stereotype, invalidation, lack_of_empathy, dehumanization): 0 1 0 0 0 0

Example 3:
Post: Time for a lasting ceasefire. And arrest Netanyahu
Labels (vilification, extreme_language, stereotype, invalidation, lack_of_empathy, dehumanization): 0 0 1 0 0 0

Example 4:
Post: YIAY2025 Donald Trump is unelected president
Labels (vilification, extreme_language, stereotype, invalidation, lack_of_empathy, dehumanization): 0 0 0 1 0 0

Example 5:
Post:  ...


[T3] Running Qwen few-shot on TRAIN...
[T3] Qwen few-shot Macro-F1 on TRAIN (hard multi-label): 0.178160708996697

: 