## Setup, config, paths, and model load

In [1]:
# # Qwen Data Augmentation (with Resume, Subtask1-only paraphrasing)
#
# - Uses Qwen2.5-7B-Instruct to paraphrase Subtask 1 training data.
# - Writes augmented CSVs into `../dev_phase_aug`.
# - Can resume per row using:
#   - `cache/qwen_aug_progress/{LANG}/subtask1/paraphrases.csv`
#   - `cache/qwen_aug_progress/{LANG}/subtask1/progress.json`
# - For Subtask 2 and 3, we do not paraphrase again:
#   - All three subtasks share the same rows / texts.
#   - We:
#       1. Read the augmented Subtask 1 train CSV (original + Qwen paraphrases).
#       2. Strip `_augX` to get `base_id`.
#       3. Copy multi-labels from original Subtask 2/3 train files based on `base_id`.
# - Final structure:
#   - `../dev_phase_aug/subtask1/train/{LANG}.csv`   (original + Qwen paraphrases)
#   - `../dev_phase_aug/subtask2/train/{LANG}.csv`   (labels propagated from original T2)
#   - `../dev_phase_aug/subtask3/train/{LANG}.csv`   (labels propagated from original T3)
# - After running this notebook:
#   - In your XLM-R / DeBERTa / ensemble notebooks, set:
#       `BASE = "../dev_phase_aug"`

import os
import random
import json
import warnings
from pathlib import Path
from typing import List, Optional
import re

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch

try:
    from tqdm.auto import tqdm
except ImportError:
    tqdm = lambda x, **kwargs: x  # fallback: no progress bar

from transformers import AutoTokenizer, AutoModelForCausalLM

print("PyTorch:", torch.__version__)

# ------------------------------------------------------------
# Device selection (Qwen generation)
# ------------------------------------------------------------
# Requested device. A "gpu" request falls back to the CPU branch below
# whenever CUDA is not available.
RUN_DEVICE = "gpu"  # "gpu" or "cpu"

if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    torch.backends.cudnn.benchmark = True
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    DEVICE = torch.device("cpu")
    torch.backends.cudnn.enabled = False
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to 1 so the integer division cannot raise TypeError.
    # Half of the cores are used, but never fewer than one thread.
    torch.set_num_threads(max(1, (os.cpu_count() or 1) // 2))
    print("Using CPU")

# ------------------------------------------------------------
# Reproducibility
# ------------------------------------------------------------
# One seed shared by every RNG this notebook uses (Python, NumPy, PyTorch).
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# The CUDA generators are only seeded when the selected device is a GPU.
if DEVICE.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

# ------------------------------------------------------------
# Dataset + language config
# ------------------------------------------------------------
# Language to augment: "eng", "ben" or "hin".
LANG = "eng"
lang_fname = LANG  # stem of the per-language CSV files

# Root of the original data (same as method1/method2 notebooks)
BASE = Path("../dev_phase")

# Root of the augmented copy produced by this notebook
AUG_BASE = Path("../dev_phase_aug")


def _train_csv(root: Path, subtask: str) -> Path:
    """Return `<root>/<subtask>/train/<lang_fname>.csv`."""
    return root / subtask / "train" / f"{lang_fname}.csv"


# Inputs (original data) and outputs (augmented data) share one layout.
T1_TRAIN_IN = _train_csv(BASE, "subtask1")
T2_TRAIN_IN = _train_csv(BASE, "subtask2")
T3_TRAIN_IN = _train_csv(BASE, "subtask3")

T1_TRAIN_OUT = _train_csv(AUG_BASE, "subtask1")
T2_TRAIN_OUT = _train_csv(AUG_BASE, "subtask2")
T3_TRAIN_OUT = _train_csv(AUG_BASE, "subtask3")

# Create every output folder up front so later writes cannot fail on a
# missing directory.
for p in (T1_TRAIN_OUT, T2_TRAIN_OUT, T3_TRAIN_OUT):
    p.parent.mkdir(parents=True, exist_ok=True)

# Label column orders (must match your other notebooks)
T2_LABELS = [
    "gender/sexual",
    "political",
    "religious",
    "racial/ethnic",
    "other",
]
T3_LABELS = [
    "vilification",
    "extreme_language",
    "stereotype",
    "invalidation",
    "lack_of_empathy",
    "dehumanization",
]

# ------------------------------------------------------------
# Augmentation hyperparameters
# ------------------------------------------------------------

# Paraphrases generated per selected example (Subtask 1 only)
NUM_PARAPHRASES_T1 = 2

# Cap on the number of original Subtask 1 examples to augment (None = no limit)
MAX_SAMPLES_T1 = None

print("LANG =", LANG)
print("Original base:", BASE)
print("Augmented base:", AUG_BASE)

# ------------------------------------------------------------
# Load Qwen model & tokenizer
# ------------------------------------------------------------
QWEN_MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

print(f"Loading Qwen model: {QWEN_MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL_NAME)

# bfloat16 weights on the GPU, plain float32 on the CPU.
if DEVICE.type == "cuda":
    dtype = torch.bfloat16
else:
    dtype = torch.float32

# NOTE(review): the recorded run of this cell logs "`torch_dtype` is
# deprecated! Use `dtype` instead!". The keyword is kept unchanged here;
# confirm the minimum transformers version before switching to `dtype=`.
model = AutoModelForCausalLM.from_pretrained(QWEN_MODEL_NAME, torch_dtype=dtype)
model.to(DEVICE)
model.eval()  # evaluation mode: the model is only used for generation

print("Qwen loaded and ready.")


PyTorch: 2.9.0
Using GPU: NVIDIA H100 80GB HBM3 MIG 2g.20gb
LANG = eng
Original base: ../dev_phase
Augmented base: ../dev_phase_aug
Loading Qwen model: Qwen/Qwen2.5-7B-Instruct


`torch_dtype` is deprecated! Use `dtype` instead!
2025-12-08 15:52:59.479302: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-08 15:52:59.492994: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765237979.502724   68492 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765237979.505387   68492 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765237979.514708   68492 computation_placer.cc:177] computation placer already r

Qwen loaded and ready.


## Qwen paraphrasing helpers

In [2]:
# ## Paraphrasing helpers using Qwen (sentence-by-sentence, with quality checks)
#
# Key ideas:
# - Split each comment into sentences.
# - Paraphrase one sentence at a time with Qwen.
# - For each example, build N paraphrases by paraphrasing each sentence N times and concatenating.
# - Use relative length heuristics so short sentences are allowed, but we avoid tiny fragments.

# Limits applied when validating Qwen output for a single sentence.
MAX_QWEN_RETRIES = 3  # generation attempts per sentence
MIN_ABS_CHARS = 5     # absolute minimum paraphrase length (chars), even for very short sentences
MIN_REL_CHARS = 0.4   # paraphrase must reach at least 40% of the original length (chars)


def split_into_sentences(text: str) -> List[str]:
    """
    Very simple multilingual-ish sentence splitter.

    - Splits after runs of '.', '!', '?' or the danda '।'.
    - Keeps the terminating punctuation attached to its sentence.
    - Terminal punctuation at the very start of the text (e.g. '...wait')
      stays attached to the first sentence instead of being dropped.

    Args:
        text: Comment to split; it is converted with `str()` and stripped.

    Returns:
        List of non-empty, stripped sentences. An empty/whitespace-only text
        gives `[]`; a text made only of terminators is returned as a single
        element.

    Note: this is a heuristic. It also splits inside tokens such as '3.5' or
    'example.com', which is acceptable for short social media comments.
    """
    text = str(text).strip()
    if not text:
        return []

    # optional leading terminators (only possible at the start of the text,
    # because every match greedily consumes its trailing terminators),
    # then the sentence body, then its trailing terminators.
    pattern = r'[.!?।]*[^.!?।]+[.!?।]*'
    sentences = []
    for match in re.finditer(pattern, text):
        chunk = match.group().strip()
        if chunk:
            sentences.append(chunk)

    # Fallback: nothing matched (e.g. the text is only punctuation), so
    # return the whole text as one sentence.
    if not sentences:
        return [text]
    return sentences


def qwen_generate(prompt: str, max_new_tokens: int = 64) -> str:
    """
    Send one user prompt to Qwen through its chat template and return the reply.

    Relies on the module-level `tokenizer`, `model` and `DEVICE`.

    Args:
        prompt: Content of the user turn.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded assistant text (special tokens skipped), stripped of
        surrounding whitespace.
    """
    system_prompt = (
        "You are a helpful assistant that paraphrases short social media sentences "
        "for a hate-speech and toxicity detection dataset. "
        "Always answer with a single paraphrased sentence, not explanations."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    # Render the conversation to text first, then tokenize it ourselves.
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = tokenizer(rendered, return_tensors="pt").to(DEVICE)
    prompt_len = encoded["input_ids"].shape[-1]

    # Sampling (not greedy decoding), so repeated calls can yield different text.
    sampling_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        generated = model.generate(**encoded, **sampling_kwargs)

    # Keep only what follows the prompt tokens in the first returned sequence.
    completion_ids = generated[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()


def build_single_sentence_prompt(
    sentence: str,
    lang: str,
    labels_description: str,
) -> str:
    """
    Prompt for a single sentence.
    We explicitly require one paraphrased sentence in the same language.
    """
    prompt = f"""
You are rewriting a single short social media sentence for a toxicity and hate-speech detection dataset.

Language: {lang}
Original sentence:
{sentence}

Labels for this example:
{labels_description}

Task:
- Rewrite this ONE sentence in the SAME LANGUAGE ({lang}).
- Keep exactly the same meaning and labels (do NOT change whether this is hate speech or which categories apply).
- Make it sound natural for social media.
- Do NOT add explanations.
- Do NOT translate to any other language.

Output format:
- Write exactly ONE paraphrased sentence.
- Do NOT add numbering, bullet points, or any extra text.
""".strip()
    return prompt


def _clean_single_line(text: str) -> str:
    """
    Take Qwen's raw output and:
    - Keep only the first non-empty line.
    - Strip simple numbering like '1)' or '1.' at the start.
    """
    lines = [l.strip() for l in text.split("\n") if l.strip()]
    if not lines:
        return ""

    line = lines[0]
    line = re.sub(r"^[0-9]+[\).\-\:]\s*", "", line).strip()
    return line


def _is_good_paraphrase(candidate: str, original: str) -> bool:
    """
    Length-based sanity check for a generated paraphrase.

    A candidate passes when it is a non-empty string, the original is
    non-empty, and the stripped candidate has at least
    max(MIN_ABS_CHARS, int(MIN_REL_CHARS * len(original))) characters.
    Short sentences are therefore allowed, while tiny fragments such as
    'टीवी से फ' are rejected.

    Args:
        candidate: Paraphrase proposed by the model.
        original: Sentence the paraphrase was generated from.

    Returns:
        True if the candidate passes the check, False otherwise.
    """
    if not isinstance(candidate, str):
        return False

    cand_text = candidate.strip()
    source_text = str(original).strip()
    if not cand_text or not source_text:
        return False

    min_required = max(MIN_ABS_CHARS, int(len(source_text) * MIN_REL_CHARS))
    return len(cand_text) >= min_required


def paraphrase_single_sentence(
    sentence: str,
    lang: str,
    labels_description: str,
    max_new_tokens: int = 64,
    max_retries: int = MAX_QWEN_RETRIES,
) -> str:
    """
    Paraphrase one sentence, retrying until a candidate passes the length check.

    Args:
        sentence: Sentence to paraphrase (converted with `str()` and stripped).
        lang: Language identifier forwarded to the prompt.
        labels_description: Label text forwarded to the prompt.
        max_new_tokens: Generation budget per attempt.
        max_retries: Maximum number of generation attempts.

    Returns:
        The first candidate accepted by `_is_good_paraphrase`; otherwise the
        stripped original sentence. An empty sentence is returned as-is.
    """
    source = str(sentence).strip()
    if not source:
        return source

    for attempt_no in range(1, max_retries + 1):
        try:
            user_prompt = build_single_sentence_prompt(
                sentence=source,
                lang=lang,
                labels_description=labels_description,
            )
            reply = qwen_generate(user_prompt, max_new_tokens=max_new_tokens)
            rewritten = _clean_single_line(reply)
            if _is_good_paraphrase(rewritten, source):
                return rewritten
        except Exception as e:
            # Best effort: log the failure and move on to the next attempt.
            print(f"[warn] Qwen single-sentence generation failed (attempt {attempt_no}): {e}")

    # Every attempt failed or was rejected: keep the original sentence.
    return source


def generate_paraphrases_for_example(
    text: str,
    lang: str,
    labels_description: str,
    num_paraphrases: int,
    max_new_tokens: int = 64,
) -> List[str]:
    """
    Main API used by the augmentation functions.

    The comment is split into sentences once. Each of the `num_paraphrases`
    variants is built by paraphrasing every sentence individually and joining
    the results with single spaces.

    Args:
        text: Comment to paraphrase.
        lang: Language identifier forwarded to the prompt.
        labels_description: Label text forwarded to the prompt.
        num_paraphrases: Number of variants to build.
        max_new_tokens: Generation budget per sentence.

    Returns:
        List with one paraphrased comment per variant, or `[]` when the text
        contains no sentence.

    NOTE(review): a sentence whose paraphrasing fails falls back to the
    original sentence, so a variant can be identical to the input text;
    variants are not de-duplicated here.
    """
    sentences = split_into_sentences(text)
    if not sentences:
        return []

    def _one_variant() -> str:
        rewritten = [
            paraphrase_single_sentence(
                sentence=s,
                lang=lang,
                labels_description=labels_description,
                max_new_tokens=max_new_tokens,
            )
            for s in sentences
        ]
        return " ".join(rewritten).strip()

    return [_one_variant() for _ in range(num_paraphrases)]


## Resumable augmentation logic for Subtask 1 (Subtasks 2 and 3 reuse its output)

In [3]:
# ## Augment Subtask 1 with Qwen (row-level resume)
#
# - Reads original Subtask 1 train CSV from `../dev_phase/subtask1/train/{LANG}.csv`.
# - Writes augmented train CSV to `../dev_phase_aug/subtask1/train/{LANG}.csv`.
# - Keeps a per-language, per-subtask progress cache in:
#   - `cache/qwen_aug_progress/{LANG}/subtask1/paraphrases.csv`
#   - `cache/qwen_aug_progress/{LANG}/subtask1/progress.json`
# - If interrupted, just rerun: it resumes from where it left off (per original id).
# - It then rebuilds the final augmented train CSV from:
#     original + all paraphrases cached so far.

# Per-language folder that holds the resume state (cached paraphrases + progress)
PROGRESS_ROOT = Path("cache", "qwen_aug_progress", LANG)
PROGRESS_ROOT.mkdir(parents=True, exist_ok=True)

# Flush threshold used by augment_subtask1_for_lang when persisting
# paraphrases + progress to disk.
FLUSH_EVERY = 20


# ------------------------ progress & cache helpers ------------------------ #

def _progress_file(subtask_id: int) -> Path:
    """Return the path of `progress.json` for the given subtask under PROGRESS_ROOT."""
    subtask_dir = PROGRESS_ROOT / f"subtask{subtask_id}"
    return subtask_dir / "progress.json"


def _paraphrase_cache_file(subtask_id: int) -> Path:
    """Return the path of `paraphrases.csv` for the given subtask under PROGRESS_ROOT."""
    subtask_dir = PROGRESS_ROOT / f"subtask{subtask_id}"
    return subtask_dir / "paraphrases.csv"


def _read_progress(subtask_id: int) -> set:
    """
    Load the ids of the original examples already processed for a subtask.

    Returns:
        The set stored under "done_ids" in the progress file, or an empty
        set when the file (or that key) does not exist.
    """
    progress_path = _progress_file(subtask_id)
    if progress_path.exists():
        payload = json.loads(progress_path.read_text(encoding="utf-8"))
        # "done_ids" is written as a list; expose it as a set.
        return set(payload.get("done_ids", []))
    return set()


def _write_progress(subtask_id: int, done_ids: set):
    """
    Save the set of original ids already processed for a subtask.

    The JSON is first written to a sibling `*.tmp` file and then moved over
    the real progress file with `os.replace`. An interrupt in the middle of
    the write therefore cannot leave a truncated `progress.json` that would
    make the next resume fail while parsing it.

    Args:
        subtask_id: Subtask whose progress file is updated.
        done_ids: Ids to persist; stored as a sorted list under "done_ids".
    """
    p = _progress_file(subtask_id)
    p.parent.mkdir(parents=True, exist_ok=True)

    tmp_path = p.with_name(p.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump({"done_ids": sorted(done_ids)}, f, indent=2)
    # Replace the previous file in one step (same directory as the target).
    os.replace(tmp_path, p)


def _append_paraphrases(subtask_id: int, df_rows: pd.DataFrame):
    """
    Append freshly generated paraphrase rows to the subtask's cache CSV
    (cache/qwen_aug_progress/LANG/subtaskX/paraphrases.csv).

    Nothing is written when `df_rows` is None or empty. The header row is
    only emitted when the cache file does not exist yet.
    """
    has_rows = df_rows is not None and len(df_rows) > 0
    if not has_rows:
        return

    target = _paraphrase_cache_file(subtask_id)
    target.parent.mkdir(parents=True, exist_ok=True)
    write_header = not target.exists()
    df_rows.to_csv(target, mode="a", index=False, header=write_header)


def _load_paraphrases(subtask_id: int) -> Optional[pd.DataFrame]:
    """Read the subtask's paraphrase cache CSV, or return None if it does not exist."""
    cached_csv = _paraphrase_cache_file(subtask_id)
    return pd.read_csv(cached_csv) if cached_csv.exists() else None


def _rebuild_augmented_train(
    subtask_id: int,
    orig_df: pd.DataFrame,
    out_path: Path,
    keep_cols: list,
):
    """
    Combine the original train frame with all cached paraphrases and write
    the final augmented CSV.

    Cached rows that share the same `id` are collapsed to the most recent
    one. Such duplicates appear when a run is interrupted after the cache
    was appended but before the progress file was updated, because the
    affected examples are then paraphrased (and appended) again on resume.

    Args:
        subtask_id: Subtask whose paraphrase cache is read.
        orig_df: Original training data; written first, unchanged.
        out_path: Destination CSV (parent folders are created).
        keep_cols: Columns taken from the paraphrase cache.
    """
    df_par = _load_paraphrases(subtask_id)
    if df_par is not None and len(df_par) > 0:
        df_par = df_par[keep_cols]
        if "id" in df_par.columns:
            n_before = len(df_par)
            df_par = df_par.drop_duplicates(subset="id", keep="last")
            n_dups = n_before - len(df_par)
            if n_dups:
                print(
                    f"[Subtask {subtask_id}] Dropped {n_dups} duplicate cached "
                    f"paraphrase rows (same id)."
                )
        df_aug = pd.concat([orig_df, df_par], ignore_index=True)
    else:
        df_aug = orig_df.copy()

    out_path.parent.mkdir(parents=True, exist_ok=True)
    df_aug.to_csv(out_path, index=False)
    print(
        f"[Subtask {subtask_id}] Rebuilt augmented TRAIN at {out_path} "
        f"(size={len(df_aug)})"
    )


# ------------------------ Subtask 1: polarization ------------------------ #

def augment_subtask1_for_lang(
    lang: str,
    num_paraphrases: int = 2,
    max_samples: Optional[int] = None,
) -> None:
    """
    Paraphrase the positive Subtask 1 training examples with Qwen (resumable).

    Reads `T1_TRAIN_IN`, paraphrases every row with `polarization == 1` whose
    id is not yet recorded in the progress file, caches the new rows, and
    finally rebuilds `T1_TRAIN_OUT` from the original data plus all cached
    paraphrases.

    Persistence:
    - Paraphrases and progress are flushed every `FLUSH_EVERY` processed
      source rows.
    - Whatever is still pending is flushed when the loop ends, including
      when it ends through an exception or a keyboard interrupt, and also
      when the last rows of the frame were skipped as already done.

    Args:
        lang: Language identifier forwarded to the paraphrasing prompt.
        num_paraphrases: Paraphrases requested per example.
        max_samples: Optional cap on the number of positive examples to
            augment (sampled with a fixed RandomState(42)); None or a
            non-positive value means no cap.
    """
    subtask_id = 1
    in_path = T1_TRAIN_IN
    out_path = T1_TRAIN_OUT

    print(f"[T1] Reading TRAIN from: {in_path}")
    df = pd.read_csv(in_path)
    assert {"id", "text", "polarization"}.issubset(df.columns)

    print(f"[T1] Original TRAIN size: {len(df)}")
    pos_mask = df["polarization"].astype(int) == 1
    df_pos = df[pos_mask].reset_index(drop=True)

    # optional cap on number of positives we augment
    if max_samples is not None and max_samples > 0 and len(df_pos) > max_samples:
        rs = np.random.RandomState(42)
        idx = rs.choice(len(df_pos), size=max_samples, replace=False)
        df_pos = df_pos.iloc[idx].reset_index(drop=True)

    print(
        f"[T1] Will augment {len(df_pos)} positive examples "
        f"with {num_paraphrases} paraphrases each."
    )

    # progress: which original ids are already done
    done_ids = _read_progress(subtask_id)
    print(f"[T1] Already completed examples (from cache): {len(done_ids)}")

    new_rows_buf = []   # paraphrase rows not yet written to the cache CSV
    pending_ids = []    # source ids processed since the last flush

    def _flush_pending() -> None:
        """Write buffered paraphrases first, then the updated progress file."""
        if new_rows_buf:
            _append_paraphrases(subtask_id, pd.DataFrame(new_rows_buf))
            new_rows_buf.clear()
        _write_progress(subtask_id, done_ids)
        pending_ids.clear()

    try:
        for i in tqdm(range(len(df_pos)), desc="[T1] Augmenting", total=len(df_pos)):
            row = df_pos.iloc[i]
            orig_id = str(row["id"])
            if orig_id in done_ids:
                continue  # skip already processed example

            orig_text = str(row["text"])
            label = int(row["polarization"])
            label_name = "polarized" if label == 1 else "not polarized"
            labels_desc = f"polarization = {label} ({label_name})"

            paras = generate_paraphrases_for_example(
                text=orig_text,
                lang=lang,
                labels_description=labels_desc,
                num_paraphrases=num_paraphrases,
            )

            # Buffer the rows of this example and mark it done together, so
            # an interrupt during generation never records a half-processed id.
            new_rows_buf.extend(
                {
                    "id": f"{orig_id}_aug{k+1}",
                    "text": p,
                    "polarization": label,
                    "src_id": orig_id,
                    "is_aug": 1,
                }
                for k, p in enumerate(paras)
            )
            done_ids.add(orig_id)
            pending_ids.append(orig_id)

            # Periodic flush, counted in processed source rows.
            if len(pending_ids) >= FLUSH_EVERY:
                _flush_pending()
    finally:
        # Final flush: runs on normal completion and on interruption alike.
        if pending_ids:
            _flush_pending()

    # finally rebuild augmented train CSV (original + all paraphrases so far)
    _rebuild_augmented_train(
        subtask_id=subtask_id,
        orig_df=df,
        out_path=out_path,
        keep_cols=["id", "text", "polarization"],
    )


## Build Subtask 2/3 label files from Subtask 1 and run augmentation for the chosen language

In [4]:
# ## Build augmented Subtask 2 & 3 from augmented Subtask 1
#
# Assumption: for a given `LANG`:
# - `subtask1/train/LANG.csv`, `subtask2/train/LANG.csv`, `subtask3/train/LANG.csv`
#   all have the same rows and the same `id`/`text` pairs in the original data.
# - We first run Qwen to paraphrase Subtask 1 and save:
#     `../dev_phase_aug/subtask1/train/LANG.csv`
#   which contains both original rows and rows with IDs like `<orig_id>_aug1`.
#
# Then:
# - For Subtask 2 and 3, we DO NOT paraphrase again.
# - Instead, we:
#   * Read the augmented Subtask 1 CSV (T1_TRAIN_OUT).
#   * Derive `base_id` by stripping `_augX` suffix.
#   * Join labels from the original Subtask 2/3 train CSV by `base_id`.
#   * Save new augmented Subtask 2/3 train CSVs into `../dev_phase_aug`.

def _add_base_id_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a 'base_id' column:
    - For original rows: base_id == id
    - For augmented rows like 'abc123_aug1': base_id == 'abc123'
    """
    df = df.copy()
    df["id"] = df["id"].astype(str)
    df["base_id"] = df["id"].str.replace(r"_aug\d+$", "", regex=True)
    return df


def _build_augmented_multilabel_from_t1(
    subtask_id: int,
    label_cols: List[str],
    orig_path: Path,
    out_path: Path,
) -> None:
    """
    Shared implementation for Subtask 2 and 3: attach the original
    multi-label columns to every row of the augmented Subtask 1 CSV.

    Steps:
    - Read the augmented Subtask 1 CSV (`T1_TRAIN_OUT`) and the original
      train CSV of the target subtask.
    - Derive `base_id` for every augmented row and left-join the labels on it.
    - Drop (with a warning) rows that found no labels, cast the labels to
      int, and write `id`, `text` and the label columns to `out_path`.

    Args:
        subtask_id: 2 or 3; only used for the `[T2]` / `[T3]` message prefix.
        label_cols: Label column names of the target subtask.
        orig_path: Original train CSV of the target subtask (label source).
        out_path: Destination of the augmented train CSV.

    Raises:
        FileNotFoundError: if the augmented Subtask 1 CSV or the original
            train CSV is missing.
        AssertionError: if required columns are missing.
    """
    tag = f"T{subtask_id}"

    # T1 augmented (already contains paraphrased text)
    t1_aug_path = T1_TRAIN_OUT
    if not t1_aug_path.exists():
        raise FileNotFoundError(
            f"[{tag}] Expected augmented Subtask 1 file not found: {t1_aug_path}\n"
            "Run augment_subtask1_for_lang(...) first."
        )

    # Original train file of the target subtask (labels source)
    if not orig_path.exists():
        raise FileNotFoundError(f"[{tag}] Original train file not found: {orig_path}")

    print(f"[{tag}] Building augmented train from T1_aug.")
    print(f"     T1_aug: {t1_aug_path}")
    print(f"     {tag}_orig: {orig_path}")

    t1_aug = pd.read_csv(t1_aug_path)
    orig = pd.read_csv(orig_path)

    assert {"id", "text"}.issubset(t1_aug.columns), \
        f"[{tag}] T1_aug must have 'id' and 'text'."
    assert {"id", "text", *label_cols}.issubset(orig.columns), \
        f"[{tag}] original {tag} train must have 'id', 'text' and all {tag}_LABELS."

    # Add base_id to T1_aug (original + augmented rows); this casts ids to str.
    t1_aug = _add_base_id_column(t1_aug)

    # Map base_id -> labels. The key is cast to str as well, so the merge
    # also works when the original ids were parsed as numbers.
    labels_df = (
        orig[["id"] + label_cols]
        .rename(columns={"id": "base_id"})
        .assign(base_id=lambda d: d["base_id"].astype(str))
    )

    # Join labels; "m:1" makes pandas raise if the label file repeats an id.
    merged = t1_aug.merge(labels_df, on="base_id", how="left", validate="m:1")

    # Rows without labels should not exist if the ids of both files match.
    missing = merged[label_cols].isna().any(axis=1)
    if missing.any():
        n_miss = int(missing.sum())
        print(
            f"[{tag}] WARNING: {n_miss} rows in T1_aug had no matching labels "
            f"in {tag}_orig. Dropping them."
        )
        merged = merged[~missing].copy()

    # Ensure integer label types (the left join may have produced floats).
    merged = merged.astype({lab: int for lab in label_cols})

    # Keep only id, text and the labels, in that order (drops base_id too).
    merged = merged[["id", "text"] + label_cols]

    out_path.parent.mkdir(parents=True, exist_ok=True)
    merged.to_csv(out_path, index=False)
    print(f"[{tag}] Saved augmented TRAIN to: {out_path}  (rows={len(merged)})")


def build_augmented_subtask2_from_t1(lang: str) -> None:
    """
    Use augmented Subtask 1 CSV + original Subtask 2 labels
    to build augmented Subtask 2 train CSV.

    Args:
        lang: Not used by the implementation (paths come from the module-level
            configuration); kept so existing calls keep working.
    """
    _build_augmented_multilabel_from_t1(
        subtask_id=2,
        label_cols=T2_LABELS,
        orig_path=T2_TRAIN_IN,
        out_path=T2_TRAIN_OUT,
    )


def build_augmented_subtask3_from_t1(lang: str) -> None:
    """
    Use augmented Subtask 1 CSV + original Subtask 3 labels
    to build augmented Subtask 3 train CSV.

    Args:
        lang: Not used by the implementation (paths come from the module-level
            configuration); kept so existing calls keep working.
    """
    _build_augmented_multilabel_from_t1(
        subtask_id=3,
        label_cols=T3_LABELS,
        orig_path=T3_TRAIN_IN,
        out_path=T3_TRAIN_OUT,
    )
# ## Run augmentation for this language (resume-aware + label propagation)
#
# How to use:
# 1. Pick `LANG` in the config cell (e.g. "eng", "ben", "hin").
# 2. Run all cells.
# 3. The code below then:
#    - paraphrases Subtask 1 only, resuming from the cache files
#         - `cache/qwen_aug_progress/{LANG}/subtask1/paraphrases.csv`
#         - `cache/qwen_aug_progress/{LANG}/subtask1/progress.json`
#    - builds the augmented Subtask 2 & 3 train CSVs by
#         * reading the augmented Subtask 1 train CSV and
#         * copying the labels of the original Subtask 2/3 train CSVs
#           via `base_id`.
# 4. After an interruption, simply rerun this cell:
#    - Subtask 1 continues from the cache.
#    - Subtask 2 & 3 are rebuilt from the updated T1_aug file.
# 5. Finally, in the XLM-R / DeBERTa / ensemble notebooks, set
#       `BASE = "../dev_phase_aug"`
#    so they train on the augmented data.

print("=== Qwen data augmentation for LANG =", LANG, "===\n")

# Step 1: Qwen paraphrasing for Subtask 1 (the function itself handles resume)
augment_subtask1_for_lang(
    lang=LANG,
    num_paraphrases=NUM_PARAPHRASES_T1,
    max_samples=MAX_SAMPLES_T1,
)

# Step 2: propagate the original multi-labels to Subtask 2, then Subtask 3
_label_builders = [
    (
        "\n=== Building augmented Subtask 2 from Subtask 1 paraphrases ===",
        build_augmented_subtask2_from_t1,
    ),
    (
        "\n=== Building augmented Subtask 3 from Subtask 1 paraphrases ===",
        build_augmented_subtask3_from_t1,
    ),
]
for _banner, _builder in _label_builders:
    print(_banner)
    _builder(lang=LANG)

print("\n=== Done. Augmented TRAIN CSVs are in:", AUG_BASE, "===\n")


=== Qwen data augmentation for LANG = eng ===

[T1] Reading TRAIN from: ../dev_phase/subtask1/train/eng.csv
[T1] Original TRAIN size: 3222
[T1] Will augment 1175 positive examples with 2 paraphrases each.
[T1] Already completed examples (from cache): 1002


[T1] Augmenting: 100%|██████████| 1175/1175 [02:26<00:00,  8.01it/s] 


[Subtask 1] Rebuilt augmented TRAIN at ../dev_phase_aug/subtask1/train/eng.csv (size=5572)

=== Building augmented Subtask 2 from Subtask 1 paraphrases ===
[T2] Building augmented train from T1_aug.
     T1_aug: ../dev_phase_aug/subtask1/train/eng.csv
     T2_orig: ../dev_phase/subtask2/train/eng.csv
[T2] Saved augmented TRAIN to: ../dev_phase_aug/subtask2/train/eng.csv  (rows=5572)

=== Building augmented Subtask 3 from Subtask 1 paraphrases ===
[T3] Building augmented train from T1_aug.
     T1_aug: ../dev_phase_aug/subtask1/train/eng.csv
     T3_orig: ../dev_phase/subtask3/train/eng.csv
[T3] Saved augmented TRAIN to: ../dev_phase_aug/subtask3/train/eng.csv  (rows=5572)

=== Done. Augmented TRAIN CSVs are in: ../dev_phase_aug ===



## Copy dev set from dev_phase to dev_phase_aug

In [5]:
# ## Copy DEV splits to augmented folder
#
# We do NOT change the dev/validation data content, but we mirror the folder
# structure under `../dev_phase_aug` so that all downstream notebooks can use:
#
#     BASE = "../dev_phase_aug"
#
# for both train and dev.

import shutil

tasks = ["subtask1", "subtask2", "subtask3"]

for task in tasks:
    # Use the BASE / AUG_BASE roots from the config cell instead of repeating
    # the folder names, so the dev copy always lands next to the train CSVs
    # even if those roots are changed.
    src = BASE / task / "dev" / f"{LANG}.csv"
    dst = AUG_BASE / task / "dev" / f"{LANG}.csv"

    # Create target directory if needed
    dst.parent.mkdir(parents=True, exist_ok=True)

    # Copy file (including metadata); a missing source file raises here.
    shutil.copy2(src, dst)
    print(f"Copied {src} -> {dst}")


Copied ../dev_phase/subtask1/dev/eng.csv -> ../dev_phase_aug/subtask1/dev/eng.csv
Copied ../dev_phase/subtask2/dev/eng.csv -> ../dev_phase_aug/subtask2/dev/eng.csv
Copied ../dev_phase/subtask3/dev/eng.csv -> ../dev_phase_aug/subtask3/dev/eng.csv
