In [1]:
# Mount Google Drive at /content/drive; the CSV paths used by the evaluation
# cells in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Quietly install/upgrade transformers (>= 4.44), datasets, accelerate, peft and bitsandbytes.
# NOTE(review): only transformers carries a version bound; the other packages are
# unpinned, so a rerun may resolve to different versions.
!pip -q install -U "transformers>=4.44" datasets accelerate peft bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m126.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Uninstall bitsandbytes, then install/upgrade bitsandbytes, accelerate and transformers.
# NOTE(review): presumably a workaround for a problematic bitsandbytes install -- confirm
# it is still needed, since the previous install cell already upgrades these packages.
!pip uninstall -y bitsandbytes
!pip install -U --quiet bitsandbytes accelerate transformers

Found existing installation: bitsandbytes 0.48.2
Uninstalling bitsandbytes-0.48.2:
  Successfully uninstalled bitsandbytes-0.48.2


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID_BASE = "Qwen/Qwen2.5-1.5B-Instruct"

# Quantization settings handed to the model loader: 4-bit NF4 weights with
# double quantization, computing in bfloat16.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer setup is kept together: load it, then fall back to the EOS token
# for padding when the tokenizer does not define a pad token.
base_tok = AutoTokenizer.from_pretrained(MODEL_ID_BASE)
if base_tok.pad_token_id is None:
    base_tok.pad_token = base_tok.eos_token

# Load the quantized causal LM; device placement is delegated to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID_BASE,
    device_map="auto",
    quantization_config=bnb,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [5]:
import torch

def generate_style_transfer_base(
    src_paragraph,
    source_style,
    target_style,
    temperature=0.7,
    top_p=0.9,
    max_new_tokens=220,
    include_prompt=False,
):
    """Rewrite a paragraph from one style into another with the base model.

    Builds a system + user chat prompt, samples a continuation from the
    module-level ``base_model`` using the module-level ``base_tok``, and
    returns the decoded text.

    Args:
        src_paragraph: Paragraph to rewrite (str); surrounding whitespace is stripped.
        source_style: Name of the style the paragraph is currently written in.
        target_style: Name of the style to rewrite the paragraph into.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.
        max_new_tokens: Upper bound on the number of generated tokens.
        include_prompt: When True, decode the full sequence (chat prompt plus
            continuation). Defaults to False so that only the model's rewrite
            is returned.

    Returns:
        The decoded text with special tokens removed.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a careful literary rewriting assistant that follows instructions exactly."
        },
        {
            "role": "user",
            "content": (
                f"Rewrite the paragraph from **{source_style}** into **{target_style}** style.\n"
                f"Rules: preserve character names and all events; "
                f"change only tone, narration, and imagery.\n\n"
                f"Paragraph:\n{src_paragraph.strip()}"
            ),
        },
    ]

    prompt = base_tok.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = base_tok(prompt, return_tensors="pt").to(base_model.device)
    # Remember how many tokens belong to the prompt so they can be cut off below.
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output = base_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            eos_token_id=base_tok.eos_token_id,
            pad_token_id=base_tok.pad_token_id,
        )

    # For a causal LM the returned sequence starts with the prompt tokens.
    # Decoding all of it would put the system/user instructions into the
    # "rewritten" text, so keep only the newly generated tail by default.
    sequence = output[0]
    if not include_prompt:
        sequence = sequence[prompt_len:]

    return base_tok.decode(sequence, skip_special_tokens=True)


In [None]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

INPUT_PATH = "/content/drive/MyDrive/external_genre_validation_400_with_targets.csv"
OUTPUT_PATH = "/content/drive/MyDrive/external_genre_validation_400_with_targets_qwen_base.csv"
SAVE_EVERY = 10

# 1. Load data: resume from a previous run's output file when it exists,
#    otherwise start from the original evaluation file.
if os.path.exists(OUTPUT_PATH):
    print(f"Loading existing file to resume: {OUTPUT_PATH}")
    df = pd.read_csv(OUTPUT_PATH)
else:
    print(f"Loading original eval file: {INPUT_PATH}")
    df = pd.read_csv(INPUT_PATH)

# Guarantee the output column on both paths (fresh start and resume).
if "output_qwen_base" not in df.columns:
    df["output_qwen_base"] = ""

# A column that is entirely empty in the CSV is read back as float64 NaN, and
# recent pandas versions warn about (and will eventually reject) writing
# strings into such a column. Keep it as object dtype so generated text can be
# stored per cell.
df["output_qwen_base"] = df["output_qwen_base"].astype(object)

print("Total rows:", len(df))

def is_done(val):
    """Return True when ``val`` holds a finished generation.

    A cell counts as not done when it is None, a float NaN (how an empty CSV
    cell is read back), blank after stripping, or an ``[ERROR: ...]`` marker
    written by the generation loop. Treating error markers as not done lets a
    resumed run retry rows that failed previously instead of skipping them.
    """
    if val is None:
        return False
    if isinstance(val, float) and np.isnan(val):
        return False
    text = str(val).strip()
    if text.startswith("[ERROR:"):
        return False
    return text != ""

num_generated = 0

for idx, record in tqdm(df.iterrows(), total=len(df)):
    # Rows that already carry an output are left untouched.
    if is_done(record.get("output_qwen_base", "")):
        continue

    # Pull the inputs out before the try block so a missing column still
    # raises instead of being recorded as a generation error.
    request = {
        "src_paragraph": str(record["raw_content"]),
        "source_style": str(record["source_style"]),
        "target_style": str(record["target_style"]),
    }

    try:
        generated_text = generate_style_transfer_base(
            **request,
            temperature=0.7,
            top_p=0.9,
            max_new_tokens=220,
        )
    except Exception as err:
        # Record the failure in the output column and move on to the next row.
        print(f"Error at index {idx}: {err}")
        df.at[idx, "output_qwen_base"] = f"[ERROR: {err}]"
    else:
        df.at[idx, "output_qwen_base"] = generated_text
        num_generated += 1

        # Checkpoint to disk after every SAVE_EVERY successful generations.
        if num_generated % SAVE_EVERY == 0:
            df.to_csv(OUTPUT_PATH, index=False)
            print(f"Saved progress after {num_generated} new base generations.")

# Final save
df.to_csv(OUTPUT_PATH, index=False)
print(f"Done! Total new base generations this run: {num_generated}")
print(f"Saved to: {OUTPUT_PATH}")


Loading original eval file: /content/drive/MyDrive/external_genre_validation_400_with_targets.csv
Total rows: 400


  0%|          | 0/400 [00:00<?, ?it/s]

Saved progress after 10 new base generations.
Saved progress after 20 new base generations.
Saved progress after 30 new base generations.
Saved progress after 40 new base generations.
Saved progress after 50 new base generations.
Saved progress after 60 new base generations.
Saved progress after 70 new base generations.
Saved progress after 80 new base generations.
Saved progress after 90 new base generations.
Saved progress after 100 new base generations.
Saved progress after 110 new base generations.
Saved progress after 120 new base generations.
Saved progress after 130 new base generations.
Saved progress after 140 new base generations.
Saved progress after 150 new base generations.
Saved progress after 160 new base generations.
Saved progress after 170 new base generations.
Saved progress after 180 new base generations.
Saved progress after 190 new base generations.
Saved progress after 200 new base generations.
Saved progress after 210 new base generations.
Saved progress after 2

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

INPUT_PATH = "/content/drive/MyDrive/test_df_transfer_clean.csv"
OUTPUT_PATH = "/content/drive/MyDrive/test_eval_qwen_base.csv"
SAVE_EVERY = 10

# 1. Load data: resume from a previous run's output file when it exists,
#    otherwise start from the original evaluation file.
if os.path.exists(OUTPUT_PATH):
    print(f"Loading existing file to resume: {OUTPUT_PATH}")
    df = pd.read_csv(OUTPUT_PATH)
else:
    print(f"Loading original eval file: {INPUT_PATH}")
    df = pd.read_csv(INPUT_PATH)

# Guarantee the output column on both paths (fresh start and resume).
if "output_qwen_base" not in df.columns:
    df["output_qwen_base"] = ""

# A column that is entirely empty in the CSV is read back as float64 NaN, and
# recent pandas versions warn about (and will eventually reject) writing
# strings into such a column. Keep it as object dtype so generated text can be
# stored per cell.
df["output_qwen_base"] = df["output_qwen_base"].astype(object)

print("Total rows:", len(df))

def is_done(val):
    """Return True when ``val`` holds a finished generation.

    A cell counts as not done when it is None, a float NaN (how an empty CSV
    cell is read back), blank after stripping, or an ``[ERROR: ...]`` marker
    written by the generation loop. Treating error markers as not done lets a
    resumed run retry rows that failed previously instead of skipping them.
    """
    if val is None:
        return False
    if isinstance(val, float) and np.isnan(val):
        return False
    text = str(val).strip()
    if text.startswith("[ERROR:"):
        return False
    return text != ""

num_generated = 0

for idx, record in tqdm(df.iterrows(), total=len(df)):
    # Rows that already carry an output are left untouched.
    if is_done(record.get("output_qwen_base", "")):
        continue

    # Pull the inputs out before the try block so a missing column still
    # raises instead of being recorded as a generation error.
    request = {
        "src_paragraph": str(record["raw_content"]),
        "source_style": str(record["source_style"]),
        "target_style": str(record["target_style"]),
    }

    try:
        generated_text = generate_style_transfer_base(
            **request,
            temperature=0.7,
            top_p=0.9,
            max_new_tokens=220,
        )
    except Exception as err:
        # Record the failure in the output column and move on to the next row.
        print(f"Error at index {idx}: {err}")
        df.at[idx, "output_qwen_base"] = f"[ERROR: {err}]"
    else:
        df.at[idx, "output_qwen_base"] = generated_text
        num_generated += 1

        # Checkpoint to disk after every SAVE_EVERY successful generations.
        if num_generated % SAVE_EVERY == 0:
            df.to_csv(OUTPUT_PATH, index=False)
            print(f"Saved progress after {num_generated} new base generations.")

# Final save
df.to_csv(OUTPUT_PATH, index=False)
print(f"Done! Total new base generations this run: {num_generated}")
print(f"Saved to: {OUTPUT_PATH}")


Loading original eval file: /content/drive/MyDrive/test_df_transfer_clean.csv
Total rows: 192


  0%|          | 0/192 [00:00<?, ?it/s]

Saved progress after 10 new base generations.
Saved progress after 20 new base generations.
Saved progress after 30 new base generations.
Saved progress after 40 new base generations.
Saved progress after 50 new base generations.
Saved progress after 60 new base generations.
Saved progress after 70 new base generations.
Saved progress after 80 new base generations.
Saved progress after 90 new base generations.
Saved progress after 100 new base generations.
Saved progress after 110 new base generations.
Saved progress after 120 new base generations.
Saved progress after 130 new base generations.
Saved progress after 140 new base generations.
Saved progress after 150 new base generations.
Saved progress after 160 new base generations.
Saved progress after 170 new base generations.
Saved progress after 180 new base generations.
Saved progress after 190 new base generations.
Done! Total new base generations this run: 192
Saved to: /content/drive/MyDrive/test_eval_qwen_base.csv
