In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch
import torch.nn.functional as F
from datasets import load_dataset
from huggingface_hub import login
login("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")

In [2]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # Adjust if using small variant
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
tokenizer.pad_token = tokenizer.eos_token  # ✅ critical fix

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(base_model, lora_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
import pandas as pd
from datasets import Dataset

df = pd.read_csv("paradetox.tsv", sep="\t").dropna(subset=["toxic", "neutral1"])
df = df.rename(columns={"toxic": "prompt", "neutral1": "response"})
dataset = Dataset.from_pandas(df)

In [4]:
small_ds = dataset.select(range(50))
# 2. Tokenization function
def tokenize(example):
    prompt = f"Detoxify: {example['prompt']}\nResponse:"
    target = example["response"]
    return tokenizer(prompt, text_target=target, truncation=True, padding="max_length", max_length=256)


# 3. Apply it
tokenized_ds = small_ds.map(tokenize)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [14]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./mistral-detox-lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    learning_rate=2e-4,
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none"
)

In [15]:
from transformers import Trainer, DataCollatorForSeq2Seq

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, padding=True)
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
trainer.train()

Step,Training Loss
10,0.0


TrainOutput(global_step=10, training_loss=0.0, metrics={'train_runtime': 35.146, 'train_samples_per_second': 14.226, 'train_steps_per_second': 0.285, 'total_flos': 4283465692348416.0, 'train_loss': 0.0, 'epoch': 9.615384615384615})

In [17]:
def detoxify(model, tokenizer, toxic_sentence, max_new_tokens=50):
    prompt = f"Detoxify: {toxic_sentence}\nResponse:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    full_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return full_output.split("Response:")[-1].strip()

In [18]:
toxic = "You are such a dumb loser."
detoxified = detoxify(model, tokenizer, toxic)
print("→", detoxified)

→ 
