# PEFT with LoRA using OpenAI Whisper-large-v2

```
第二周作业一:
1、使用完整的 YelpReviewFull 数据集训练，对比看 Acc 最高能到多少。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/transformers/fine-tune-quickstart.ipynb ）
2、加载本地保存的模型，进行评估和再训练更高的 F1 Score。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/transformers/fine-tune-QA.ipynb ）

第二周作业二: 
1、在“LoRA 低秩适配 OpenAI Whisper-Large-V2 语音识别任务”中，为中文语料的训练过程增加过程评估，观察 Train Loss 和 Validation Loss 变化。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/peft/peft_lora_whisper-large-v2.ipynb ）------> this notebook is for this task. 
2、在“LoRA 低秩适配 OpenAI Whisper-Large-V2 语音识别任务”中，当 LoRA 模型训练完成后，使用测试集进行完整的模型评估。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/peft/peft_lora_whisper-large-v2.ipynb ） ------> this notebook is 


```

In [1]:
# =============================================
# 1. IMPORTS
# =============================================

import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

from datasets import load_dataset, DatasetDict, Audio
from transformers import (
    AutoFeatureExtractor,
    AutoTokenizer,
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training,
)

# =============================================
# 2. GLOBAL CONFIGS / CONSTANTS
# =============================================

MODEL_NAME_OR_PATH = "openai/whisper-large-v2"
MODEL_DIR_BASE = "models/whisper-large-v2-asr-int8"

LANGUAGE = "Chinese (China)"
LANGUAGE_ABBR = "zh-CN"
TASK = "transcribe"

DATASET_NAME = "mozilla-foundation/common_voice_11_0"

BATCH_SIZE = 64
NUM_EPOCHS = 1
LEARNING_RATE = 1e-3
LOGGING_STEPS = 1

# for faster process .map (e.g.: 8 cpus then use 2, too high might freez the system)
NUM_PROC = 2

# Small dataset sizes for quick iteration
SMALL_TRAIN_SIZE = 640
SMALL_VAL_SIZE = 320

# =============================================
# 3. LOAD DATASET
# =============================================

common_voice = DatasetDict({
    "train": load_dataset(DATASET_NAME, LANGUAGE_ABBR, split="train", trust_remote_code=True),
    "validation": load_dataset(DATASET_NAME, LANGUAGE_ABBR, split="validation", trust_remote_code=True),
})

# =============================================
# 4. PREPROCESS: REMOVE UNUSED COLUMNS, RESAMPLE AUDIO
# =============================================

columns_to_remove = [
    "accent", "age", "client_id", "down_votes", "gender",
    "locale", "path", "segment", "up_votes"
]

def preprocess_dataset(ds):
    ds = ds.remove_columns(columns_to_remove)
    ds = ds.cast_column("audio", Audio(sampling_rate=16000))
    return ds

common_voice = DatasetDict({
    "train": preprocess_dataset(common_voice["train"]),
    "validation": preprocess_dataset(common_voice["validation"]),
})

# =============================================
# 5. LOAD MODEL & PROCESSOR COMPONENTS
# =============================================

feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME_OR_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH, language=LANGUAGE, task=TASK)
processor = AutoProcessor.from_pretrained(MODEL_NAME_OR_PATH, language=LANGUAGE, task=TASK)

# =============================================
# 6. PREPARE DATASET FUNCTION
# =============================================

def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

# =============================================
# 7. DEFINE DATA COLLATOR
# =============================================

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features):
        input_features = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# =============================================
# 8. LOAD BASE MODEL (8-bit) & APPLY PEFT (LoRA)
# =============================================

def load_peft_model():
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        MODEL_NAME_OR_PATH,
        load_in_8bit=True,
        device_map="auto"
    )
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    # prepare model for int8 
    model = prepare_model_for_int8_training(model)
    
    lora_config = LoraConfig(
        r=4,
        lora_alpha=64,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
    )

    peft_model = get_peft_model(model, lora_config)
    peft_model.print_trainable_parameters()
    peft_model.config.use_cache = False
    return peft_model

# =============================================
# 9. TRAINING FUNCTION (Reusable)
# =============================================

def run_training_run(name: str, train_ds, eval_ds, output_dir_suffix: str = ""):
    print(f"🔁 Starting training run: {name}")
    model_dir = f"{MODEL_DIR_BASE}"
    
    peft_model = load_peft_model()

    training_args = Seq2SeqTrainingArguments(
        output_dir=model_dir,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        num_train_epochs=NUM_EPOCHS,
        evaluation_strategy="epoch",
        logging_steps=LOGGING_STEPS,
        remove_unused_columns=False,
        label_names=["labels"],
        generation_max_length=128
    )

    trainer = Seq2SeqTrainer(
        args=training_args,
        model=peft_model,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=data_collator,
        tokenizer=processor.feature_extractor,
    )
    
    print(f"Training run '{name}' started...")
    trainer.train()
    trainer.save_model(model_dir)
    print(f"✅ Training run '{name}' completed and model saved to {model_dir}")
    peft_model.eval()
    print(f"started evaluating...")
    # Return final metrics (e.g., eval loss)
    metrics = trainer.evaluate()
    print(f"{name} - metrics: {metrics}")
    print(f"{name} - Final Eval Loss: {metrics.get('eval_loss', 'N/A'):.4f}")
    return metrics

# =============================================
# 10. DATASET VARIANTS (Optimized)
# =============================================

print(f"🔹 Tokenizing SMALL dataset: {SMALL_TRAIN_SIZE} train, {SMALL_VAL_SIZE} val")

# 1. Shuffle and select SMALL subsets from the ORIGINAL (raw) dataset first
small_train_raw = common_voice["train"].shuffle(seed=16).select(range(SMALL_TRAIN_SIZE))
small_val_raw = common_voice["validation"].shuffle(seed=16).select(range(SMALL_VAL_SIZE))

# 2. Tokenize ONLY the small subset (much faster!)
print(f"🚀 Tokenizing SMALL dataset with NUM_PROC={NUM_PROC}")
small_tokenized = DatasetDict({
    "train": small_train_raw.map(prepare_dataset, num_proc=NUM_PROC),
    "validation": small_val_raw.map(prepare_dataset, num_proc=NUM_PROC),
})

# Extract small train/val after tokenization
small_train = small_tokenized["train"]
small_val = small_tokenized["validation"]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


🔹 Tokenizing SMALL dataset: 640 train, 320 val
🚀 Tokenizing SMALL dataset with NUM_PROC=2


Map (num_proc=2):   0%|          | 0/640 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/320 [00:00<?, ? examples/s]

In [2]:
# =============================================
# 11. RUN the training
# =============================================
# Train on SMALL dataset
small_metrics = run_training_run(
    name="small",
    train_ds=small_train,
    eval_ds=small_val,
)
print(f"🔹 Small Dataset - Eval Loss: {small_metrics.get('eval_loss', 'N/A'):.4f}")


🔁 Starting training run: small




trainable params: 1,966,080 || all params: 1,545,271,040 || trainable%: 0.12723204856023188
Training run 'small' started...




Epoch,Training Loss,Validation Loss
1,0.9359,1.081708


✅ Training run 'small' completed and model saved to models/whisper-large-v2-asr-int8
started evaluating...


small - metrics: {'eval_loss': 1.0817080736160278, 'eval_runtime': 185.2453, 'eval_samples_per_second': 1.727, 'eval_steps_per_second': 0.027, 'epoch': 1.0}
small - Final Eval Loss: 1.0817
🔹 Small Dataset - Eval Loss: 1.0817


In [1]:
print(f"NOTE: you'll need to restart the kernel and run this cell or will got error: ValueError")
# =============================================
# 0. IMPORTS (All imports at the top — PEP 8 compliant)
# =============================================

from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoTokenizer,
    AutoProcessor,
    AutomaticSpeechRecognitionPipeline,
)
from peft import PeftConfig, PeftModel
import torch

# =============================================
# 1. CONFIGURATION / CONSTANTS
# =============================================

# --- Model & Language Settings ---
MODEL_DIR = "models/whisper-large-v2-asr-int8"  # path to the fine-tuned PEFT model
BASE_MODEL_NAME_OR_PATH = "openai/whisper-large-v2"  # original base Whisper model
LANGUAGE = "Chinese (China)"
LANGUAGE_ABBR = "zh-CN"         # language code used during training
LANGUAGE_DECODE = "chinese"     # used for forced decoder IDs
TASK = "transcribe"             # task type

# --- Test Audio File ---
TEST_AUDIO_PATH = "data/audio/test_zh.flac"

# =============================================
# 2. LOAD BASE MODEL (ORIGINAL WHISPER, 8-BIT)
# =============================================

# Load the base Whisper model (8-bit quantized)
base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    BASE_MODEL_NAME_OR_PATH,
    load_in_8bit=True,
    device_map="auto",
)

# Load tokenizer, processor, and feature extractor using the base model's original config
tokenizer_base = AutoTokenizer.from_pretrained(
    BASE_MODEL_NAME_OR_PATH,
    language=LANGUAGE_ABBR,
    task=TASK,
)
processor_base = AutoProcessor.from_pretrained(
    BASE_MODEL_NAME_OR_PATH,
    language=LANGUAGE_ABBR,
    task=TASK,
)
feature_extractor_base = processor_base.feature_extractor

# =============================================
# 3. LOAD PEFT MODEL (FINETUNED ON TOP OF BASE)
# =============================================

# Load the PEFT config to get the base model path (should match BASE_MODEL_NAME_OR_PATH)
peft_config = PeftConfig.from_pretrained(MODEL_DIR)

peft_model = PeftModel.from_pretrained(
    AutoModelForSpeechSeq2Seq.from_pretrained(
        peft_config.base_model_name_or_path,
        load_in_8bit=True,
        device_map="auto",
    ),
    MODEL_DIR,
)

# Load tokenizer, processor, and feature extractor (same as training)
tokenizer_peft = AutoTokenizer.from_pretrained(
    peft_config.base_model_name_or_path,
    language=LANGUAGE_ABBR,
    task=TASK,
)
processor_peft = AutoProcessor.from_pretrained(
    peft_config.base_model_name_or_path,
    language=LANGUAGE_ABBR,
    task=TASK,
)
feature_extractor_peft = processor_peft.feature_extractor

# =============================================
# 4. CREATE TWO ASR PIPELINES (Base vs PEFT)
# =============================================

# --- Base Model Pipeline ---
pipeline_base = AutomaticSpeechRecognitionPipeline(
    model=base_model,
    tokenizer=tokenizer_base,
    feature_extractor=feature_extractor_base,
)

# --- PEFT Model Pipeline ---
pipeline_peft = AutomaticSpeechRecognitionPipeline(
    model=peft_model,
    tokenizer=tokenizer_peft,
    feature_extractor=feature_extractor_peft,
)

# =============================================
# 5. (OPTIONAL) FORCED DECODER IDS (for Chinese)
# =============================================

forced_decoder_ids_base = processor_base.get_decoder_prompt_ids(language=LANGUAGE_DECODE, task=TASK)
forced_decoder_ids_peft = processor_peft.get_decoder_prompt_ids(language=LANGUAGE_DECODE, task=TASK)

# Note: Most of the time, the pipeline will use the correct tokenizer/language settings
# automatically. These are here for advanced manual control if needed.

# =============================================
# 6. RUN INFERENCE ON BOTH MODELS (Same Audio)
# =============================================

# --- Base Model Inference ---
with torch.cuda.amp.autocast():
    result_base = pipeline_base(TEST_AUDIO_PATH, max_new_tokens=255)
    transcription_base = result_base["text"]

# --- PEFT Model Inference ---
with torch.cuda.amp.autocast():
    result_peft = pipeline_peft(TEST_AUDIO_PATH, max_new_tokens=255)
    transcription_peft = result_peft["text"]

# =============================================
# 7. OUTPUT COMPARISON
# =============================================

print("=" * 50)
print("🔍 TRANSCRIPTION COMPARISON")
print("=" * 50)

print("🟢 [Base Model] Whisper (Original, 8-bit):")
print(transcription_base)

print("🔵 [PEFT Model] Fine-Tuned Model:")
print(transcription_peft)

print("=" * 50)

NOTE: you'll need to restart the kernel and run this cell or will got error: ValueError


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


🔍 TRANSCRIPTION COMPARISON
🟢 [Base Model] Whisper (Original, 8-bit):
 This is a test for the automatic voice recognition of the WhisperLine Large V2 model.
🔵 [PEFT Model] Fine-Tuned Model:
这是一段测试用于WhisperLarge V2模型的自动语音识别测试。
