# 语音任务微调
- 下载数据集 Common Voice
- 调整数据集
- 下载模型
- 使用 peft 技术对模型 微调 （lora）
- 训练
-

In [5]:
from datasets import load_dataset,DatasetDict,Audio

from 微调.高效微调 import tokenizer_dataset

dataset_name = "mozilla-foundation/common_voice_11_0"
language_abbr = "zh-CN"
dataset_dict = DatasetDict()

dataset_dict["train"]  = load_dataset(dataset_name,language_abbr,split="train",trust_remote_code=True)
dataset_dict["validation"] = load_dataset(dataset_name,language_abbr,split="validation",trust_remote_code=True)


print(dataset_dict["train"][10])

## 移除多余的列
dataset_dict = dataset_dict.remove_columns([
    "accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"
])

## 降采样

dataset_dict = dataset_dict.cast_column("audio", Audio(sampling_rate=16_000))


print(dataset_dict["train"][10])




{'client_id': '95368aab163e0387e4fd4991b4f2d8ccfbd4364bf656c860230501fd27dcedf087773e4695a6cf5de9c4f1d406d582283190d065cdfa36b0e2b060cffaca977e', 'path': '/home/gengzi/.cache/huggingface/datasets/downloads/extracted/4ac968b0af73b59a30074b20435d7c35bd883441b69599a338fd8f04b5d01129/zh-CN_train_0/common_voice_zh-CN_33211831.mp3', 'audio': {'path': '/home/gengzi/.cache/huggingface/datasets/downloads/extracted/4ac968b0af73b59a30074b20435d7c35bd883441b69599a338fd8f04b5d01129/zh-CN_train_0/common_voice_zh-CN_33211831.mp3', 'array': array([2.84217094e-14, 2.98427949e-13, 3.69482223e-13, ...,
       1.02614513e-05, 9.40982409e-06, 4.53803295e-06]), 'sampling_rate': 48000}, 'sentence': '参与本电影制作的工作人员们担任了牵引之后日本动画界的角色。', 'up_votes': 2, 'down_votes': 0, 'age': '', 'gender': '', 'accent': '', 'locale': 'zh-CN', 'segment': ''}
{'audio': {'path': '/home/gengzi/.cache/huggingface/datasets/downloads/extracted/4ac968b0af73b59a30074b20435d7c35bd883441b69599a338fd8f04b5d01129/zh-CN_train_0/common_voice_zh-C

In [3]:
from transformers import AutoFeatureExtractor, AutoTokenizer, AutoProcessor, AutoModel

model_name_or_path = "openai/whisper-large-v2"
# 从预训练模型加载特征提取器
featureExtractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,language=language_abbr)

# 从预训练模型加载处理器，处理器通常结合了特征提取器和分词器，为特定任务提供一站式的数据预处理
processor = AutoProcessor.from_pretrained(model_name_or_path,language=language_abbr)




In [None]:
## 数据预处理


- 数据预处理

In [14]:

def prepare_dataset(examples):
    audio = examples["audio"]
    examples["input_features"] = featureExtractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    examples["labels"] = tokenizer(examples["sentence"]).input_ids
    return examples



- 数据抽样

In [12]:

small_dataset = DatasetDict()
small_dataset["train"] =   dataset_dict["train"].shuffle(seed=11).select(range(500))
small_dataset["validation"] = dataset_dict["validation"].shuffle(seed=11).select(range(200))

print(small_dataset)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 200
    })
})


In [15]:
tokenizer_dataset = small_dataset.map(prepare_dataset)

print(tokenizer_dataset)

Map: 100%|██████████| 500/500 [00:22<00:00, 22.57 examples/s] 
Map: 100%|██████████| 200/200 [00:09<00:00, 22.04 examples/s] 

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 200
    })
})





- 将数据集中的内容进行，填充和截断


In [16]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [17]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)


In [18]:
from transformers import AutoModelForSpeechSeq2Seq
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path,load_in_4bit =True,device_map="auto")






The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [27]:
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)

In [19]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []


- 准备高效微调的参数

In [20]:
from peft import LoraConfig,get_peft_model


config = LoraConfig(
     r=4,  # LoRA的秩，影响LoRA矩阵的大小
    lora_alpha=64,  # LoRA适应的比例因子
    # 指定将LoRA应用到的模型模块，通常是attention和全连接层的投影。
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,  # 在LoRA模块中使用的dropout率
    bias="none",  # 设置bias的使用方式，这里没有使用bias
)


In [21]:
peft_model =get_peft_model(model,config)

# 打印 LoRA 微调训练的模型参数
peft_model.print_trainable_parameters()

trainable params: 1,966,080 || all params: 1,545,271,040 || trainable%: 0.1272


- 添加训练参数

In [40]:
from transformers import Seq2SeqTrainingArguments

model_dir = "/mnt/e/models/peft/whisper"

trainingArguments = Seq2SeqTrainingArguments(
    output_dir=model_dir,  # 指定模型输出和保存的目录
    per_device_train_batch_size=20,  # 每个设备上的训练批量大小
    learning_rate=1e-3,  # 学习率
    num_train_epochs=10,  # 训练的总轮数
    evaluation_strategy="epoch",  # 设置评估策略，这里是在每个epoch结束时进行评估
    # warmup_steps=50,  # 在训练初期增加学习率的步数，有助于稳定训练
    # fp16=True,  # 启用混合精度训练，可以提高训练速度，同时减少内存使用
    per_device_eval_batch_size=10,  # 每个设备上的评估批量大小
    generation_max_length=128,  # 生成任务的最大长度
    logging_steps=10,  # 指定日志记录的步骤，用于跟踪训练进度
    remove_unused_columns=False,  # 是否删除不使用的列，以减少数据处理开销
    label_names=["labels"],  # 指定标签列的名称，用于训练过程中
    # evaluation_strategy="steps",
    # eval_steps=25,
)






In [41]:
from transformers import Seq2SeqTrainer


train= Seq2SeqTrainer(
    model=peft_model,
    args=trainingArguments,
    train_dataset=tokenizer_dataset["train"],
    eval_dataset=tokenizer_dataset["validation"],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,

)
peft_model.config.use_cache = False

  train= Seq2SeqTrainer(


In [42]:
train.train()

  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [30]:
train.save_model(model_dir)

## 测试

In [45]:
from transformers import AutoModelForSpeechSeq2Seq,AutoTokenizer,AutoProcessor
from peft import LoraConfig,get_peft_model,PeftConfig,PeftModel
model_dir1 = "/mnt/e/models/peft/whisper"
peft_config = PeftConfig.from_pretrained(model_dir1)

basemodel = AutoModelForSpeechSeq2Seq.from_pretrained(peft_config.base_model_name_or_path,load_in_4bit =True,device_map="auto")

peft_model = PeftModel.from_pretrained(basemodel,model_dir1)

tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language_abbr)
processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language_abbr)
feature_extractor = processor.feature_extractor

text_audio = "/mnt/e/ruanjian/audio.wav"

from transformers import AutomaticSpeechRecognitionPipeline

pipeline = AutomaticSpeechRecognitionPipeline(model=peft_model, tokenizer=tokenizer, feature_extractor=feature_extractor)

forced_decoder_ids = processor.get_decoder_prompt_ids(language="chinese")




The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Device set to use cuda:0


In [49]:
import torch
text_audio1 = "/mnt/e/ruanjian/audio.wav"
with torch.cuda.amp.autocast():
    text = pipeline(text_audio1, max_new_tokens=255)["text"]

  with torch.cuda.amp.autocast():
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [50]:
print(text)

你好大計畫!那是否答覆阿道夫?答覆大師!
