0 导入包

In [5]:
# 加载数据集与处理数据集
from datasets import load_dataset, DatasetDict
from datasets import Audio
import librosa

# 加载模型
from transformers import WhisperProcessor
from transformers import WhisperFeatureExtractor




In [6]:
# Load the en-AU training split of MINDS-14, keep only the columns needed for
# speech recognition, and rename the transcript column to "sentence".
#
# trust_remote_code=True is passed explicitly: `datasets` warns for this
# repository that the flag will be mandatory from its next major release.
# Only enable it for dataset repositories you trust.
minds = load_dataset(
    "PolyAI/minds14",
    name="en-AU",
    split="train",
    trust_remote_code=True,
)
minds = minds.select_columns(["path", "audio", "english_transcription"])
minds = minds.rename_columns({"english_transcription": "sentence"})
minds

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['path', 'audio', 'sentence'],
    num_rows: 654
})

1 获取模型默认采样率

In [7]:
# A WhisperProcessor bundles two components:
#
#   * processor.feature_extractor -- turns a raw waveform into model input:
#         processor.feature_extractor(
#             x["audio"]["array"], sampling_rate=x["audio"]["sampling_rate"]
#         ).input_features[0]
#     It produces the `input_features` column (for Whisper these are log-Mel
#     spectrogram features; see the Transformers documentation).
#
#   * processor.tokenizer -- turns transcript text into token ids:
#         processor.tokenizer(x["sentence"]).input_ids
#     It produces the `input_ids` and `attention_mask` columns.
#
# The transcripts in this dataset are English (en-AU split, column
# `english_transcription`), so the tokenizer is configured with
# language="english" rather than "chinese".
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small", language="english", task="transcribe"
)
# Sampling rate the feature extractor expects for its input audio.
sampling_rate = processor.feature_extractor.sampling_rate
print(sampling_rate)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


16000


2 重置采样率

In [8]:
# Resample the audio column to the rate the Whisper feature extractor expects.
# The rate is taken from `sampling_rate` (read from the processor) instead of
# a hard-coded 16_000 so the two values cannot drift apart.
minds = minds.cast_column("audio", Audio(sampling_rate=sampling_rate))
minds[0]

{'path': 'C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\downloads\\extracted\\8b4d16f0fa6beceee204044b91fb9cfb987264e8f3b8f96a68f8f80c83c8ea3d\\en-AU~PAY_BILL\\response_4.wav',
 'audio': {'path': 'C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\downloads\\extracted\\8b4d16f0fa6beceee204044b91fb9cfb987264e8f3b8f96a68f8f80c83c8ea3d\\en-AU~PAY_BILL\\response_4.wav',
  'array': array([2.36120541e-05, 1.92325111e-04, 2.19284673e-04, ...,
         9.40908678e-04, 1.16613181e-03, 7.20883720e-04]),
  'sampling_rate': 16000},
 'sentence': 'I would like to pay my electricity bill using my card can you please assist'}

3 feature特征提取（feature_extractor）

In [9]:
def extract_input_features(example):
    """Compute Whisper input features for a single dataset row.

    Args:
        example: one row of ``minds``; its ``audio`` entry holds the decoded
            waveform (``array``) and its ``sampling_rate``.

    Returns:
        A dict with one key, ``input_features``, holding the features for
        this row only.
    """
    audio = example["audio"]
    extracted = processor.feature_extractor(
        raw_speech=audio["array"],
        sampling_rate=audio["sampling_rate"],
    )
    # The extractor returns a batch even for a single waveform; take element 0
    # so each row stores its own feature matrix without a leading batch axis.
    return {"input_features": extracted.input_features[0]}


# `map` calls the function once per row and adds the returned keys as columns.
minds = minds.map(extract_input_features, num_proc=1)
minds

Dataset({
    features: ['path', 'audio', 'sentence', 'input_features'],
    num_rows: 654
})

In [10]:
import numpy as np

# Compare mean and variance of the raw waveform with those of the extracted
# features for the first row.
sample = minds[0]
waveform = sample['audio']['array']
features = sample['input_features']
print(f"调整前Mean: {np.mean(waveform):.3}, 调整前Variance: {np.var(waveform):.3}")
print(f"调整后Mean: {np.mean(features):.3}, 调整后Variance: {np.var(features):.3}")

调整前Mean: 9.19e-06, 调整前Variance: 0.0133
调整后Mean: -0.491, 调整后Variance: 0.131


4 tokenizer文本标签化

In [11]:
def tokenize_sentence(example):
    """Tokenize the transcript of one row; `map` calls this once per row."""
    return processor.tokenizer(text=example["sentence"])


# Adds the tokenizer outputs (`input_ids`, `attention_mask`) as new columns.
minds = minds.map(tokenize_sentence, num_proc=1)
minds

Dataset({
    features: ['path', 'audio', 'sentence', 'input_features', 'input_ids', 'attention_mask'],
    num_rows: 654
})

5 processor同时把feature和tokenizer的事情做了

In [12]:
def prepare_example(example):
    """Run the combined processor (feature extractor + tokenizer) on one row.

    Args:
        example: one row of ``minds`` with decoded ``audio`` and a
            ``sentence`` transcript.

    Returns:
        A dict with ``input_features`` (features for this row only) and
        ``labels`` (token ids of the transcript).
    """
    audio = example["audio"]
    encoded = processor(
        audio=audio["array"],
        sampling_rate=audio["sampling_rate"],
        text=example["sentence"],
    )
    return {
        # The audio side is returned as a batch of one; drop the batch axis so
        # every row stores a single feature matrix.
        "input_features": encoded["input_features"][0],
        "labels": encoded["labels"],
    }


minds = minds.map(prepare_example, num_proc=1)
minds

Dataset({
    features: ['path', 'audio', 'sentence', 'input_features', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 654
})

6 feature的同时截断时长（最长30s）

In [14]:
def extract_truncated_features(batch):
    """Batched feature extraction that cuts every clip to at most 30 seconds.

    `batch["audio"]` is a list of decoded audio entries (one per row in the
    batch); only their waveforms are passed to the feature extractor.
    """
    extractor = processor.feature_extractor
    target_rate = extractor.sampling_rate
    waveforms = [clip["array"] for clip in batch["audio"]]
    return extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=int(target_rate * 30),  # 30 seconds worth of samples
        truncation=True,
        # NOTE(review): the returned `attention_mask` has the same column name
        # as the one written by the tokenizer step -- confirm the overwrite is
        # intended.
        return_attention_mask=True,
    )


minds = minds.map(
    extract_truncated_features,
    batched=True,
    batch_size=100,
    num_proc=1,  # number of processes
)
minds

Map:   0%|          | 0/654 [00:00<?, ? examples/s]

Dataset({
    features: ['path', 'audio', 'sentence', 'input_features', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 654
})

7 使用path列数据-留下少于20s的数据

In [15]:
# Duration of every clip, read from the audio file referenced by `path`.
Audio_length = []
for wav_path in minds["path"]:
    Audio_length.append(librosa.get_duration(path=wav_path))


def is_under_20_seconds(duration):
    """Return True for clips whose duration is below 20.0."""
    return duration < 20.0


# Attach the durations as a temporary column, filter on it, then drop it again.
minds = (
    minds
    .add_column("Audio_length", Audio_length)
    .filter(is_under_20_seconds, input_columns=["Audio_length"])
    .remove_columns(["Audio_length"])
)
minds

Filter:   0%|          | 0/654 [00:00<?, ? examples/s]

Dataset({
    features: ['path', 'audio', 'sentence', 'input_features', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 624
})

8 使用audio列数据-留下少于10s的数据

In [16]:
def add_duration(example):
    """Add a `time` value: number of samples divided by the sampling rate."""
    audio = example["audio"]
    return {"time": len(audio["array"]) / audio["sampling_rate"]}


def is_under_10_seconds(example):
    """Return True for rows whose `time` is below 10.0."""
    return example["time"] < 10.0


# Compute the duration of every clip from its decoded audio ...
minds = minds.map(add_duration, num_proc=1)

# ... then keep only the short clips (the `time` column stays in the dataset).
minds = minds.filter(is_under_10_seconds, num_proc=1)
minds

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Filter:   0%|          | 0/624 [00:00<?, ? examples/s]

Dataset({
    features: ['path', 'audio', 'sentence', 'input_features', 'input_ids', 'attention_mask', 'labels', 'time'],
    num_rows: 479
})

9 使用gradio随机听取声音样本

In [None]:
import gradio as gr

# Number of randomly chosen clips to show on the page.
NUM_PREVIEW_CLIPS = 4

# Only the audio column is needed for playback; selecting it first avoids
# loading the other (much larger) columns for every previewed row.
preview = minds.select_columns(["audio"]).shuffle()
# Never request more rows than the dataset contains.
preview = preview.select(range(min(NUM_PREVIEW_CLIPS, len(preview))))

with gr.Blocks() as demo:  # page
    with gr.Column():  # column layout
        for row in preview:  # each row is read once
            clip = row["audio"]
            # The player is given a (sampling_rate, samples) tuple.
            gr.Audio((clip["sampling_rate"], clip["array"]))

# With debug=True this cell keeps running until the kernel is interrupted.
demo.launch(debug=True)



Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.




10 查看有标签的数据集

In [None]:
# GTZAN: 1000 thirty-second music clips labelled with one of 10 genres.
# trust_remote_code=True is passed explicitly: `datasets` warns for this
# repository that the flag will be mandatory from its next major release.
# Only enable it for dataset repositories you trust.
gtzan = load_dataset("marsyas/gtzan", "all", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [17]:
# List every genre name; a label's integer id is its position in this list.
genre_feature = gtzan["train"].features["genre"]
genre_feature.names

['blues',
 'classical',
 'country',
 'disco',
 'hiphop',
 'jazz',
 'metal',
 'pop',
 'reggae',
 'rock']

In [18]:
# Look up the genre name stored at index 9 of the label list.
train_features = gtzan["train"].features
a = train_features["genre"].names
a[9]

'rock'