In [1]:
import s3prl
import torch
import numpy
from pydub import AudioSegment
import s3prl.hub as hub
import os
import torchaudio
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import random

In [2]:
class config:
    """Central place for the notebook's tunable experiment settings."""

    # Seed handed to seed_everything() so that runs are repeatable.
    seed: int = 2083
    # Presumably the number of training epochs -- TODO confirm against the training loop.
    epoch: int = 15
    # Presumably the optimizer learning rate -- TODO confirm against the training loop.
    learning_rate: float = 1e-4

In [3]:
def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), then configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every GPU, not only the current device, and is
    # safe to call on a machine without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different convolution
    # algorithms from run to run, which defeats the flag set above.
    torch.backends.cudnn.benchmark = False

# Fix all random number generators up front using the configured seed.
seed_everything(seed=config.seed)

## 检查训练数据声道

In [4]:
def view_channels(path : str):
    """Count mono and stereo audio files in ``path`` and report their sample rates.

    Every regular file directly inside ``path`` is loaded with
    ``torchaudio.load``; sub-directories are ignored. A one-line summary is
    printed before returning.

    Args:
        path: Directory to inspect.

    Returns:
        A tuple ``(total, mono, stereo)``: the number of files inspected, and
        how many of them have exactly one / exactly two channels. Files with
        any other channel count are included in ``total`` only.
        ``(-1, 0, 0)`` is returned when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return -1, 0, 0
    # Keep regular files only: a sub-directory (e.g. .ipynb_checkpoints)
    # cannot be decoded and would make torchaudio.load fail.
    files = [
        name for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    ]
    count_1, count_2 = 0, 0
    sample_rates = set()
    for file in files:
        wav, sample_rate = torchaudio.load(os.path.join(path, file))
        channels = wav.shape[0]
        if channels == 1:
            count_1 += 1
        elif channels == 2:
            count_2 += 1
        # A set ignores duplicates, so no membership test is needed.
        sample_rates.add(sample_rate)

    print(f"{path}中共计{len(files)}个文件，其中单声道文件数目为{count_1}, 双声道文件数目为{count_2}, 采样率为{sample_rates}")
    return len(files), count_1, count_2

## 双声道转单声道, 统一采样率

In [5]:
def stereo_to_mono(source_path: str, output_path: str, sample_rate: int = 16000):
    """Convert every MP3 file in ``source_path`` to a mono WAV in ``output_path``.

    Each regular file is decoded with ``AudioSegment.from_mp3``, set to one
    channel and to ``sample_rate``, then exported as ``<stem>.wav``.
    Sub-directories of ``source_path`` are skipped. ``output_path`` is created
    on demand when the first file is written.

    Args:
        source_path: Directory containing the source MP3 files.
        output_path: Directory that receives the converted WAV files.
        sample_rate: Target frame rate in Hz (defaults to 16000).
    """
    for file in os.listdir(source_path):
        file_path = os.path.join(source_path, file)
        if not os.path.isfile(file_path):
            continue

        sound = AudioSegment.from_mp3(file_path)
        sound = sound.set_channels(1).set_frame_rate(sample_rate)

        # splitext copes with extensions of any length, unlike slicing off
        # the last three characters.
        stem, _ = os.path.splitext(file)
        output = os.path.join(output_path, stem + ".wav")

        # Create the target directory explicitly instead of retrying after a
        # FileNotFoundError, which could hide an unrelated failure.
        os.makedirs(output_path, exist_ok=True)
        sound.export(output, format="wav")

In [6]:
# Convert the training set and the sample set, but only when the output
# directory does not already hold a complete set of mono files.
for source_path, output_path in (
    ("./MADReSS-23-train/train", "./Data/train"),    # training data
    ("./MADReSS-23-train/sample", "./Data/sample"),  # sample data
):
    total, mono, _stereo = view_channels(path=output_path)
    # total is -1 when the directory is missing and 0 when it exists but is
    # empty; in both cases nothing has been converted yet.
    if total <= 0 or mono != total:
        stereo_to_mono(source_path=source_path, output_path=output_path)

./Data/train中共计237个文件，其中单声道文件数目为237, 双声道文件数目为0, 采样率为{16000}
./Data/sample中共计8个文件，其中单声道文件数目为8, 双声道文件数目为0, 采样率为{16000}


## 检查对应的时长是否发生改变

In [5]:
def check_duration(path1 : str, path2 : str, max_seconds: float = 180, tolerance: float = 0.0):
    """Compare the durations of corresponding audio files in two directories.

    Files are paired by base name without extension (``a.mp3`` pairs with
    ``a.wav``) rather than by their position in the directory listing, so the
    listing order and unmatched files cannot cause wrong pairs. Each pair
    whose durations differ by more than ``tolerance`` seconds is printed.
    Finally the number of pairs in which either file is longer than
    ``max_seconds`` is printed.

    Args:
        path1: First directory.
        path2: Second directory.
        max_seconds: Duration threshold, in seconds, for the printed count.
        tolerance: Largest duration difference, in seconds, that is not
            reported. The default ``0.0`` reports every difference.
    """
    def _index_by_stem(directory):
        # Map "name without extension" -> full path, for regular files only.
        return {
            os.path.splitext(name)[0]: os.path.join(directory, name)
            for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
        }

    files1 = _index_by_stem(path1)
    files2 = _index_by_stem(path2)

    unmatched = sorted(set(files1) ^ set(files2))
    if unmatched:
        print(f"skipping {len(unmatched)} file(s) without a counterpart: {unmatched}")

    count = 0
    for stem in sorted(set(files1) & set(files2)):
        file1, file2 = files1[stem], files2[stem]

        wav1, sample_rate1 = torchaudio.load(file1)
        wav2, sample_rate2 = torchaudio.load(file2)

        # Duration in seconds = number of frames / frames per second.
        duration1 = wav1.shape[1] / sample_rate1
        duration2 = wav2.shape[1] / sample_rate2

        if duration1 > max_seconds or duration2 > max_seconds:
            count += 1
        if abs(duration1 - duration2) > tolerance:
            print(file1, file2, wav1.shape, wav2.shape, sample_rate1, sample_rate2, duration1, duration2)

    print(count)


In [6]:
# The durations may differ very slightly, probably because of the format conversion.
check_duration(
    path1="./MADReSS-23-train/train",
    path2="./Data/train",
)

./MADReSS-23-train/train\adrso002.mp3 ./Data/train\adrso002.wav torch.Size([2, 3440640]) torch.Size([1, 1248305]) 44100 16000 78.01904761904763 78.0190625
./MADReSS-23-train/train\adrso003.mp3 ./Data/train\adrso003.wav torch.Size([2, 1480704]) torch.Size([1, 537217]) 44100 16000 33.57605442176871 33.5760625
./MADReSS-23-train/train\adrso004.mp3 ./Data/train\adrso004.wav torch.Size([2, 983040]) torch.Size([1, 356659]) 44100 16000 22.291156462585032 22.2911875
./MADReSS-23-train/train\adrso005.mp3 ./Data/train\adrso005.wav torch.Size([2, 2721792]) torch.Size([1, 987498]) 44100 16000 61.718639455782316 61.718625
./MADReSS-23-train/train\adrso006.mp3 ./Data/train\adrso006.wav torch.Size([2, 1801216]) torch.Size([1, 653503]) 44100 16000 40.84390022675737 40.8439375
./MADReSS-23-train/train\adrso007.mp3 ./Data/train\adrso007.wav torch.Size([2, 1150976]) torch.Size([1, 417588]) 44100 16000 26.09922902494331 26.09925
./MADReSS-23-train/train\adrso008.mp3 ./Data/train\adrso008.wav torch.Size([2

In [21]:
# Name of the s3prl hub entry to instantiate ("hubert" was the alternative tried here).
upstream_name = "hubert_large_ll60k"
model_0 = getattr(hub, upstream_name)()
# Modules start in training mode; switch to eval so dropout is disabled and
# the extracted features are deterministic.
model_0.eval()
# Use the GPU when one is available instead of failing on CPU-only machines.
# As the last expression of the cell this also displays the architecture.
model_0.to("cuda" if torch.cuda.is_available() else "cpu")

UpstreamExpert(
  (model): HubertModel(
    (feature_extractor): ConvFeatureExtractionModel(
      (conv_layers): ModuleList(
        (0): Sequential(
          (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (1): Dropout(p=0.0, inplace=False)
          (2): Sequential(
            (0): TransposeLast()
            (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (2): TransposeLast()
          )
          (3): GELU(approximate=none)
        )
        (1): Sequential(
          (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (1): Dropout(p=0.0, inplace=False)
          (2): Sequential(
            (0): TransposeLast()
            (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (2): TransposeLast()
          )
          (3): GELU(approximate=none)
        )
        (2): Sequential(
          (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (1): Dropout(p=0.0,

In [36]:
# Number of leading samples to keep; 400 samples at 16 kHz is 25 ms of audio.
num_samples = 400
# `sample` holds the file's sample rate in Hz.
wav_full, sample = torchaudio.load("./Data/train/adrso056.wav")
# Follow the device the model lives on rather than assuming CUDA, and crop
# before transferring so only the short clip is moved.
model_device = next(model_0.parameters()).device
wav = wav_full[:, :num_samples].to(model_device)
wav.shape, sample

(torch.Size([1, 400]), 16000)

In [37]:
# Alternative batched input kept for reference (a list of random waveforms):
#   wavs = [torch.randn(160000, dtype=torch.float).to('cuda') for _ in range(16)]
# Run the upstream model without tracking gradients and keep its hidden states.
with torch.no_grad():
    upstream_result = model_0(wav)
    output = upstream_result["hidden_states"]

In [38]:
# Inspect the shape of the first element of the hidden-state collection.
first_hidden_state = output[0]
first_hidden_state.shape

torch.Size([1, 1, 1024])

In [39]:
# Back-of-the-envelope numbers: samples in 20 ms at 16 kHz, and the number of
# 0.02 s steps in a 78 s recording (presumably the model's frame stride -- TODO confirm).
sample_rate_hz, frame_ms = 16000, 20
clip_seconds, frame_seconds = 78, 0.02
sample_rate_hz * frame_ms / 1000, clip_seconds / frame_seconds

(320.0, 3900.0)