In [1]:
import s3prl
import torch
import numpy
from pydub import AudioSegment
import s3prl.hub as hub
import os
import torchaudio
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import random
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torchaudio.set_audio_backend("soundfile")

In [3]:
class config:
    seed = 2083
    epoch = 15
    learning_rate = 1e-4

In [4]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(config.seed)

## 检查训练数据声道

In [5]:
def view_channels(path : str):
    """ 检查path路径下音频文件的声道数"""
    if not os.path.exists(path):
        return -1, 0, 0
    files = os.listdir(path)
    files = [i for i in files if i.endswith('.mp3') or i.endswith('.wav')]
    print('file count', len(files))
    count_1, count_2 = 0, 0
    sample_rates = set()
    for file in files:
        file_path = os.path.join(path, file)
        wav, sample_rate = torchaudio.load(file_path)
        if wav.shape[0] == 1:
            count_1 += 1
        if wav.shape[0] == 2:
            count_2 += 1
        # if sample_rate == 8000:
            # print(file_path)
        if sample_rate not in sample_rates:
            sample_rates.add(sample_rate)

    print(f"{path}中共计{len(files)}个文件，其中单声道文件数目为{count_1}, 双声道文件数目为{count_2}, 采样率为{sample_rates}")
    return len(files), count_1, count_2

## 双声道转单声道, 统一采样率

In [6]:
filename = "MADReSS-23-train/train/adrso003.mp3"
wav, sample_rate = torchaudio.load(filename)
print(wav.shape, sample_rate)

torch.Size([2, 1480704]) 44100


In [7]:
def stereo_to_mono(source_path: str, output_path: str):
    """将双声道音频转换为单声道音频"""
    files = os.listdir(source_path)
    print('file count', len(files), files[0:3])
    files = [i for i in files if i.endswith('.mp3')]
    print('file count', len(files))
    for file in files:
        file_path = os.path.join(source_path, file)

        sound = AudioSegment.from_mp3(file_path)
        sound = sound.set_channels(1)
        sound = sound.set_frame_rate(16000)

        output = os.path.join(output_path, file)[:-3] + "wav"
        try:
            sound.export(output, format="wav")
        except FileNotFoundError:
            os.makedirs(output_path)
            sound.export(output, format="wav")
        # print(file_path, output)

In [9]:
# 转换训练数据
c, a, b = view_channels(path="./data/train")
if a != c:
    stereo_to_mono(source_path="./MADReSS-23-train/train", output_path="./data/train")
# 转换样例数据Spanish
c, a, b = view_channels(path="./data/sample")
if a != c:
    stereo_to_mono(source_path="./MADReSS-23-train/sample", output_path="./data/sample")
#
c, a, b = view_channels(path="./data/ivanova-ad")
if a != c:
    stereo_to_mono(source_path="./media.talkbank.org/dementia/Spanish/Ivanova/AD", output_path="./data/ivanova-ad")
#
c, a, b = view_channels(path="./data/ivanova-hc")
if a != c:
    stereo_to_mono(source_path="./media.talkbank.org/dementia/Spanish/Ivanova/HC", output_path="./data/ivanova-hc")
#
c, a, b = view_channels(path="./data/ivanova-mci")
if a != c:
    stereo_to_mono(source_path="./media.talkbank.org/dementia/Spanish/Ivanova/MCI", output_path="./data/ivanova-mci")

file count 237
./data/train中共计237个文件，其中单声道文件数目为237, 双声道文件数目为0, 采样率为{16000}
file count 8
./data/sample中共计8个文件，其中单声道文件数目为8, 双声道文件数目为0, 采样率为{16000}
file count 74
./data/ivanova-ad中共计74个文件，其中单声道文件数目为74, 双声道文件数目为0, 采样率为{16000}
file count 197
./data/ivanova-hc中共计197个文件，其中单声道文件数目为197, 双声道文件数目为0, 采样率为{16000}
file count 90
./data/ivanova-mci中共计90个文件，其中单声道文件数目为90, 双声道文件数目为0, 采样率为{16000}


## 检查对应的时长是否发生改变

In [11]:
def check_duration(path1 : str, path2 : str):
    """检查两个对应的文件时长是否一致"""
    files1 = os.listdir(path1)
    files2 = os.listdir(path2)
    files1 = [i for i in files1 if i.endswith('.mp3')]
    files2 = [i for i in files2 if i.endswith('.wav')]
    files1.sort()
    files2.sort()
    print(len(files1), len(files2))
    count = 0
    for fl1, fl2 in zip(files1, files2):
        file1, file2 = os.path.join(path1, fl1), os.path.join(path2, fl2)
        
        wav1, sample_rate1, wav2, sample_rate2 = None, None, None, None
        print(file1, file2)

        try:
            wav1, sample_rate1 = torchaudio.load(file1)
            wav2, sample_rate2 = torchaudio.load(file2)
        except:
            print('faile to load,', file1, file2)
            
        if wav1 is not None and wav2 is not None:
            rnd = 2
            length1, length2 = round(wav1.shape[1]/sample_rate1, rnd), round(wav2.shape[1]/sample_rate2, rnd)
            if length1 > 180 or length2 > 180:
                count += 1
            print(file1, file2, wav1.shape, wav2.shape, sample_rate1, sample_rate2, length1, length2)
            assert(abs(length1 - length2) < 0.1)

    print(count)


In [12]:
check_duration(path1="./MADReSS-23-train/train", path2="./data/train")
# 可能因为格式转换的原因，导致时长略微有些变化
check_duration(path1="./MADReSS-23-train/sample", path2="./data/sample")
#check_duration(path1="./media.talkbank.org/dementia/Spanish/Ivanova/AD", path2="./data/ivanova-ad")
#check_duration(path1="./media.talkbank.org/dementia/Spanish/Ivanova/HC", path2="./data/ivanova-hc")
#check_duration(path1="./media.talkbank.org/dementia/Spanish/Ivanova/MCI", path2="./data/ivanova-mci")

237 237
./MADReSS-23-train/train/adrso002.mp3 ./data/train/adrso002.wav
./MADReSS-23-train/train/adrso002.mp3 ./data/train/adrso002.wav torch.Size([2, 3440640]) torch.Size([1, 1248305]) 44100 16000 78.02 78.02
./MADReSS-23-train/train/adrso003.mp3 ./data/train/adrso003.wav
./MADReSS-23-train/train/adrso003.mp3 ./data/train/adrso003.wav torch.Size([2, 1480704]) torch.Size([1, 537217]) 44100 16000 33.58 33.58
./MADReSS-23-train/train/adrso004.mp3 ./data/train/adrso004.wav
./MADReSS-23-train/train/adrso004.mp3 ./data/train/adrso004.wav torch.Size([2, 983040]) torch.Size([1, 356659]) 44100 16000 22.29 22.29
./MADReSS-23-train/train/adrso005.mp3 ./data/train/adrso005.wav
./MADReSS-23-train/train/adrso005.mp3 ./data/train/adrso005.wav torch.Size([2, 2721792]) torch.Size([1, 987498]) 44100 16000 61.72 61.72
./MADReSS-23-train/train/adrso006.mp3 ./data/train/adrso006.wav
./MADReSS-23-train/train/adrso006.mp3 ./data/train/adrso006.wav torch.Size([2, 1801216]) torch.Size([1, 653503]) 44100 16000

In [23]:
model_0 = getattr(hub, "hubert_large_ll60k")()
# model_0 = getattr(hub, "hubert")()
model_0.cuda()

UpstreamExpert(
  (model): HubertModel(
    (feature_extractor): ConvFeatureExtractionModel(
      (conv_layers): ModuleList(
        (0): Sequential(
          (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (1): Dropout(p=0.0, inplace=False)
          (2): Sequential(
            (0): TransposeLast()
            (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (2): TransposeLast()
          )
          (3): GELU(approximate='none')
        )
        (1): Sequential(
          (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (1): Dropout(p=0.0, inplace=False)
          (2): Sequential(
            (0): TransposeLast()
            (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (2): TransposeLast()
          )
          (3): GELU(approximate='none')
        )
        (2): Sequential(
          (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (1): Dropout(p=

In [24]:
def get_model_features(signal, model):
    hidden = np.zeros((1,1,1024))
    with torch.no_grad():
        output = model(signal.cuda())
        hidden = output["last_hidden_state"]
    hidden = hidden.cpu().numpy()
    hidden = hidden.flatten()
    return hidden


def get_features(wav, model, stride):
    length = wav.shape[1]
    n = int(length / stride)
    print('length={}, n={}.'.format(length, n))
    data = []
    for i in range(n):
        wavi = wav[:, i*stride:(i+1)*stride]
        hidden = get_model_features(wavi, model)
        data.append(hidden)
    data = np.concatenate(data)
    print('data shape', data.shape)
    return data


def save_model_features(path, model, stride=400):
    if not os.path.exists(path):
        return -1, 0, 0
    files = os.listdir(path)
    files = [os.path.join(path, i) for i in files]
    files = [i for i in files if i.endswith('.wav')]
    files.sort()
    print('file count', len(files))
    
    for file_path in files:
        print(file_path)
        wav, sample_rate = torchaudio.load(file_path)
        data = get_features(wav, model, stride)
        saveas = file_path.replace('.wav', '')
        np.savez_compressed(saveas, data=data)

#for path in ["./data/train", "./data/sample"]:
#    df_trian = save_model_features(path, model_0)

for path in ["./data/ivanova-ad", "./data/ivanova-hc", "./data/ivanova-mci"]:
    df_trian = save_model_features(path, model_0)

file count 361
./data/ivanova-ad/AD-M-57-163.wav
length=1410612, n=3526.
data shape (3610624,)
./data/ivanova-ad/AD-M-65-40.wav
length=662465, n=1656.
data shape (1695744,)
./data/ivanova-ad/AD-M-67-285.wav
length=589090, n=1472.
data shape (1507328,)
./data/ivanova-ad/AD-M-69-272.wav
length=583155, n=1457.
data shape (1491968,)
./data/ivanova-ad/AD-M-72-314.wav
length=1258250, n=3145.
data shape (3220480,)
./data/ivanova-ad/AD-M-72-44.wav
length=591830, n=1479.
data shape (1514496,)
./data/ivanova-ad/AD-M-76-194.wav
length=838844, n=2097.
data shape (2147328,)
./data/ivanova-ad/AD-M-76-42.wav
length=623595, n=1558.
data shape (1595392,)
./data/ivanova-ad/AD-M-77-270.wav
length=494404, n=1236.
data shape (1265664,)
./data/ivanova-ad/AD-M-78-155.wav
length=1135177, n=2837.
data shape (2905088,)
./data/ivanova-ad/AD-M-78-24.wav
length=1345411, n=3363.
data shape (3443712,)
./data/ivanova-ad/AD-M-79-13.wav
length=973845, n=2434.
data shape (2492416,)
./data/ivanova-ad/AD-M-79-160.wav
leng