# Download DAIC-WOZ Dataset

In [1]:
import pandas as pd
import requests
from io import StringIO
from bs4 import BeautifulSoup
import re
from lxml import etree
import time 
import random
import os

In [4]:
requests.packages.urllib3.disable_warnings() # to disable the warning


def create_soup(url):

    user_agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
                }
    resp = requests.get(url, headers=user_agent, verify=False)
    if resp.ok:
        return BeautifulSoup(resp.text,'html.parser')
    else:
        print('Error:',resp.status_code)
        return

In [13]:
# Get the download links for the DAIC-WOZ dataset
DAIC_WOZ_page_url = r'https://dcapswoz.ict.usc.edu/wwwdaicwoz/'
soup = create_soup(DAIC_WOZ_page_url)
raw_data_url = soup.find_all('a',href=re.compile(r'_P.zip'))
ids = [id.get('href') for id in raw_data_url]
# https://dcapswoz.ict.usc.edu/wwwdaicwoz/311_P.zip
urls = [DAIC_WOZ_page_url + id for id in ids]


In [None]:
# Download the DAIC-WOZ dataset
for url in urls:
    r = requests.get(url, stream=True)
    with open(f'.../data/raw/DAIC_WOZ/{url.split('/')[-1]}', 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    time.sleep(random.randint(1, 3)) # sleep for a while to avoid being blocked by the server
    print(f'{url.split("/")[-1]} downloaded')
print('All files downloaded')


Install with aiohttp.

In [None]:
# !pip install nest_asyncio
import os
import aiohttp
import asyncio
import nest_asyncio

# 允许嵌套事件循环
nest_asyncio.apply()

async def download_file(session, url, file_path, semaphore):
    async with semaphore:
        try:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=60*60*60)) as response:  # 增加超时时间
                os.makedirs(os.path.dirname(file_path), exist_ok=True)
                with open(file_path, 'wb') as f:
                    while True:
                        chunk = await response.content.read(8192)
                        if not chunk:
                            break
                        f.write(chunk)
        except asyncio.TimeoutError:
            print(f"Timeout error for URL: {url}")

async def main(urls):
    semaphore = asyncio.Semaphore(max_concurrency)  # 设置最大并发数
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url in urls:
            file_path = f'../../data/raw/DAIC_WOZ/{url.split("/")[-1]}'
            tasks.append(download_file(session, url, file_path, semaphore))
        await asyncio.gather(*tasks)

max_concurrency = 5  # 最大并发数

# 在 Jupyter Notebook 中运行异步函数
await main(urls)

# Preprocessing

## Unzip the dataset and keep necessary files

In [None]:
# Unzip the downloaded zip files and only keep the .wav and TRANSCRIPT.csv files
import zipfile
import os
import shutil

# Define the directory where the zip files are stored
zip_dir = '../../data/raw/DAIC_WOZ/'

# Define the directory where the unzipped files will be stored
unzip_dir = '../../data/raw/DAIC_WOZ_unzipped/'

# Delete the directory if it already exists
shutil.rmtree(unzip_dir, ignore_errors=True)
# Create the directory if it doesn't exist
os.makedirs(unzip_dir, exist_ok=True)

# Unzip the files
for file in os.listdir(zip_dir):
    if file.endswith('.zip'):
        with zipfile.ZipFile(zip_dir + file, 'r') as zip_ref:
            zip_ref.extractall(unzip_dir)


# Define the directory where the .wav and TRANSCRIPT.csv files will be stored
final_dir = '../../data/raw/DAIC_WOZ_final/'

# Delete the directory if it already exists
shutil.rmtree(final_dir, ignore_errors=True)
# Create the directory if it doesn't exist
os.makedirs(final_dir, exist_ok=True)

# Move the .wav and TRANSCRIPT.csv files to the final directory
for root, dirs, files in os.walk(unzip_dir):
    for file in files:
        if file.endswith('.wav') or file.endswith('TRANSCRIPT.csv'):
            shutil.move(os.path.join(root, file), final_dir)
            

## Slice the audio data and match with transcript

Please download ffmpeg software and finish enviroment configuration.

In [42]:
import os
import pandas as pd
# Define the directory where the sliced .wav and .txt files are stored
sliced_dir = '../../data/raw/DAIC_WOZ_sliced/'

# Delete the directory if it already exists
shutil.rmtree(sliced_dir, ignore_errors=True)
# Create the directory if it doesn't exist
os.makedirs(sliced_dir, exist_ok=True)

# List the files in the directory
os.listdir(final_dir)


# If need, delete this file '._487_TRANSCRIPT.csv'
try:
    os.remove('../../data/raw/DAIC_WOZ_final/._487_TRANSCRIPT.csv')
except FileNotFoundError:
    pass


# Load the TRANSCRIPT.csv file, which contains the transcriptions of the audio files
for file in os.listdir(final_dir):
    if file.endswith('TRANSCRIPT.csv'):
        # Define the audio file
        audio_file = file.replace('TRANSCRIPT.csv', 'AUDIO.wav')
        # Standardize audio files to a 16kHz sampling rate
        os.system(f'ffmpeg -i {final_dir + audio_file} -ar 16000 {final_dir + "16k" + audio_file}')
        # Load the TRANSCRIPT.csv file
        transcript = pd.read_csv(final_dir + file, sep='\t', header='infer')
        # Set column names
        transcript.columns = ['start_time', 'stop_time', 'role', 'text']
        # Keep only the 'Participant' role
        transcript = transcript[transcript['role'] == 'Participant']
        # Keep only rows where the text is not empty
        transcript = transcript[transcript['text'].notnull()]
        # filter only start_time-stop_time >= 3
        transcript = transcript[transcript['stop_time'] - transcript['start_time'] >= 3]
        # Reset the index
        transcript.reset_index(drop=True, inplace=True)

        # For every row, save the text to a .txt file and slice the corresponding audio file
        for i in range(len(transcript)):            
            # Save the text to a .txt file
            with open(sliced_dir + f'{file.split("_")[0]}_{i}.txt', 'w') as f:
                f.write(transcript['text'][i])
            # Slice the audio file
            start_time = transcript['start_time'][i]
            stop_time = transcript['stop_time'][i]
            # Convert the start and stop times to the correct format: HH:MM:SS
            start_time = f'{int(start_time/3600):02d}:{int((start_time%3600)/60):02d}:{int(start_time%60):02d}'
            stop_time = f'{int(stop_time/3600):02d}:{int((stop_time%3600)/60):02d}:{int(stop_time%60):02d}'
            # Slice the audio file
            os.system(f'ffmpeg -i {final_dir + "16k" + audio_file} -ss {start_time} -to {stop_time} {sliced_dir}{file.split("_")[0]}_{i}.wav')
        

## Audio and text data cleaning and augmentation

In [63]:
import re
def text_data_cleaning(file_path):
    # Read the text file, remove special characters and digits, convert the text to lowercase, and return the cleaned text to the text file
    with open(file_path, 'r') as f:
        text = f.read()
        # Remove special characters 
        text = re.sub(r'[^a-zA-Z\s\d]', '', text).strip()
        # No need to remove stopwords
        # Convert the text to lowercase
        text = text.lower()
    # Write the cleaned text to the text file
    with open(file_path, 'w') as f:
        f.write(text)
    return



In [None]:
for file in os.listdir(sliced_dir):
    if file.endswith('.txt'):
        text_data_cleaning(sliced_dir + file)

In [44]:
# !pip install -q noisereduce
# !pip install torchaudio
# !pip install -q librosa

In [61]:
import librosa
import numpy as np
import soundfile as sf
import noisereduce as nr
import random
import torch
import torchaudio
import torchaudio.transforms as T
import os

def clean_audio(input_path, output_path, target_sr=16000):
    """
    数据清洗：
    1. 采样率转换
    2. 响度归一化
    3. 降噪
    4. 静音移除
    """
    # 读取音频
    audio, sr = librosa.load(input_path, sr=None)
    
    # 采样率转换
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
    
    # 响度归一化（标准化到 [-1, 1]）
    audio = audio / np.max(np.abs(audio))

    # 降噪处理
    audio = nr.reduce_noise(y=audio, sr=target_sr)

    # 静音移除，如果全部为静音，则删除该音频
    non_silent_intervals = librosa.effects.split(audio, top_db=20)  # top_db 控制静音阈值
    try:
        audio = np.concatenate([audio[start:end] for start, end in non_silent_intervals])
    except ValueError:
        os.remove(input_path)
        print(f'{input_path} is removed because it is silent.')
        return

    # 保存处理后的音频
    sf.write(output_path, audio, target_sr)

In [46]:
sliced_dir = '../../data/raw/DAIC_WOZ_sliced/'
cleaned_dir = '../../data/raw/DAIC_WOZ_cleaned/'
# Delete the directory if it already exists
shutil.rmtree(cleaned_dir)
# Create the directory if it doesn't exist
os.makedirs(cleaned_dir, exist_ok=True)

for file in os.listdir(sliced_dir):
    if file.endswith('.wav'):
        clean_audio(sliced_dir + file, cleaned_dir + file)

  audio = audio / np.max(np.abs(audio))


../../data/raw/DAIC_WOZ_sliced/306_16.wav is removed because it is silent.
../../data/raw/DAIC_WOZ_sliced/315_4.wav is removed because it is silent.
../../data/raw/DAIC_WOZ_sliced/315_42.wav is removed because it is silent.
../../data/raw/DAIC_WOZ_sliced/328_51.wav is removed because it is silent.
../../data/raw/DAIC_WOZ_sliced/328_56.wav is removed because it is silent.
../../data/raw/DAIC_WOZ_sliced/328_60.wav is removed because it is silent.
../../data/raw/DAIC_WOZ_sliced/335_16.wav is removed because it is silent.
../../data/raw/DAIC_WOZ_sliced/353_38.wav is removed because it is silent.
../../data/raw/DAIC_WOZ_sliced/359_30.wav is removed because it is silent.
../../data/raw/DAIC_WOZ_sliced/362_12.wav is removed because it is silent.
../../data/raw/DAIC_WOZ_sliced/395_43.wav is removed because it is silent.
../../data/raw/DAIC_WOZ_sliced/416_43.wav is removed because it is silent.
../../data/raw/DAIC_WOZ_sliced/421_51.wav is removed because it is silent.
../../data/raw/DAIC_WOZ_sl

In [62]:
def augment_audio(input_path, output_path, target_sr=16000):
    """
    数据增强：
    1. 时间拉伸 (Time Stretching)
    2. 音高变化 (Pitch Shifting)
    3. 噪声注入 (Noise Augmentation)
    4. 频谱增强 (SpecAugment)
    """
    # 读取音频
    audio, sr = librosa.load(input_path, sr=target_sr)

    # 1. 时间拉伸
    if random.random() > 0.5:
        rate = random.uniform(0.8, 1.2)  # 在 0.8x ~ 1.2x 之间变化
        audio = librosa.effects.time_stretch(audio, rate=rate)
    
    # 2. 音高变化
    if random.random() > 0.5:
        steps = random.randint(-2, 2)  # 随机上下变 2 个半音
        audio = librosa.effects.pitch_shift(audio, sr=target_sr, n_steps=steps)

    # 3. 噪声注入
    if random.random() > 0.5:
        noise = np.random.normal(0, 0.005, audio.shape)  # 添加高斯噪声
        audio = audio + noise
        audio = np.clip(audio, -1.0, 1.0)  # 防止超出 [-1, 1]

    # 4. SpecAugment（使用 torchaudio 实现）
    if random.random() > 0.5:
        audio_tensor = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)  # 转换为 PyTorch Tensor
        mel_spec = T.MelSpectrogram(sample_rate=target_sr)(audio_tensor)
        spec_aug = T.FrequencyMasking(freq_mask_param=30)(mel_spec)  # 频率屏蔽
        spec_aug = T.TimeMasking(time_mask_param=50)(spec_aug)  # 时间屏蔽
        audio = librosa.istft(librosa.db_to_amplitude(spec_aug.squeeze(0).numpy()))  # 逆变换回时域信号

    # 保存增强后的音频
    sf.write(output_path, audio, target_sr)

# 示例调用
# clean_audio("input.wav", "cleaned.wav")
# augment_audio("cleaned.wav", "augmented.wav")

In [None]:
DAIC_WOZ_augmented_dir = '../../data/raw/DAIC_WOZ_augmented/'

# Delete the directory if it already exists
shutil.rmtree(DAIC_WOZ_augmented_dir)
# Create the directory if it doesn't exist
os.makedirs(DAIC_WOZ_augmented_dir, exist_ok=True)

# Set the random seed
random.seed(42)

# Augment the audio files
for file in os.listdir('../../data/raw/DAIC_WOZ_cleaned/'):
    augment_audio(f'../../data/raw/DAIC_WOZ_cleaned/{file}', f'{DAIC_WOZ_augmented_dir}{file}')

  return ref * np.power(10.0, S_db * 0.1)
  ytmp = ifft_window * fft.irfft(stft_matrix[..., bl_s:bl_t], n=n_fft, axis=-2)
  ytmp = ifft_window * fft.irfft(stft_matrix[..., :start_frame], n=n_fft, axis=-2)


In [None]:
# Move the augmented audio files and their corresponding text files to the processed directory
processed_audio_dir = '../../data/processed/DAIC_WOZ/audio/'
processed_text_dir = '../../data/processed/DAIC_WOZ/text/'
# Delete the directory if it already exists
shutil.rmtree(processed_audio_dir, ignore_errors=True)
shutil.rmtree(processed_text_dir, ignore_errors=True)
# Create the directory if it doesn't exist
os.makedirs(processed_audio_dir, exist_ok=True)
os.makedirs(processed_text_dir, exist_ok=True)

for file in os.listdir(DAIC_WOZ_augmented_dir):
    if file.endswith('.wav'):
        shutil.move(DAIC_WOZ_augmented_dir + file, processed_audio_dir + file)
        # Move the corresponding text file
        try:
            shutil.move(sliced_dir + file.replace('.wav', '.txt'), processed_text_dir + file.replace('.wav', '.txt'))
        except FileNotFoundError:
            pass

In [79]:
# Split the data into training, validation, and test sets at a ratio of 80:10:10, and copy the files to the corresponding splits directorie
# e.g. data/splits/DAIC_WOZ/{split}/audio
import random
import shutil
import os

processed_audio_dir = '../../data/processed/DAIC_WOZ/audio/'
processed_text_dir = '../../data/processed/DAIC_WOZ/text/'


data_split_list = ['train', 'val', 'test']
for split in data_split_list:
    # Delete the directory if it already exists
    shutil.rmtree(f'../../data/splits/DAIC_WOZ/{split}/audio/', ignore_errors=True)
    shutil.rmtree(f'../../data/splits/DAIC_WOZ/{split}/text/', ignore_errors=True)
    # Create the directory if it doesn't exist
    os.makedirs(f'../../data/splits/DAIC_WOZ/{split}/audio/', exist_ok=True)
    os.makedirs(f'../../data/splits/DAIC_WOZ/{split}/text/', exist_ok=True)


# Set the random seed
random.seed(42)

# Randomly assign the files to the training, validation, and test sets, and copy the files to the corresponding directories
for file in os.listdir(processed_audio_dir):
    split = random.choices(data_split_list, weights=[0.8, 0.1, 0.1], k=1)[0]
    shutil.copy(processed_audio_dir + file, f'../../data/splits/DAIC_WOZ/{split}/audio/{file}')
    try:
        shutil.copy(processed_text_dir + file.replace('.wav', '.txt'), f'../../data/splits/DAIC_WOZ/{split}/text/{file.replace(".wav", ".txt")}')
    except FileNotFoundError:
        pass

In [None]:
# Remove the directories that are no longer needed
try:
    shutil.rmtree('../../data/raw/DAIC_WOZ_unzipped/')
    shutil.rmtree('../../data/raw/DAIC_WOZ_final/')
    shutil.rmtree('../../data/raw/DAIC_WOZ_sliced/')
    shutil.rmtree('../../data/raw/DAIC_WOZ_cleaned/')
    shutil.rmtree('../../data/raw/DAIC_WOZ_augmented/')
except FileNotFoundError:
    pass

# RAVDESS preprocessing

In [None]:
# Download the RAVDESS dataset
RAVDESS_url = 'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip?download=1'
r = requests.get(RAVDESS_url, stream=True)
with open(f'.../data/raw/Audio_Speech_Actors_01-24.zip', 'wb') as f:
    for chunk in r.iter_content(chunk_size=8192):
        if chunk:
            f.write(chunk)

SyntaxError: f-string: unmatched '(' (2858528579.py, line 4)

In [51]:
# Unzip the downloaded zip files and only keep the .wav and TRANSCRIPT.csv files
import zipfile
import os
import shutil

# Define the directory where the zip files are stored
zip_dir = '../../data/raw/'

# Define the directory where the unzipped files will be stored
unzip_dir = '../../data/raw/Audio_Speech_Actors_01-24/'

# Delete the directory if it already exists
shutil.rmtree(unzip_dir, ignore_errors=True)
# Create the directory if it doesn't exist
os.makedirs(unzip_dir, exist_ok=True)

# Unzip the files
file = 'Audio_Speech_Actors_01-24.zip'
with zipfile.ZipFile(zip_dir + file, 'r') as zip_ref:
    zip_ref.extractall(unzip_dir)
# Delete the zip file
# os.remove(zip_dir + file)

In [52]:
# Define the directory where the raw files are stored
RAVDESS_raw_dir = '../../data/raw/RAVDESS/'

# Delete the directory if it already exists
shutil.rmtree(RAVDESS_raw_dir, ignore_errors=True)
# Create the directory if it doesn't exist
os.makedirs(RAVDESS_raw_dir, exist_ok=True)

# For all the folders in the unzipped directory, move the .wav files to the RAVDESS_raw_dir
for folder in os.listdir(unzip_dir):
    for file in os.listdir(unzip_dir + folder):
        if file.endswith('.wav'):
            shutil.move(unzip_dir + folder + '/' + file, RAVDESS_raw_dir)

# Delete the unzipped directory
# shutil.rmtree(unzip_dir)

Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
Vocal channel (01 = speech, 02 = song).
Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
Repetition (01 = 1st repetition, 02 = 2nd repetition).
Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

In [53]:
# Define the directory where the labeled files are stored
RAVDESS_labeled_dir = '../../data/raw/RAVDESS_labeled/'

# Create the directory if it doesn't exist
os.makedirs(RAVDESS_labeled_dir, exist_ok=True)

# Label the RAVDESS dataset
# Define the emotions: Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised)
emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

# For all the files in the RAVDESS_raw_dir, modify the name to be original name+label, and move the files to the RAVDESS_labeled_dir with the correct emotion label
for file in os.listdir(RAVDESS_raw_dir):
    # Get the emotion label
    emotion = emotions[int(file.split('-')[2]) - 1]
    # Modify the name
    new_name = file.split('.')[0] + '-' + emotion + '.wav'
    # Move the file
    shutil.move(RAVDESS_raw_dir + file, RAVDESS_labeled_dir + new_name)

In [72]:
# Audio cleaning and augmentation

RAVDESS_labeled_dir = '../../data/raw/RAVDESS_labeled/'

# Define the directory where the cleaned audio files will be stored
RAVDESS_cleaned_dir = '../../data/raw/RAVDESS_cleaned/'


# Delete the directory if it already exists
shutil.rmtree(RAVDESS_cleaned_dir, ignore_errors=True)
# Create the directory if it doesn't exist
os.makedirs(RAVDESS_cleaned_dir, exist_ok=True)

# Clean the audio files
for file in os.listdir(RAVDESS_labeled_dir):
    clean_audio(RAVDESS_labeled_dir + file, RAVDESS_cleaned_dir + file)

# Define the directory where the augmented audio files will be stored
RAVDESS_augmented_dir = '../../data/raw/RAVDESS_augmented/'

# Delete the directory if it already exists
shutil.rmtree(RAVDESS_augmented_dir, ignore_errors=True)
# Create the directory if it doesn't exist
os.makedirs(RAVDESS_augmented_dir, exist_ok=True)

# Augment the audio files
for file in os.listdir(RAVDESS_cleaned_dir):
    augment_audio(RAVDESS_cleaned_dir + file, RAVDESS_augmented_dir + file)

  return ref * np.power(10.0, S_db * 0.1)
  ytmp = ifft_window * fft.irfft(stft_matrix[..., bl_s:bl_t], n=n_fft, axis=-2)


In [73]:
# Move the augmented audio files to the processed directory
processed_audio_dir = '../../data/processed/RAVDESS/audio/'

# Delete the directory if it already exists
shutil.rmtree(processed_audio_dir, ignore_errors=True)
# Create the directory if it doesn't exist
os.makedirs(processed_audio_dir, exist_ok=True)

for file in os.listdir(RAVDESS_augmented_dir):
    shutil.move(RAVDESS_augmented_dir + file, processed_audio_dir + file)

In [80]:
# Split the data into training, validation, and test sets at a ratio of 80:10:10, and copy the files to the corresponding splits directorie
# e.g. data/splits/RAVDESS/{split}/audio
import random
import shutil
import os

processed_audio_dir = '../../data/processed/RAVDESS/audio/'

data_split_list = ['train', 'val', 'test']
for split in data_split_list:
    # Delete the directory if it already exists
    shutil.rmtree(f'../../data/splits/RAVDESS/{split}/audio/', ignore_errors=True)
    # Create the directory if it doesn't exist
    os.makedirs(f'../../data/splits/RAVDESS/{split}/audio/', exist_ok=True)

# Set the random seed
random.seed(42)

# Randomly assign the files to the training, validation, and test sets, and copy the files to the corresponding directories
for file in os.listdir(processed_audio_dir):
    split = random.choices(data_split_list, weights=[0.8, 0.1, 0.1], k=1)[0]
    shutil.copy(processed_audio_dir + file, f'../../data/splits/RAVDESS/{split}/audio/{file}')

In [None]:
# Remove the directories that are no longer needed
try:
    shutil.rmtree('../../data/raw/RAVDESS/')
    shutil.rmtree('../../data/raw/RAVDESS_cleaned/')
    shutil.rmtree('../../data/raw/RAVDESS_augmented/')
    shutil.rmtree('../../data/raw/RAVDESS_labeled/')
    shutil.rmtree('../../data/raw/Audio_Speech_Actors_01-24/')
except FileNotFoundError:
    pass

# MELD preprocessing

In [None]:
# Download the MELD dataset
MELD_url = 'https://huggingface.co/datasets/declare-lab/MELD/resolve/main/MELD.Raw.tar.gz'
r = requests.get(MELD_url, stream=True)
with open(f'../../data/raw/MELD.Raw.tar.gz', 'wb') as f:
    for chunk in r.iter_content(chunk_size=8192):
        if chunk:
            f.write(chunk)

In [58]:
import tarfile

# 解压缩 .tar.gz 文件
def extract_tar_gz(file_path, extract_path):
    with tarfile.open(file_path, 'r:gz') as tar:
        tar.extractall(path=extract_path)

# 示例
file_path = '../../data/raw/MELD.Raw.tar.gz'
extract_path = '../../data/raw/MELD_raw/'

# Delete the directory if it already exists
shutil.rmtree(extract_path, ignore_errors=True)
# Create the directory if it doesn't exist
os.makedirs(extract_path, exist_ok=True)

extract_tar_gz(file_path, extract_path)

In [59]:
# Countinue to extract the .tar.gz files
split_list = ['train', 'dev', 'test']
for split in split_list:
    file_path = f'../../data/raw/MELD_raw/MELD.Raw/{split}.tar.gz'
    # Delete the directory if it already exists
    shutil.rmtree(f'../../data/raw/MELD_raw/MELD.Raw/{split}/', ignore_errors=True)
    # Create the directory if it doesn't exist
    os.makedirs(f'../../data/raw/MELD_raw/MELD.Raw/{split}/', exist_ok=True)
    extract_path = f'../../data/raw/MELD_raw/MELD.Raw/{split}/'
    extract_tar_gz(file_path, extract_path)

In [None]:
# Move the dev_sent_emo.csv and test_sent_emo.csv to corresponding directories
try:
    shutil.move(f'../../data/raw/MELD_raw/MELD.Raw/dev_sent_emo.csv', f'../../data/raw/MELD_raw/MELD.Raw/dev/dev_sent_emo.csv')
except FileNotFoundError:
    pass
try:
    shutil.move(f'../../data/raw/MELD_raw/MELD.Raw/test_sent_emo.csv', f'../../data/raw/MELD_raw/MELD.Raw/test/test_sent_emo.csv')
except FileNotFoundError:
    pass

# Change the folder name to the correct name, e.g. dev_splits_complete -> dev_splits, output_repeated_splits_test -> test_splits
try:
    os.rename('../../data/raw/MELD_raw/MELD.Raw/dev/dev_splits_complete', '../../data/raw/MELD_raw/MELD.Raw/dev/dev_splits')
    os.rename('../../data/raw/MELD_raw/MELD.Raw/test/output_repeated_splits_test', '../../data/raw/MELD_raw/MELD.Raw/test/test_splits')
except FileNotFoundError:
    pass

# Delete files not in the correct format, e.g. not starting with 'dia'

try:
    for root, dirs, files in os.walk('../../data/raw/MELD_raw/MELD.Raw/test/test_splits/'):
        for file in files:
            if file.startswith('dia') == False:
                os.remove(os.path.join(root, file))
except FileNotFoundError:
    pass



In [None]:
# Label the splitted files with information from the .csv files, and move the labeled audio files to the labeled directory
import pandas as pd
import os
import shutil

split_list = ['train', 'dev', 'test']

# Convert the formats of the start and stop times format: from HH:MM:SS to seconds
def convert_time_MELD(time_1, time_2):
    h, m, s = time_1.split(':')
    ms = time_2
    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000

In [None]:
for split in split_list:
    # Define the directory where the raw files are stored
    MELD_raw_dir = f'../../data/raw/MELD_raw/MELD.Raw/{split}/{split}_splits/'

    # Define the directory where the labeled files are stored
    MELD_labeled_audio_dir = f'../../data/raw/MELD_labeled/{split}/audio/'
    MELD_labeled_text_dir = f'../../data/raw/MELD_labeled/{split}/text/'

    # Delete the directory if it already exists
    shutil.rmtree(MELD_labeled_audio_dir, ignore_errors=True)
    shutil.rmtree(MELD_labeled_text_dir, ignore_errors=True)

    # Create the directory if it doesn't exist
    os.makedirs(MELD_labeled_audio_dir, exist_ok=True)
    os.makedirs(MELD_labeled_text_dir, exist_ok=True)

    # Load the .csv file
    df = pd.read_csv(f'../../data/raw/MELD_raw/MELD.Raw/{split}/{split}_sent_emo.csv', header='infer')

    # For all the files in the MELD_raw_dir, modify the name to be original name+label, generate the transcripts txt file, and move the files to the MELD_labeled_dir with the correct emotion label
    for file in os.listdir(MELD_raw_dir):
        dia_num = file.split('_')[0].split('dia')[1]
        utt_num = file.split('_')[1].split('utt')[1].split('.')[0]
        # Filter if duration > 3s， if can't find the start_time, skip
        if len(df[(df['Dialogue_ID'] == int(dia_num)) & (df['Utterance_ID'] == int(utt_num))]) > 0:
            start_time = df[(df['Dialogue_ID'] == int(dia_num)) & (df['Utterance_ID'] == int(utt_num))]['StartTime'].values[0]
            end_time = df[(df['Dialogue_ID'] == int(dia_num)) & (df['Utterance_ID'] == int(utt_num))]['EndTime'].values[0]
            start_time_in_sec = convert_time_MELD( start_time.split(',')[0], start_time.split(',')[1])
            end_time_in_sec = convert_time_MELD( end_time.split(',')[0], end_time.split(',')[1])
            if end_time_in_sec - start_time_in_sec > 3:
                # Get the emotion label
                emotion = df[(df['Dialogue_ID'] == int(dia_num)) & (df['Utterance_ID'] == int(utt_num))]['Emotion'].values[0]
                # Modify the name
                audio_new_name = file.split('.')[0] + '_' + emotion + '.wav'
                text_new_name = file.split('.')[0] + '_' + emotion + '.txt'
                # retrieve the transcript
                transcript = df[(df['Dialogue_ID'] == int(dia_num)) & (df['Utterance_ID'] == int(utt_num))]['Utterance'].values[0]
                # Transfer the .mp4 file to .wav file and move the file
                os.system(f'ffmpeg -i {MELD_raw_dir + file} -ar 16000 {MELD_labeled_audio_dir + audio_new_name}')
                # Write the transcript to a text file
                with open(MELD_labeled_text_dir + text_new_name, 'w') as f:
                    f.write(transcript)




In [64]:
# Audio cleaning and augmentation
for split in split_list:
    # Define the directory where the cleaned audio files will be stored
    MELD_cleaned_dir = f'../../data/raw/MELD_cleaned/{split}/'

    # Delete the directory if it already exists
    shutil.rmtree(MELD_cleaned_dir, ignore_errors=True)
    # Create the directory if it doesn't exist
    os.makedirs(MELD_cleaned_dir, exist_ok=True)

    # Clean the audio files
    for file in os.listdir(f'../../data/raw/MELD_labeled/{split}/audio/'):
        clean_audio(f'../../data/raw/MELD_labeled/{split}/audio/{file}', f'{MELD_cleaned_dir}{file}')

    # Define the directory where the augmented audio files will be stored
    MELD_augmented_dir = f'../../data/raw/MELD_augmented/{split}/'

    # Delete the directory if it already exists
    shutil.rmtree(MELD_augmented_dir, ignore_errors=True)
    # Create the directory if it doesn't exist
    os.makedirs(MELD_augmented_dir, exist_ok=True)

    # Augment the audio files
    for file in os.listdir(MELD_cleaned_dir):
        augment_audio(MELD_cleaned_dir + file, MELD_augmented_dir + file)

  return ref * np.power(10.0, S_db * 0.1)
  ytmp = ifft_window * fft.irfft(stft_matrix[..., bl_s:bl_t], n=n_fft, axis=-2)
  ytmp = ifft_window * fft.irfft(stft_matrix[..., :start_frame], n=n_fft, axis=-2)


In [65]:
# text data cleaning
for split in split_list:
    for file in os.listdir(f'../../data/raw/MELD_labeled/{split}/text/'):
        text_data_cleaning(f'../../data/raw/MELD_labeled/{split}/text/{file}')

In [67]:
# Move the audio and text files to the processed data directory
for split in split_list:
    processed_audio_dir = f'../../data/processed/MELD/{split}/audio/'
    processed_text_dir = f'../../data/processed/MELD/{split}/text/'
    # Delete the directory if it already exists
    shutil.rmtree(processed_audio_dir, ignore_errors=True)
    shutil.rmtree(processed_text_dir, ignore_errors=True)
    # Create the directory if it doesn't exist
    os.makedirs(processed_audio_dir, exist_ok=True)
    os.makedirs(processed_text_dir, exist_ok=True)
    for file in os.listdir(f'../../data/raw/MELD_augmented/{split}/'):
        if file.endswith('.wav'):
            shutil.move(f'../../data/raw/MELD_augmented/{split}/{file}', processed_audio_dir + file)
            # Move the corresponding text file
            try:
                shutil.move(f'../../data/raw/MELD_labeled/{split}/text/{file.replace(".wav", ".txt")}', processed_text_dir + file.replace('.wav', '.txt'))
            except FileNotFoundError:
                pass

In [74]:
# Rename the dev directory to val
os.rename('../../data/processed/MELD/dev', '../../data/processed/MELD/val')

In [75]:
# Copy the splitted MELD dataset to splits directory
shutil.copytree('../../data/processed/MELD', '../../data/splits/MELD')

'../../data/splits/MELD'

In [86]:
# Remove the directories that are no longer needed
try:
    shutil.rmtree('../../data/raw/MELD_raw/')
    shutil.rmtree('../../data/raw/MELD_labeled/')
    shutil.rmtree('../../data/raw/MELD_cleaned/')
    shutil.rmtree('../../data/raw/MELD_augmented/')
except FileNotFoundError:
    pass

## Generate transcripts for Some Audio files

In [54]:
!pip install openai-whisper

Defaulting to user installation because normal site-packages is not writeable
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
     ---------------------------------------- 0.0/800.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/800.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/800.5 kB ? eta -:--:--
     ------------- -------------------------- 262.1/800.5 kB ? eta -:--:--
     -------------------------------------- 800.5/800.5 kB 2.3 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting more-itertools (from openai-whisper)
  Downloading more_itertools-10.6.0-py3-none-any.whl.metadata (37 kB)
Collecting tiktoken 



In [55]:
# Generate transcripts for the RAVDESS dataset
# Still use the RAVDESS_labeled_dir as the directory where the transcripts are stored
# For all the files in the RAVDESS_labeled_dir, generate a transcript using ASR model such as Whisper-large-v3, and save it to a .txt file
import os
import torch
import whisper

# 设置输入和输出目录
RAVDESS_labeled_dir = '../../data/raw/RAVDESS_labeled/'
output_dir = '../../data/raw/RAVDESS_transcripts/'

# Delete the directory if it already exists
shutil.rmtree(output_dir, ignore_errors=True)
# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# 加载 Whisper-large-v3 模型
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("large-v3").to(device)

def transcribe_audio(audio_path):
    """ 使用 Whisper-large-v3 转录音频 """
    result = model.transcribe(audio_path)
    return result["text"]

# 遍历 RAVDESS 目录中的所有 WAV 文件
for filename in os.listdir(RAVDESS_labeled_dir):
    if filename.endswith(".wav"):
        audio_path = os.path.join(RAVDESS_labeled_dir, filename)
        transcript = transcribe_audio(audio_path)

        # 保存到 .txt 文件（与音频文件同名）
        txt_filename = os.path.splitext(filename)[0] + ".txt"
        txt_path = os.path.join(output_dir, txt_filename)

        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(transcript)
            

100%|█████████████████████████████████████| 2.88G/2.88G [02:23<00:00, 21.5MiB/s]


KeyboardInterrupt: 