In [1]:
import os
import json
import pandas as pd
from tqdm import tqdm
from datasets import Dataset, DatasetDict, concatenate_datasets, Audio


## CREMA-D

In [56]:
def process_crema_d(file_path):
    parts = os.path.basename(file_path).split('_')
    actor_id = parts[0]
    sentence = parts[1]
    emotion = parts[2]
    intensity = parts[3].split('.')[0]
    if intensity == 'XX':
        intensity = 'UNK'
    
    female_ids = [1002,1003,1004,1006,1007,1008,1009,1010,1012,1013,1018,1020,1021,
                1024,1025,1028,1029,1030,1037,1043,1046,1047,1049,1052,1053,1054,
                1055,1056,1058,1060,1061,1063,1072,1073,1074,1075,1076,1078,1079,
                1082,1084,1089,1091]
    gender = 'F' if int(actor_id) in female_ids else 'M'
    
    text_map = {
        'DFA': "Don't Forget A jacket",
        'IEO': "It's Eleven O' Clock",
        'IOM': "I'm On My way to the meeting"
    }
    
    return {
        'file_path': file_path,
        'emotion': emotion,
        'emotion_intensity': intensity,
        'gender': gender,
        'age_group': 'AD',
        'dataset': 'CREMA-D',
        'speaker_id': actor_id,
        'text': text_map[sentence],
        'utterance_number': '01'
    }


## RAVDESS

In [57]:
def process_ravdess(file_path):
    parts = os.path.basename(file_path).split('-')
    emotion_map = {'01': 'NEU', '02': 'CAL', '03': 'HAP', '04': 'SAD', '05': 'ANG', '06': 'FEA', '07': 'DIS', '08': 'SUR'}
    intensity_map = {'01': 'MD', '02': 'HI'}
    
    emotion = emotion_map[parts[2]]
    intensity = intensity_map[parts[3]]
    sentence = 'KTD' if parts[4] == '01' else 'DSD'
    actor_id = parts[6].split('.')[0]
    gender = 'M' if int(actor_id) % 2 == 1 else 'F'
    
    text_map = {
        'KTD': "Kids are talking by the door",
        'DSD': "Dogs are sitting by the door",
    }
    
    return {
        'file_path': file_path,
        'emotion': emotion,
        'emotion_intensity': intensity,
        'gender': gender,
        'age_group': 'AD',
        'dataset': 'RAVDESS',
        'speaker_id': actor_id,
        'text': text_map[sentence],
        'utterance_number': parts[5]
    }


## SAVEE

In [48]:
def process_savee(file_path):
    parts = os.path.basename(file_path).split('_')
    emotion_map = {'a': 'ANG', 'd': 'DIS', 'f': 'FEA', 'h': 'HAP', 'n': 'NEU', 'sa': 'SAD', 'su': 'SUR'}

    speaker_id = parts[0]
    emotion = emotion_map[parts[1][0:2] if parts[1].startswith('sa') or parts[1].startswith('su') else parts[1][0]]
    utterance_number = parts[1][-2:]
    
    return {
        'file_path': file_path,
        'emotion': emotion,
        'emotion_intensity': 'UNK',
        'gender': 'M',
        'age_group': 'AD',
        'dataset': 'SAVEE',
        'speaker_id': speaker_id,
        'text': '',
        'utterance_number': utterance_number
    }


## TESS

In [5]:
def process_tess(file_path):
    parts = os.path.basename(file_path).split('_')
    emotion_map = {'ang': 'ANG', 'angry': 'ANG', 'dis': 'DIS', 'disgust': 'DIS', 'fea': 'FEA', 'fear': 'FEA', 'hap': 'HAP', 'happy': 'HAP', 'neu': 'NEU', 'neutral': 'NEU', 'ps': 'SUR', 'sad': 'SAD'}
    
    speaker_id = parts[0]
    word = parts[1]
    emotion = emotion_map[parts[2].split('.')[0]]
    age_group = 'SE' if speaker_id == 'OAF' else 'AD'
    
    return {
        'file_path': file_path,
        'emotion': emotion,
        'emotion_intensity': 'UNK',
        'gender': 'F',
        'age_group': age_group,
        'dataset': 'TESS',
        'speaker_id': speaker_id,
        'text': f"Say the word {word}",
        'utterance_number': '01'
    }


## ESD-HLT_Labs

In [None]:
def process_esd_hlt_labs(file_path):
    
    return {
        'file_path': file_path,
        'emotion': ,
        'emotion_intensity': ,
        'gender': ,
        'age_group': ,
        'dataset': 'ESD_HLT_LABS',
        'speaker_id': ,
        'text': ,
        'utterance_number':
    }


## Main function


In [6]:
# def resample_audio(input_file, output_file, target_sr=16000):
#     """Resample audio file to 16kHz using ffmpeg"""
#     cmd = [
#         'ffmpeg',
#         '-i', input_file,
#         '-ar', str(target_sr),
#         '-ac', '1',
#         '-y', output_file
#     ]
#     try:
#         result = subprocess.run(cmd, check=True, capture_output=True, text=True)
#         return True
#     except subprocess.CalledProcessError as e:
#         print(f"Error resampling {input_file}:")
#         print(f"Command: {' '.join(cmd)}")
#         print(f"Return code: {e.returncode}")
#         print(f"stdout: {e.stdout}")
#         print(f"stderr: {e.stderr}")
#         return False


In [7]:
# def process_and_copy_audio(input_file, output_dir, metadata):
#     """Process audio file and copy to output directory with new naming convention"""
#     new_filename = f"{metadata['emotion']}_{metadata['emotion_intensity']}_{metadata['gender']}_{metadata['age_group']}_{metadata['dataset']}_{metadata['speaker_id']}_{metadata['utterance_type']}_{metadata['sentence'] or metadata['word']}_{metadata['utterance_number']}.wav"
#     output_file = os.path.join(output_dir, new_filename)
#     if resample_audio(input_file, output_file):
#         return output_file
#     else:
#         return None


In [10]:
def process_dataset(dataset_path, process_func):
    data = []
    for root, _, files in os.walk(dataset_path):
        for file in tqdm(files, desc=f"Processing {os.path.basename(root)}"):
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                data.append(process_func(file_path))
    return Dataset.from_pandas(pd.DataFrame(data))


In [11]:
BASE_PATH = '/mnt/nvme/DATASETS/audio_ds/emotion_recog/'
DATASET_NAME = 'ssi_emotion_recog'

OUTPUT_PATH = BASE_PATH + DATASET_NAME
if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)
else:
    import shutil
    shutil.rmtree(OUTPUT_PATH)
    os.makedirs(OUTPUT_PATH)

crema_d = process_dataset(os.path.join(BASE_PATH, 'CREMA-D', 'data'), process_crema_d)
ravdess = process_dataset(os.path.join(BASE_PATH, 'RAVDESS', 'data'), process_ravdess)
savee = process_dataset(os.path.join(BASE_PATH, 'SAVEE', 'data'), process_savee)
tess = process_dataset(os.path.join(BASE_PATH, 'TESS', 'data'), process_tess)
esd_hlt_labs = process_dataset(os.path.join(BASE_PATH, 'ESD-HLT_LABS', 'data'), process_esd_hlt_labs)


Processing data: 100%|██████████| 7442/7442 [00:00<00:00, 606591.99it/s]
Processing data: 0it [00:00, ?it/s]
Processing Actor_01: 100%|██████████| 60/60 [00:00<00:00, 227745.01it/s]
Processing Actor_02: 100%|██████████| 60/60 [00:00<00:00, 231943.08it/s]
Processing Actor_03: 100%|██████████| 60/60 [00:00<00:00, 408536.10it/s]
Processing Actor_04: 100%|██████████| 60/60 [00:00<00:00, 271769.16it/s]
Processing Actor_05: 100%|██████████| 60/60 [00:00<00:00, 413231.92it/s]
Processing Actor_06: 100%|██████████| 60/60 [00:00<00:00, 339162.05it/s]
Processing Actor_07: 100%|██████████| 60/60 [00:00<00:00, 258376.02it/s]
Processing Actor_08: 100%|██████████| 60/60 [00:00<00:00, 390167.81it/s]
Processing Actor_09: 100%|██████████| 60/60 [00:00<00:00, 411879.28it/s]
Processing Actor_10: 100%|██████████| 60/60 [00:00<00:00, 204766.67it/s]
Processing Actor_11: 100%|██████████| 60/60 [00:00<00:00, 278383.01it/s]
Processing Actor_12: 100%|██████████| 60/60 [00:00<00:00, 290934.38it/s]
Processing Acto

In [12]:
# Combine datasets
combined_dataset = concatenate_datasets([crema_d, ravdess, savee, tess])
combined_dataset


Dataset({
    features: ['file_path', 'emotion', 'emotion_intensity', 'gender', 'age_group', 'dataset', 'speaker_id', 'utterance_type', 'sentence', 'word', 'utterance_number'],
    num_rows: 12162
})

In [13]:
# Create train/validation/test splits
TRAIN_SAMPLES = 10_000
TEST_SAMPLES = 2_000
VALID_SAMPLES = 162

if TRAIN_SAMPLES + TEST_SAMPLES + VALID_SAMPLES != len(combined_dataset):
    raise ValueError('Train, test, and validation sample counts do not match the dataset size')

train_testvalid = combined_dataset.train_test_split(test_size=(TEST_SAMPLES + VALID_SAMPLES) / len(combined_dataset), seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=VALID_SAMPLES / (TEST_SAMPLES + VALID_SAMPLES), seed=42)

dataset_dict = DatasetDict({
    'train': train_testvalid['train'],
    'validation': test_valid['train'],
    'test': test_valid['test']
})


In [14]:
# Add audio feature
dataset_dict = dataset_dict.cast_column("file_path", Audio(sampling_rate=16000))


In [15]:
# Save dataset
dataset_dict.save_to_disk(OUTPUT_PATH)


Saving the dataset (0/3 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1999 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/163 [00:00<?, ? examples/s]

In [43]:
# Visualize some random samples
import IPython.display as ipd
import random
from pprint import pprint

sample = random.choice(dataset_dict['train'])
audio = sample['file_path']['array']
sample.pop('file_path')
pprint(sample)
ipd.Audio(audio, rate=16000)


{'age_group': 'SE',
 'dataset': 'TESS',
 'emotion': 'ANG',
 'emotion_intensity': 'UNK',
 'gender': 'F',
 'sentence': '',
 'speaker_id': 'OAF',
 'utterance_number': '01',
 'utterance_type': 'WOR',
 'word': 'fit'}


In [None]:
# Create metadata
metadata = {
    'dataset_name': 'Combined Emotion Speech Dataset',
    'version': '1.0.0',
    'description': 'A combined dataset of CREMA-D, RAVDESS, SAVEE, and TESS emotion speech datasets.',
    'homepage': 'https://huggingface.co/datasets/stapesai/ssi-speech-emotion-recognition',
    'license': 'See individual dataset licenses',
    'citation': 'Please cite the original datasets',
    'original_datasets': ['CREMA-D', 'RAVDESS', 'SAVEE', 'TESS']
}

with open('metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)


In [50]:
# Login to Hugging Face
from huggingface_hub import login, auth_check
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [51]:
dataset_dict.push_to_hub(
    'stapesai/ssi-speech-emotion-recognition',
    commit_message=None,
    commit_description=None,
)


Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/3334 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Map:   0%|          | 0/3333 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Map:   0%|          | 0/3333 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1999 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/163 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/stapesai/ssi-speech-emotion-recognition/commit/8b17864fcdfcc47fc0fa8aa1d8d448f479c22b7c', commit_message='Upload dataset', commit_description='', oid='8b17864fcdfcc47fc0fa8aa1d8d448f479c22b7c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/stapesai/ssi-speech-emotion-recognition', endpoint='https://huggingface.co', repo_type='dataset', repo_id='stapesai/ssi-speech-emotion-recognition'), pr_revision=None, pr_num=None)