## Imports

In [1]:
import librosa
import pandas as pd
import numpy as np
import os

## Paths

In [2]:
# Base folders (relative paths): where the raw datasets live and where the
# derived files are written.
paths = dict(
    datasets="../../../assets/audio_sentiment_data_v2/datasets",
    save_path="../../../assets/audio_sentiment_data_v2/data_features",
)

# One folder per dataset. Each ends with "/" because later cells build file
# paths by plain string concatenation.
RAV = paths["datasets"] + "/ravdess-emotional-speech-audio/"
SAVEE = paths["datasets"] + "/surrey-audiovisual-expressed-emotion-savee/ALL/"
TESS = paths["datasets"] + "/toronto-emotional-speech-set-tess/"

## Filtering the datasets based on the required emotions

### SAVEE Dataset

In [3]:
# os.listdir returns entries in arbitrary order. Sorting makes the capped
# subsets selected below identical on every machine and every run.
dir_list = sorted(os.listdir(SAVEE))

# The two characters at positions [-8:-6] of a file name carry the emotion code
# (presumably names look like "DC_a01.wav" / "DC_sa01.wav" -- confirm against the data).
# code -> (label written to the table, max number of files kept; None = keep all)
savee_rules = {
    '_a': ('angry', None),
    '_f': ('sad', 30),       # fear is folded into the 'sad' class
    '_h': ('happy', 45),
    'sa': ('sad', 30),
    'su': ('happy', 15),     # surprise is folded into the 'happy' class
    '_n': ('neutral', None),
}

emotion = []
path = []
kept = dict.fromkeys(savee_rules, 0)   # files accepted so far, per code

for i in dir_list:
    code = i[-8:-6]
    if code not in savee_rules:
        continue                       # emotion not used in this project
    label, cap = savee_rules[code]
    if cap is not None and kept[code] >= cap:
        continue                       # quota for this code already filled
    kept[code] += 1
    emotion.append(label)
    path.append(SAVEE + i)

# One row per selected file: label, dataset of origin, full path.
SAVEE_df = pd.DataFrame({'labels': emotion, 'source': 'SAVEE', 'path': path})

# Label count distribution
SAVEE_df.labels.value_counts()

neutral    120
happy       60
angry       60
sad         60
Name: labels, dtype: int64

### RAVDESS Dataset

In [4]:
dir_list = sorted(os.listdir(RAV))

# The third '-'-separated field of a file name is the emotion code.
# code -> (label written to the table, max number of files kept; None = keep all)
# Codes '02' and '07' are intentionally absent, so those files are skipped.
rav_rules = {
    '01': ('neutral', None),
    '03': ('happy', 144),
    '04': ('sad', 96),
    '05': ('angry', None),
    '06': ('sad', 96),       # fear is folded into the 'sad' class
    '08': ('happy', 48),     # surprise is folded into the 'happy' class
}

emotion = []
path = []
kept = dict.fromkeys(rav_rules, 0)     # files accepted so far, per code

for i in dir_list:
    folder = RAV + i
    if not os.path.isdir(folder):
        continue                       # stray file next to the sub-folders
    # Sorted because os.listdir order is arbitrary; keeps the output table
    # (and the capped selection) reproducible.
    for f in sorted(os.listdir(folder)):
        part = f.split('.')[0].split('-')
        if len(part) < 3 or part[2] not in rav_rules:
            continue                   # no emotion field, or emotion not used
        code = part[2]
        label, cap = rav_rules[code]
        if cap is not None and kept[code] >= cap:
            continue                   # quota for this code already filled
        kept[code] += 1
        emotion.append(label)
        path.append(folder + '/' + f)

# One row per selected file: label, dataset of origin, full path.
RAV_df = pd.DataFrame({'labels': emotion, 'source': 'RAVDESS', 'path': path})
RAV_df.labels.value_counts()

angry      192
sad        192
happy      192
neutral     96
Name: labels, dtype: int64

### TESS Dataset

In [6]:
dir_list = sorted(os.listdir(TESS))

# folder name -> (label written to the table, counter shared by the OAF/YAF
# pair, max number of files kept across that pair; None = keep all)
tess_rules = {
    'OAF_angry':              ('angry',   'angry',    None),
    'YAF_angry':              ('angry',   'angry',    None),
    'OAF_Fear':               ('sad',     'fear',     200),   # fear -> 'sad'
    'YAF_fear':               ('sad',     'fear',     200),
    'OAF_happy':              ('happy',   'happy',    300),
    'YAF_happy':              ('happy',   'happy',    300),
    'OAF_Pleasant_surprise':  ('happy',   'surprise', 100),   # surprise -> 'happy'
    'YAF_pleasant_surprised': ('happy',   'surprise', 100),
    'OAF_Sad':                ('sad',     'sad',      200),
    'YAF_sad':                ('sad',     'sad',      200),
    'OAF_neutral':            ('neutral', 'neutral',  None),
    'YAF_neutral':            ('neutral', 'neutral',  None),
}

path = []
emotion = []
kept = {}                              # files accepted so far, per counter

for i in dir_list:
    if i not in tess_rules:
        continue                       # folder (or stray file) not used here
    label, counter, cap = tess_rules[i]
    # Sorted because os.listdir order is arbitrary; otherwise the files that
    # survive a cap would differ from one machine to the next.
    for f in sorted(os.listdir(TESS + i)):
        if cap is not None and kept.get(counter, 0) >= cap:
            break                      # quota reached; rest of folder is dropped
        kept[counter] = kept.get(counter, 0) + 1
        emotion.append(label)
        path.append(TESS + i + "/" + f)

# One row per selected file: label, dataset of origin, full path.
TESS_df = pd.DataFrame({'labels': emotion, 'source': 'TESS', 'path': path})
TESS_df.labels.value_counts()

neutral    400
sad        400
angry      400
happy      400
Name: labels, dtype: int64

## Saving the file paths

In [7]:
# Stack the three per-dataset tables. ignore_index gives the combined frame a
# single 0..n-1 index instead of three overlapping ones.
df = pd.concat([SAVEE_df, RAV_df, TESS_df], axis=0, ignore_index=True)
print(df.labels.value_counts())

# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(paths['save_path'], exist_ok=True)
df.to_csv(f"{paths['save_path']}/data_paths_final.csv", index=False)

# Must be the last expression of the cell for the notebook to display it.
df.head()

sad        652
angry      652
happy      652
neutral    616
Name: labels, dtype: int64
