In [1]:
import librosa
import librosa.display as librosa_display
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import os
import time

In [2]:
import json

def prepare_IEMOCAP_DS(json_path):
    """Collect wav paths and integer emotion labels from an IEMOCAP metadata file.

    The file is read line by line. Every non-blank line must hold one record
    (a mapping) with the keys ``'emotion'`` (short emotion tag) and ``'key'``
    (path of the wav file).

    Only four classes are kept: ``ang`` -> 0, ``neu`` -> 1, ``sad`` -> 2 and
    ``hap``/``exc`` -> 3 (excited is merged into happy). Records carrying any
    other tag, including ``xxx``, are dropped.

    Args:
        json_path: Path of the metadata file.

    Returns:
        A tuple ``(wav_paths, emotions, empty_cnt)``: the kept wav paths, the
        parallel list of integer labels, and ``empty_cnt``, which is never
        incremented and is therefore always 0 (kept so existing callers that
        unpack three values keep working).

    Raises:
        ValueError, SyntaxError: If a line is neither valid JSON nor a Python
            literal.
        KeyError: If a record lacks ``'emotion'``, or a kept record lacks
            ``'key'``.
    """
    # Local import: only needed for the Python-literal fallback below.
    import ast

    label_of = {"ang": 0, "neu": 1, "sad": 2, "hap": 3, "exc": 3}

    wav_paths, emotions = [], []
    empty_cnt = 0

    # The context manager guarantees the file is closed, even on a parse error.
    with open(json_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            # Parse the record as data instead of executing it with eval():
            # JSON first, then a Python literal for single-quoted dict lines.
            try:
                cur = json.loads(line)
            except json.JSONDecodeError:
                cur = ast.literal_eval(line)

            label = label_of.get(cur['emotion'])
            if label is None:
                continue

            wav_paths.append(cur['key'])
            emotions.append(label)

    return wav_paths, emotions, empty_cnt

In [3]:
# Collect the wav paths and integer emotion labels listed in the metadata file.
wav_paths, emotions, empty_cnt = prepare_IEMOCAP_DS('iemocap_data_single_all_10.json')

In [4]:
# Sanity check: paths and labels are parallel lists, so their lengths must match.
len(wav_paths), len(emotions), empty_cnt

(5531, 5531, 0)

In [5]:
# Every clip is padded or cropped to exactly this many samples before MFCC extraction.
mean_signal_length = 100000


def _fit_to_length(signal, target_len):
    """Zero-pad (both sides) or centre-crop a 1-D signal to ``target_len`` samples.

    Returns:
        ``(resized_signal, was_padded)`` where ``was_padded`` is True only when
        the input was shorter than ``target_len``.
    """
    s_len = len(signal)
    if s_len < target_len:
        deficit = target_len - s_len
        left = deficit // 2
        # When the deficit is odd, the extra sample goes on the right side.
        right = deficit - left
        return np.pad(signal, (left, right), 'constant', constant_values=0), True

    start = (s_len - target_len) // 2
    return signal[start:start + target_len], False


def get_feature(paths, mfcc_len:int=39, flatten:bool=False):
    """Extract one fixed-length MFCC feature per audio file.

    Each file is loaded with ``librosa.load`` (default arguments), then
    zero-padded on both sides or centre-cropped to ``mean_signal_length``
    samples, and converted to MFCCs.

    Args:
        paths: Iterable of audio file paths. A single path string is also
            accepted and treated as a one-element list.
        mfcc_len: Number of MFCC coefficients to compute per frame.
        flatten: If True, each feature matrix is flattened to 1-D (row-major
            over the ``(frames, mfcc_len)`` matrix).

    Returns:
        A tuple ``(features, pad_cnt, non_pad_cnt)``: the list of per-file
        features (each ``(frames, mfcc_len)`` unless ``flatten`` is set), the
        number of clips that were padded, and the number that were cropped or
        already had the target length.
    """
    if isinstance(paths, str):
        # Without this a lone string would be iterated character by character.
        paths = [paths]

    features = []
    pad_cnt, non_pad_cnt = 0, 0

    for path in tqdm(paths, desc='get features.....'):
        signal, fs = librosa.load(path)
        signal, was_padded = _fit_to_length(signal, mean_signal_length)
        if was_padded:
            pad_cnt += 1
        else:
            non_pad_cnt += 1

        # librosa returns (n_mfcc, frames); transpose so time is the first axis.
        mfcc = librosa.feature.mfcc(y=signal, sr=fs, n_mfcc=mfcc_len).T
        if flatten:
            mfcc = mfcc.ravel()

        features.append(mfcc)

    return features, pad_cnt, non_pad_cnt

In [6]:
# Extract one MFCC matrix per utterance (slow: every wav file is loaded from disk).
features, pad_cnt, non_pad_cnt = get_feature(wav_paths)

get features.....: 5531it [05:13, 17.66it/s]


In [7]:
# Number of clips that were zero-padded vs. cropped (or already at the target length).
pad_cnt, non_pad_cnt

(3509, 2022)

In [7]:
# Shape of a single feature matrix: (frames, MFCC coefficients).
features[0].shape

(196, 39)

In [8]:
# Stack the per-utterance feature matrices into one array and turn the
# integer labels into a 1-D array of the same length.
X = np.array(features)
y = np.array(emotions)

In [9]:
# Report the dataset dimensions before persisting it.
print(X.shape, y.shape)

# Both arrays are written back to back into the same file, features first and
# labels second; reading them back takes two consecutive np.load calls on one
# open file handle.
with open('IEMOCAP.npy', 'wb') as f:
    for array in (X, y):
        np.save(f, array)

(5531, 196, 39) (5531,)


In [10]:
# Distinct integer labels that occur in the dataset.
set_emotions = set(emotions)
list_res = (list(set_emotions))

# Print one label per line.
for item in list_res:
    print(item)

0
1
2
3


In [11]:
from natsort import natsorted

# Natural-order sort of the label list; the labels are ints here, so the
# result is plain ascending order.
list_res = natsorted(list_res)
print(list_res)

[0, 1, 2, 3]


In [12]:
from collections import Counter

Counter(emotions).keys() # distinct labels in first-seen order; same elements as set(emotions)

dict_keys([1, 0, 2, 3])

In [13]:
Counter(emotions).values() # number of utterances per label, in the same order as .keys()

dict_values([1708, 1103, 1084, 1636])

In [14]:
# angry, excited, fear, sad, surprised, frustrated, happy, disappointed, neutral
# 1103, 