In [1]:
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from pydub import AudioSegment

%matplotlib inline

## First, pad the dataset so that every audio clip has the same length

In [5]:
# Baseline: inspect the MFCC matrix of one raw clip before any padding is applied.

audio_file_name = 'voice_data/Actor_01/03-01-01-01-01-01-01.wav'
# Load the clip at a fixed 44.1 kHz sample rate, then compute its MFCCs
# with librosa's default settings.
y, sr = librosa.load(path=audio_file_name, sr=44100)
mfcc = librosa.feature.mfcc(sr=sr, y=y)
np.shape(mfcc)

(20, 285)

In [6]:
# Pad every clip to one common duration so that all MFCC matrices end up with
# the same shape. Each padded copy is written to
# "uniform_<voice_data_dir>/<actor>/<clip>", mirroring the input layout.

voice_data_dir = "voice_data"
padded_duration_ms = 5500  # target length of every exported clip, in milliseconds
dir_actors = os.listdir(voice_data_dir)

for actor in dir_actors:
    actor_dir = os.path.join(voice_data_dir, actor)
    if not os.path.isdir(actor_dir):
        continue  # ignore stray files (e.g. .DS_Store) next to the actor folders

    # export() does not create missing folders, so make the output folder first.
    out_actor_dir = os.path.join("uniform_" + voice_data_dir, actor)
    os.makedirs(out_actor_dir, exist_ok=True)

    for wav in os.listdir(actor_dir):
        if not wav.lower().endswith(".wav"):
            continue  # only wav clips can be read by from_wav()

        in_path = os.path.join(actor_dir, wav)
        out_path = os.path.join(out_actor_dir, wav)

        # overlay() returns a segment as long as the one it is called on, so the
        # result is always padded_duration_ms long: shorter clips are followed by
        # silence, and anything beyond that length is cut off.
        audio = AudioSegment.silent(duration=padded_duration_ms)
        audio = audio.overlay(AudioSegment.from_wav(in_path))
        audio.export(out_path, format='wav')

In [7]:
# Sanity check: the same clip as before, now read from the padded copy, to see
# the MFCC matrix shape after padding.

audio_file_name = 'uniform_voice_data/Actor_01/03-01-01-01-01-01-01.wav'
# Same loading and MFCC settings as the pre-padding check, so the shapes are comparable.
y, sr = librosa.load(path=audio_file_name, sr=44100)
mfcc = librosa.feature.mfcc(sr=sr, y=y)
np.shape(mfcc)

(20, 474)

## Next, generate MFCCs from the padded audio clips and store them in a NumPy array

In [8]:
# Compute the MFCC matrix of every padded clip and gather all values into one
# flat float64 array `x`, clip after clip in directory-listing order.

voice_data_dir = "uniform_voice_data"
dir_actors = os.listdir(voice_data_dir)

# Collect the per-clip arrays in a list and concatenate once at the end:
# calling np.append inside the loop copies the whole accumulated array on
# every iteration, which is quadratic in the number of clips.
mfcc_chunks = []
for actor in dir_actors:
    actor_dir = os.path.join(voice_data_dir, actor)
    if not os.path.isdir(actor_dir):
        continue  # ignore stray files next to the actor folders
    for wav in os.listdir(actor_dir):
        if not wav.lower().endswith(".wav"):
            continue  # skip anything that is not a wav clip
        wav_dir = os.path.join(actor_dir, wav)
        y, sr = librosa.load(wav_dir, sr=44100)
        # `mfcc` stays bound to the last clip's matrix after the loop; the
        # reshape step reads its dimensions.
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        mfcc_chunks.append(mfcc.ravel())  # row-major flatten, same as np.append

# The leading empty float64 array keeps the result float64 (as np.append onto
# np.array([]) did) and makes the call valid even when no clip was found.
x = np.concatenate([np.array([])] + mfcc_chunks)
x.shape

(13651200,)

## Reshape the array to the proper dimensions
Expected number of data points: (# actors) \* (# clips per actor) \* (# MFCC coefficients) \* (# MFCC frames) = 24 \* 60 \* 20 \* 474 = 13651200

In [14]:
# Turn the flat MFCC array into a 4-D array laid out as
# (actor, clip, frame, coefficient).
# NOTE(review): this assumes clips are stored actor by actor with exactly
# num_wav clips each - confirm against the directory listing order.

num_actors = 24
num_wav = 60
num_mfcc_x = mfcc[0].size  # length of one MFCC row: frames per clip
num_mfcc_y = len(mfcc)     # number of MFCC rows: coefficients per frame

# Each clip was flattened row by row from a (num_mfcc_y, num_mfcc_x) matrix, so
# the flat data has to be viewed in that same order first. Reshaping straight to
# (..., num_mfcc_x, num_mfcc_y) would not transpose the matrices; it would mix
# values from different coefficients and frames. Swapping the last two axes
# afterwards gives the intended (..., num_mfcc_x, num_mfcc_y) layout.
if x.ndim == 1:  # only the flat array needs converting; a re-run must not transpose again
    x = x.reshape(num_actors, num_wav, num_mfcc_y, num_mfcc_x).swapaxes(-1, -2)
x.shape

(24, 60, 474, 20)

## Export array to a NumPy `.npy` file

In [13]:
# Persist the feature array in NumPy's binary format; np.save appends the
# ".npy" extension to the given name.
np.save(file="audio_data", arr=x)