# Processing WAVs Into MFCC Data
Garrett Faucher

In [1]:
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from pydub import AudioSegment

%matplotlib inline

## First, pad the dataset to have audio clips of the same length for all samples

In [2]:
# Pad every clip onto a fixed-length silent track so all audio entries share one shape.
# Each padded clip is written to "uniform_voice_data/<actor>/<wav>", mirroring the input tree.
voice_data_dir = "voice_data"
uniform_data_dir = "uniform_" + voice_data_dir
PAD_DURATION_MS = 5500  # target length of every exported clip, in milliseconds

dir_actors = os.listdir(voice_data_dir)

# The silent base is identical for every clip; overlay() returns a new segment,
# so a single base can safely be reused across the whole loop.
silent_base = AudioSegment.silent(duration=PAD_DURATION_MS)

for actor in dir_actors:
    actor_in_dir = os.path.join(voice_data_dir, actor)
    actor_out_dir = os.path.join(uniform_data_dir, actor)
    # Create the output folder up front; exporting into a missing directory fails.
    os.makedirs(actor_out_dir, exist_ok=True)

    for wav_name in os.listdir(actor_in_dir):
        in_dir = os.path.join(actor_in_dir, wav_name)
        out_dir = os.path.join(actor_out_dir, wav_name)
        # NOTE(review): per the pydub docs the overlay result keeps the length of the
        # base segment, so clips longer than PAD_DURATION_MS are cut off — confirm
        # that no source clip exceeds 5.5 s.
        audio = silent_base.overlay(AudioSegment.from_wav(in_dir))
        # export() hands back the open output file; close it explicitly.
        audio.export(out_dir, format='wav').close()

## Next, generate MFCCs from padded audio clips and store MFCC data in NumPy array

In [8]:
# Compute the MFCC matrix of every padded clip and store all values in one flat array.
# Clips are visited in os.listdir order (actor folders, then files inside each folder);
# the label cell further down walks the same tree the same way, so rows stay aligned.
voice_data_dir = "uniform_voice_data"
dir_actors = os.listdir(voice_data_dir)

# Collect the per-clip arrays in a list and join them once at the end: calling
# np.append inside the loop re-copies the entire accumulated array for every clip.
mfcc_chunks = []
for actor in dir_actors:
    actor_dir = os.path.join(voice_data_dir, actor)
    for wav_name in os.listdir(actor_dir):
        wav_dir = os.path.join(actor_dir, wav_name)
        y, sr = librosa.load(wav_dir, sr=44100)
        # `mfcc` is deliberately left in scope after the loop: the reshape cell
        # reads the per-clip dimensions from the last matrix computed here.
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        # Row-major flattening, the same order np.append used.
        mfcc_chunks.append(mfcc.ravel())

# Leading with an empty float64 array keeps the result 1-D float64 (as np.append
# produced) and still yields an empty array when no clips were found.
x = np.concatenate([np.array([])] + mfcc_chunks)
x.shape

(13651200,)

## Reorganize array to be split by actor, audio clip, and MFCC dimensions
Expected # of data points: (# actors) \* (# clips) \* (mfcc shape x) \* (mfcc shape y) = 24 \* 60 \* 20 \* 474 = 13651200

In [9]:
# Split the flat MFCC array into (actor, clip, frame, coefficient).
num_actors = 24
num_wav = 60
num_mfcc_x = mfcc[0].size  # length of one MFCC row, i.e. values per coefficient (474 here)
num_mfcc_y = len(mfcc)     # number of MFCC rows, i.e. coefficients per clip (20 here)

# Each clip was flattened row by row from a (num_mfcc_y, num_mfcc_x) matrix, so the flat
# buffer has to be unpacked in that same order first. Reshaping straight to
# (..., num_mfcc_x, num_mfcc_y) would keep the element count but mix values from
# different coefficients and frames. Swapping the last two axes afterwards gives the
# same final shape with every value in the correct position.
# The ndim guard keeps the cell safe to re-run once `x` has already been reorganized.
if x.ndim == 1:
    x = x.reshape(num_actors, num_wav, num_mfcc_y, num_mfcc_x)
    x = np.ascontiguousarray(np.swapaxes(x, 2, 3))
x.shape

(24, 60, 474, 20)

In [10]:
# Write the MFCC array to disk in NumPy's .npy format
np.save(file="audio_data", arr=x)

## Generate NumPy array of labels to pair with data

In [11]:
# Build one row of seven numeric labels per clip, parsed from the clip's filename.
# Clips are visited in os.listdir order (actor folders, then files inside each folder),
# the same traversal the MFCC cell uses, so label rows line up with the audio data.
voice_data_dir = "uniform_voice_data"
dir_actors = os.listdir(voice_data_dir)

# Gather rows in a list and convert once, instead of re-copying the whole array
# with np.append for every clip.
label_rows = []
for actor in dir_actors:
    for wav_name in os.listdir(os.path.join(voice_data_dir, actor)):
        # The seven fields are two characters wide and start at offsets 0, 3, ..., 18
        # of the filename (one separator character between consecutive fields).
        label_rows.append([int(wav_name[k:k + 2]) for k in range(0, 21, 3)])

# float64 matches the dtype this cell has always exported. The explicit row count
# doubles as a check that exactly num_actors * num_wav clips were found.
wav_labels = np.array(label_rows, dtype=np.float64).reshape(num_actors * num_wav, 7)
wav_labels.shape

(1440, 7)

In [12]:
# Write the label array to disk in NumPy's .npy format
np.save(file="wav_labels", arr=wav_labels)