# Processing WAVs Into Large MFCC Data
Garrett Faucher

In [26]:
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from pydub import AudioSegment

%matplotlib inline

## First, trim audio to remove silence

In [27]:
def detect_leading_silence(sound, silence_threshold=-65.0, chunk_size=10):
    '''
    Return how many milliseconds of silence sit at the start of sound.

    sound is a pydub.AudioSegment (any object that supports len() and
        slicing into pieces exposing a dBFS attribute behaves the same)
    silence_threshold in dB; a chunk quieter than this counts as silent
    chunk_size in ms; step used while scanning

    iterate over chunks until you find the first one with sound. The result
    is a multiple of chunk_size, capped at len(sound) so a fully silent clip
    never reports more silence than the clip actually contains.
    '''
    assert chunk_size > 0 # to avoid infinite loop

    total_ms = len(sound)
    trim_ms = 0 # ms

    # Bounds check comes first so a past-the-end (empty) slice is never measured.
    while trim_ms < total_ms and sound[trim_ms:trim_ms+chunk_size].dBFS < silence_threshold:
        trim_ms += chunk_size

    # The last step may overshoot when the length is not a multiple of chunk_size.
    return min(trim_ms, total_ms)

In [28]:
def trim(file_path):
    '''
    Load the WAV at file_path and return it with the silence at both ends cut off.

    file_path is handed to AudioSegment.from_file with format="wav"
    returns the trimmed AudioSegment
    '''
    clip = AudioSegment.from_file(file_path, format="wav")

    # Trailing silence is measured as the leading silence of the reversed clip.
    lead_ms = detect_leading_silence(clip)
    tail_ms = detect_leading_silence(clip.reverse())

    return clip[lead_ms:len(clip)-tail_ms]

In [29]:
# Trimming the audio files so blank audio is removed.
# Reads voice_data/<actor>/<wav> and writes the trimmed clip to
# trimmed_voice_data/<actor>/<wav>.
voice_data_dir = "voice_data"
trimmed_data_dir = "trimmed_" + voice_data_dir
dir_actors = os.listdir(voice_data_dir)

for actor in dir_actors:
    actor_in_dir = os.path.join(voice_data_dir, actor)
    actor_out_dir = os.path.join(trimmed_data_dir, actor)
    # The output folder has to exist before a clip can be written into it.
    os.makedirs(actor_out_dir, exist_ok=True)

    wavs = os.listdir(actor_in_dir)
    for wav in wavs:
        in_dir = os.path.join(actor_in_dir, wav)
        out_dir = os.path.join(actor_out_dir, wav)
        audio = trim(in_dir)
        audio.export(out_dir, format='wav')

## Next, pad the dataset to have audio clips of the same length for all samples

In [22]:
# Padding the audio files so shape is the same for all audio entries.
# Reads the clips written by the trimming step (trimmed_voice_data/<actor>/<wav>)
# so the silence removal actually feeds into the padded dataset, and writes
# the fixed-length result to uniform_voice_data/<actor>/<wav>.
voice_data_dir = "trimmed_voice_data"
uniform_data_dir = "uniform_voice_data"
clip_duration_ms = 4395  # length of the silent base every clip is laid onto
dir_actors = os.listdir(voice_data_dir)

for actor in dir_actors:
    actor_in_dir = os.path.join(voice_data_dir, actor)
    actor_out_dir = os.path.join(uniform_data_dir, actor)
    # The output folder has to exist before a clip can be written into it.
    os.makedirs(actor_out_dir, exist_ok=True)

    wavs = os.listdir(actor_in_dir)
    for wav in wavs:
        in_dir = os.path.join(actor_in_dir, wav)
        out_dir = os.path.join(actor_out_dir, wav)
        # Start from silence of the target length and lay the clip on top of it.
        # NOTE(review): overlay is expected to keep the base length, which would
        # cut off clips longer than clip_duration_ms -- confirm none are longer.
        audio = AudioSegment.silent(duration=clip_duration_ms)
        audio = audio.overlay(AudioSegment.from_wav(in_dir))
        audio.export(out_dir, format='wav')

## Lastly, generate MFCCs from padded audio clips and store MFCC data in NumPy array

In [32]:
voice_data_dir = "uniform_voice_data"
dir_actors = os.listdir(voice_data_dir)

# Gather one flattened MFCC matrix per clip and join them once at the end;
# appending to a NumPy array inside the loop re-copies everything collected
# so far on every iteration.
# NOTE: clips are visited in os.listdir order, the same traversal the label
# cell uses, so rows of the two arrays line up.
mfcc_chunks = []

for actor in dir_actors:
    actor_dir = os.path.join(voice_data_dir, actor)
    wavs = os.listdir(actor_dir)
    for wav in wavs:
        wav_dir = os.path.join(actor_dir, wav)
        y, sr = librosa.load(wav_dir, sr=176400)
        # mfcc is 2-D: len(mfcc) coefficients by mfcc[0].size frames.
        # It stays defined after the loop because the reshape cell reads its shape.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=30)
        # Row-major flatten: all frames of coefficient 0, then coefficient 1, ...
        mfcc_chunks.append(np.ravel(mfcc))

# Flat float64 vector ordered by actor, clip, coefficient, frame
# (empty array when no clips were found).
x = np.concatenate(mfcc_chunks).astype(np.float64) if mfcc_chunks else np.array([])

## Reorganize array to be split by actor, audio clip, and MFCC dimensions
Expected # of data points: (# actors) \* (# clips) \* (mfcc shape x) \* (mfcc shape y) = 24 \* 60 \* 1515 \* 30 = 65448000

In [33]:
num_actors = 24
num_wav = 60
num_mfcc_x = mfcc[0].size  # length of one row of a single clip's MFCC matrix (frames)
num_mfcc_y = len(mfcc)     # number of rows of a single clip's MFCC matrix (coefficients)

# Each clip was flattened row-major from a (num_mfcc_y, num_mfcc_x) matrix, so it
# must be unflattened with that same axis order. Reshaping straight to
# (num_mfcc_x, num_mfcc_y) keeps the shape but interleaves coefficients and
# frames. The last two axes are swapped afterwards so the final layout is still
# (actor, clip, num_mfcc_x, num_mfcc_y).
if x.ndim == 1:  # only unflatten once, so re-running this cell is harmless
    x = x.reshape(num_actors, num_wav, num_mfcc_y, num_mfcc_x).transpose(0, 1, 3, 2)
x.shape

(24, 60, 1515, 30)

In [34]:
# Write the MFCC array to disk in NumPy's .npy format
np.save(file="audio_data_long", arr=x)

## Generate NumPy array of labels to pair with data

In [6]:
voice_data_dir = "uniform_voice_data"
dir_actors = os.listdir(voice_data_dir)

# Each file name starts with seven two-digit integer fields at character
# offsets 0, 3, 6, ..., 18 (one separator character between fields).
# Clips are visited in os.listdir order, the same traversal the MFCC cell uses,
# so row k here describes clip k there.
label_rows = []

for actor in dir_actors:
    wavs = os.listdir(os.path.join(voice_data_dir, actor))
    for wav in wavs:
        current_labels = [int(wav[start:start + 2]) for start in range(0, 21, 3)]
        label_rows.append(current_labels)

# One row of seven labels per clip. The row count comes from the files actually
# found rather than from variables defined in another cell; float64 keeps the
# dtype of the saved file unchanged.
wav_labels = np.array(label_rows, dtype=np.float64).reshape(-1, 7)

In [7]:
# Write the label matrix to disk in NumPy's .npy format
np.save(file="wav_labels", arr=wav_labels)