In [1]:
import os
import sys
import random

import shutil
from pathlib import Path
from math import ceil
from pydub import AudioSegment

import numpy as np
import tensorflow as tf
from tensorflow.python.keras import backend as K
from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
import matplotlib.pyplot as plt
from torch.utils.data import Dataset


Using TensorFlow backend.


In [2]:
# Root directory holding the source audio; the derived `*_split`, `*_mfcc`
# and `*_features` directories used by the cells below live under it too.
AUDIO_PATH = 'revised_audio'
# Dataset sub-directory of AUDIO_PATH that the cells below process.
SOURCE_DIR = 'accents' # alternative dataset: 'sherlock_holmes'
# When True, progress is printed for every speaker rather than every 50th.
VERBOSE = True

### First, trim long audio recordings into short fixed-length clips (1.5 seconds each in the run below)

In [3]:
def match_target_amplitude(sound, target_dBFS):
    """Apply gain to ``sound`` so that its loudness becomes ``target_dBFS``.

    Args:
        sound: object exposing a ``dBFS`` attribute and an ``apply_gain(db)``
            method (a pydub ``AudioSegment`` in this notebook).
        target_dBFS: desired loudness, in dBFS.

    Returns:
        The result of ``sound.apply_gain(target_dBFS - sound.dBFS)``, or
        ``sound`` itself, unchanged, when the clip is completely silent.
    """
    current_dBFS = sound.dBFS
    # A silent clip reports -inf dBFS, which would make the required gain
    # +inf; no finite gain can normalise silence, so leave it untouched.
    if current_dBFS == float('-inf'):
        return sound
    return sound.apply_gain(target_dBFS - current_dBFS)


def split_audio(input_path: str, length=1, filetype='.wav'):
    """Split every recording in ``input_path`` into consecutive fixed-length clips.

    Each clip is passed through ``match_target_amplitude(clip, -20.0)`` and
    exported as ``{AUDIO_PATH}/{dataset}_split/{recording}/{index}.wav``,
    where ``dataset`` is the last component of ``input_path``. Any existing
    ``{dataset}_split`` directory is deleted first. The whole recording is
    covered, so the final clip may be shorter than ``length``.

    Args:
        input_path: '/'-separated directory containing the source recordings.
        length: clip duration in seconds (may be fractional, e.g. ``1.5``).
        filetype: extension removed from each file name when building the
            per-recording output directory name ('.mp3' and '.wav' are
            always removed as well).

    Raises:
        ValueError: if ``length`` is not positive. Raised before anything on
            disk is touched.
    """
    if length <= 0:
        raise ValueError(f'length must be positive, got {length!r}')
    clip_ms = length * 1_000

    # Start from a clean output directory so clips from an earlier run never
    # mix with the new ones. rstrip() keeps a trailing '/' on input_path from
    # yielding an empty dataset name.
    cleaned_name = input_path.rstrip('/').split('/')[-1]
    split_path = f'{AUDIO_PATH}/{cleaned_name}_split'
    shutil.rmtree(split_path, ignore_errors=True)
    Path(split_path).mkdir(parents=True, exist_ok=True)

    inputs = [f for f in os.listdir(input_path) if f != '.DS_Store']

    for i, input_name in enumerate(inputs):
        path = f'{input_path}/{input_name}'
        name = input_name.replace(filetype, '')

        try:
            audio = AudioSegment.from_file(path)
        except Exception as e:
            # Best effort: report the file that could not be loaded (and
            # why), then carry on with the remaining recordings.
            print(f'WARN: Unable to process {input_name}/{name}: {e}')
            continue

        # len() of the loaded audio is measured in ms
        total_ms = len(audio)

        # Walk the recording in clip_ms steps. Iterating over all
        # `num_clips` indices (not `num_clips - 1`) keeps the tail of the
        # recording instead of silently dropping the last clip.
        num_clips = ceil(total_ms / clip_ms)
        trimmed_clips = []
        for clip_num in range(num_clips):
            start = clip_num * clip_ms
            if start >= total_ms:
                # Guards against float rounding pushing ceil() one too far.
                break
            end = min(start + clip_ms, total_ms)
            trimmed_clips.append(audio[start:end])

        # Strip audio extensions from the recording name only, so that a
        # '.wav'/'.mp3' substring elsewhere in the output path is preserved.
        speaker_name = name.replace('.mp3', '').replace('.wav', '')
        clean_path = f'{split_path}/{speaker_name}'
        Path(clean_path).mkdir(parents=True, exist_ok=True)

        for clip_num, clip in enumerate(trimmed_clips):
            clip = match_target_amplitude(clip, -20.0)
            clip.export(f'{clean_path}/{clip_num}.wav', format='wav')

        # reduce verbosity
        if i % 50 == 0 or VERBOSE:
            print(f'INFO: Completed clip generation for speaker {input_name} [{i + 1}/{len(inputs)}]')

In [4]:
# Split every recording under AUDIO_PATH/SOURCE_DIR using 1.5-second clips.
split_audio(f'{AUDIO_PATH}/{SOURCE_DIR}', length=1.5)

1.mp3 [1851/2138]
INFO: Completed clip generation for speaker bosnian8.mp3 [1852/2138]
INFO: Completed clip generation for speaker english175.mp3 [1853/2138]
INFO: Completed clip generation for speaker romanian10.mp3 [1854/2138]
INFO: Completed clip generation for speaker french48.mp3 [1855/2138]
INFO: Completed clip generation for speaker hindi11.mp3 [1856/2138]
INFO: Completed clip generation for speaker serbian1.mp3 [1857/2138]
INFO: Completed clip generation for speaker polish23.mp3 [1858/2138]
INFO: Completed clip generation for speaker english149.mp3 [1859/2138]
INFO: Completed clip generation for speaker bulgarian5.mp3 [1860/2138]
INFO: Completed clip generation for speaker urdu4.mp3 [1861/2138]
INFO: Completed clip generation for speaker spanish116.mp3 [1862/2138]
INFO: Completed clip generation for speaker gujarati3.mp3 [1863/2138]
INFO: Completed clip generation for speaker english388.mp3 [1864/2138]
INFO: Completed clip generation for speaker pashto3.mp3 [1865/2138]
INFO: Co

### Generate MFCCs from each audio clip, check to see if any data is lost

In [7]:
def generate_mfcc_for_dir(split_path):
    """Compute and save an MFCC array for every clip under ``split_path``.

    ``split_path`` is expected to contain one sub-directory per speaker, each
    holding ``.wav`` clips. Results are written as ``.npy`` files to a
    sibling tree whose path is ``split_path`` with its last ``'split'``
    replaced by ``'mfcc'``. That output tree is deleted and recreated on
    every call.

    Args:
        split_path: '/'-separated directory of per-speaker clip folders; must
            contain the substring ``'split'``.

    Raises:
        ValueError: if ``split_path`` does not contain ``'split'``. Without
            this check the output path would equal the input path and the
            cleanup below would delete the source clips.
    """
    # Swap only the final 'split' so parent directories that happen to
    # contain the same substring are left alone.
    head, marker, tail = split_path.rpartition('split')
    if not marker:
        raise ValueError(
            f"split_path must contain 'split' to derive the MFCC directory, got {split_path!r}"
        )
    mfcc_path = f'{head}mfcc{tail}'

    shutil.rmtree(mfcc_path, ignore_errors=True)
    Path(mfcc_path).mkdir(parents=True, exist_ok=True)

    speakers = [s for s in os.listdir(split_path) if s != '.DS_Store']

    for i, speaker in enumerate(speakers):
        speaker_path = f'{split_path}/{speaker}'
        speaker_mfcc_path = f'{mfcc_path}/{speaker}'
        Path(speaker_mfcc_path).mkdir(parents=True, exist_ok=True)
        clips = [c for c in os.listdir(speaker_path) if c != '.DS_Store']
        for clip in clips:
            # dimensions of full_mfcc are ~(120, 64), so we are not losing any data when padding
            # NOTE(review): sample_from_mfcc presumably pads/crops to
            # NUM_FRAMES frames - confirm in batcher.
            try:
                full_mfcc = read_mfcc(f'{speaker_path}/{clip}', SAMPLE_RATE)
                sampled_mfcc = sample_from_mfcc(full_mfcc, NUM_FRAMES)

                clip_name = clip.replace('.wav', '.npy')
                np.save(f'{speaker_mfcc_path}/{clip_name}', sampled_mfcc)
            except Exception as e:
                # Best effort: one bad clip must not abort the whole run,
                # but report the cause so it can be investigated.
                print(f'WARN: Error generating MFCC for speaker {speaker} clip {clip}: {e}')

        if i % 50 == 0 or VERBOSE:
            print(f'INFO: Completed MFCC generation for speaker {speaker} [{i + 1}/{len(speakers)}]')

In [8]:
# Generate MFCC arrays for the clips written to AUDIO_PATH/SOURCE_DIR_split.
generate_mfcc_for_dir(f'{AUDIO_PATH}/{SOURCE_DIR}_split')

ted MFCC generation for speaker english308 [1834/2138]
INFO: Completed MFCC generation for speaker mandarin18 [1835/2138]
INFO: Completed MFCC generation for speaker xiang3 [1836/2138]
INFO: Completed MFCC generation for speaker sylheti1 [1837/2138]
INFO: Completed MFCC generation for speaker portuguese38 [1838/2138]
INFO: Completed MFCC generation for speaker spanish20 [1839/2138]
INFO: Completed MFCC generation for speaker arabic50 [1840/2138]
INFO: Completed MFCC generation for speaker english561 [1841/2138]
INFO: Completed MFCC generation for speaker mandarin27 [1842/2138]
INFO: Completed MFCC generation for speaker english105 [1843/2138]
INFO: Completed MFCC generation for speaker dutch3 [1844/2138]
INFO: Completed MFCC generation for speaker english337 [1845/2138]
INFO: Completed MFCC generation for speaker english54 [1846/2138]
INFO: Completed MFCC generation for speaker mandarin9 [1847/2138]
INFO: Completed MFCC generation for speaker fanti3 [1848/2138]
INFO: Completed MFCC gen

### Import DeepSpeaker model

In [9]:
# Build the DeepSpeaker network and load weights from the checkpoint file.
# NOTE(review): `model.m` is presumably the underlying Keras model, in which
# case by_name=True matches weights to layers by layer name - confirm in conv_models.
model = DeepSpeakerModel()
model.m.load_weights('ResCNN_triplet_training_checkpoint_265.h5', by_name=True)

In [10]:
# Display whether TensorFlow is running in eager mode.
tf.executing_eagerly()

True

In [13]:
def generate_features(mfcc_path):
    """Run the notebook-level ``model`` on every saved MFCC and store the output.

    ``mfcc_path`` is expected to contain one sub-directory per speaker, each
    holding ``.npy`` MFCC arrays. Each prediction is written under the same
    file name to a sibling tree whose path is ``mfcc_path`` with its last
    ``'mfcc'`` replaced by ``'features'``. Existing feature files are kept,
    and MFCCs that already have one are skipped, so an interrupted run can be
    resumed.

    Args:
        mfcc_path: '/'-separated directory of per-speaker MFCC folders; must
            contain the substring ``'mfcc'``.

    Raises:
        ValueError: if ``mfcc_path`` does not contain ``'mfcc'``. Without
            this check the feature directory would equal the MFCC directory,
            every file would look already processed, and nothing would be
            generated.
    """
    # Swap only the final 'mfcc' so parent directories that happen to
    # contain the same substring are left alone.
    head, marker, tail = mfcc_path.rpartition('mfcc')
    if not marker:
        raise ValueError(
            f"mfcc_path must contain 'mfcc' to derive the features directory, got {mfcc_path!r}"
        )
    feature_path = f'{head}features{tail}'
    Path(feature_path).mkdir(parents=True, exist_ok=True)

    speakers = [s for s in os.listdir(mfcc_path) if s != '.DS_Store']

    for i, speaker in enumerate(speakers):
        speaker_path = f'{mfcc_path}/{speaker}'
        speaker_feature_path = f'{feature_path}/{speaker}'
        Path(speaker_feature_path).mkdir(parents=True, exist_ok=True)
        mfcc_files = [m for m in os.listdir(speaker_path) if m != '.DS_Store']

        # there may already be some features generated. make sure we don't
        # redo work! (a set keeps the membership test below cheap)
        current_features = set(os.listdir(speaker_feature_path))
        if len(current_features) > 0:
            print(f'INFO: Speaker {speaker} already has {len(current_features)} features. Continuing...')

        mfccs_to_predict = sorted(
            mfcc_file for mfcc_file in mfcc_files if mfcc_file not in current_features
        )

        for mfcc_file in mfccs_to_predict:
            mfcc = np.load(f'{speaker_path}/{mfcc_file}')

            # generate prediction from this mfcc (TODO: Data augmentation step)
            # expand_dims adds the leading batch axis for a single-sample batch.
            features = model.m.predict(np.expand_dims(mfcc, axis=0))

            # The prediction carries that leading batch axis (the original
            # author observed shape (1, 512)); keep only the single row.
            features = features[0]

            # we are saving another npy file, so we can reuse the same name because
            # we're saving to a different directory
            np.save(f'{speaker_feature_path}/{mfcc_file}', features)

        if i % 50 == 0 or VERBOSE:
            print(f'INFO: Completed feature generation for speaker {speaker} [{i + 1}/{len(speakers)}]')

In [14]:
# Generate model features for the MFCC arrays in AUDIO_PATH/SOURCE_DIR_mfcc.
generate_features(f'{AUDIO_PATH}/{SOURCE_DIR}_mfcc')

ation for speaker mandarin9 [1847/2138]
INFO: Completed feature generation for speaker fanti3 [1848/2138]
INFO: Completed feature generation for speaker bengali4 [1849/2138]
INFO: Completed feature generation for speaker hebrew2 [1850/2138]
INFO: Completed feature generation for speaker english568 [1851/2138]
INFO: Completed feature generation for speaker arabic92 [1852/2138]
INFO: Completed feature generation for speaker slovak2 [1853/2138]
INFO: Completed feature generation for speaker gujarati9 [1854/2138]
INFO: Completed feature generation for speaker thai14 [1855/2138]
INFO: Completed feature generation for speaker tagalog5 [1856/2138]
INFO: Completed feature generation for speaker luo1 [1857/2138]
INFO: Completed feature generation for speaker spanish16 [1858/2138]
INFO: Completed feature generation for speaker arabic66 [1859/2138]
INFO: Completed feature generation for speaker english133 [1860/2138]
INFO: Completed feature generation for speaker serbian13 [1861/2138]
INFO: Compl