In [3]:
import os
from pathlib import Path
from math import ceil
from pydub import AudioSegment

import os
import sys
import numpy as np
import random
from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity
import tensorflow as tf
import logging
import matplotlib.pyplot as plt
from tabulate import tabulate
from torch.utils.data import Dataset

Using TensorFlow backend.


In [4]:
# Build the speaker-embedding network and load its weights from the checkpoint
# file. The resulting module-level `model` is used by the embedding functions
# defined later in this notebook.
# NOTE(review): by_name=True presumably matches weights to layers by layer name
# (Keras-style load_weights) -- confirm against the installed Keras version.
model = DeepSpeakerModel()
model.m.load_weights('ResCNN_triplet_training_checkpoint_265.h5', by_name=True)

In [5]:
def match_target_amplitude(sound, target_dBFS):
    """Apply the gain needed to bring ``sound`` to ``target_dBFS``.

    Args:
        sound: object exposing a ``dBFS`` attribute and an ``apply_gain(gain)``
            method (a pydub ``AudioSegment`` in this notebook).
        target_dBFS: desired loudness in dBFS.

    Returns:
        The result of ``sound.apply_gain(target_dBFS - sound.dBFS)``, or
        ``sound`` itself, unchanged, when the sound is completely silent.
    """
    current_dBFS = sound.dBFS

    # A silent segment reports a loudness of -inf (pydub behaviour); the
    # required gain would then be +inf, which cannot be applied meaningfully.
    if current_dBFS == float('-inf'):
        return sound

    return sound.apply_gain(target_dBFS - current_dBFS)


def split_audio(input_path: str, length=10, filetype='.wav'):
    """Split every audio file in a directory into fixed-length clips.

    Each file in ``input_path`` whose name ends with ``filetype`` is cut into
    consecutive clips of ``length`` seconds. Every clip is passed through
    ``match_target_amplitude(clip, -20.0)`` and exported as WAV to::

        audio/split/<dir>/<name>/<dir>_<n>.wav

    where ``<dir>`` is ``input_path`` with ``'audio/original/'`` removed,
    ``<name>`` is the source file name without its extension, and ``<n>``
    counts clips from 0.

    e.g. audio/original/holmes/ethan.wav becomes
    audio/split/holmes/ethan/holmes_0.wav, audio/split/holmes/ethan/holmes_1.wav, ...

    A trailing remainder shorter than ``length`` seconds is discarded, so every
    exported clip has the same duration.

    Args:
        input_path: directory containing the source recordings.
        length: clip duration in seconds; must be positive.
        filetype: file extension (with leading dot) selecting which files to
            split; also used, without the dot, as the decoder format.

    Raises:
        ValueError: if ``length`` does not yield a positive clip duration.
    """
    clip_ms = int(length * 1_000)  # pydub segments are indexed in milliseconds
    if clip_ms <= 0:
        raise ValueError(f'length must be positive, got {length!r}')

    audio_format = filetype.lstrip('.')

    # Match on the real extension rather than any substring of the file name.
    source_files = [f for f in os.listdir(input_path) if f.endswith(filetype)]

    # Loop-invariant: the directory name used for the output folder and clip names.
    cleaned_name = input_path.replace('audio/original/', '').rstrip('/')

    for file_name in source_files:
        # from_file with an explicit format covers wav and mp3 (what from_wav /
        # from_mp3 do) as well as any other format the decoder supports.
        audio = AudioSegment.from_file(f'{input_path}/{file_name}', format=audio_format)
        name = os.path.splitext(file_name)[0]

        output_dir = f'audio/split/{cleaned_name}/{name}'
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        # Number of complete clips; the final full-length clip is included even
        # when the recording length is an exact multiple of the clip length.
        num_clips = len(audio) // clip_ms

        for clip_index in range(num_clips):
            start_ms = clip_index * clip_ms
            clip = match_target_amplitude(audio[start_ms:start_ms + clip_ms], -20.0)
            clip.export(f'{output_dir}/{cleaned_name}_{clip_index}.wav', format='wav')

In [6]:
# Preprocessing: split source recordings into 10-second clips.
# The SherlockHolmes and BackgroundNoise calls are commented out here; only
# the Accents directory (mp3 input) is split when this cell runs.
#split_audio('audio/original/SherlockHolmes', length=10)
#split_audio('audio/original/BackgroundNoise', length=10)
split_audio('audio/original/Accents', length=10, filetype='.mp3')

KeyboardInterrupt: 

In [14]:
def single_audio_to_mfcc(path):
    """Run one audio file through the loaded model and save the prediction.

    The file is read with ``read_mfcc`` at ``SAMPLE_RATE``, sampled to
    ``NUM_FRAMES`` frames with ``sample_from_mfcc``, given a leading batch
    axis, and passed to ``model.m.predict``. The prediction is written with
    ``np.save`` next to the input, using the same file name with a ``.npy``
    extension. The sampled MFCC shape is printed.

    Args:
        path: path to the audio file.
    """
    mfcc = sample_from_mfcc(read_mfcc(path, SAMPLE_RATE), NUM_FRAMES)
    print(mfcc.shape)
    predict = model.m.predict(np.expand_dims(mfcc, axis=0))

    # Swap only the extension; a plain str.replace('wav', 'npy') would also
    # rewrite "wav" inside directory or file names.
    new_clip_name = os.path.splitext(path)[0] + '.npy'
    np.save(new_clip_name, predict)

In [15]:
# Relative to the working directory, like every other audio path in this
# notebook, instead of an absolute path on one user's machine.
single_audio_to_mfcc('audio/split/SherlockHolmes/cheryl/noisy_-20_8.wav')

(160, 64, 1)


In [20]:
def cheat_create_and_store_mfcc(dir):
    """Embed every ``.mp3`` file directly inside ``dir`` and save the results.

    For each mp3 file the MFCC is read with ``read_mfcc``, sampled to
    ``NUM_FRAMES`` frames, passed to ``model.m.predict`` with a leading batch
    axis, and the prediction is saved as ``<name>.npy`` in the output
    directory. A progress line is printed per processed file; the counter is
    the file's position among all directory entries, not only mp3 files.

    Args:
        dir: flat directory of mp3 clips. The output directory is ``dir`` with
            every occurrence of ``'audio'`` replaced by ``'mfcc'``.
    """
    mfcc_dir = dir.replace('audio', 'mfcc')
    Path(mfcc_dir).mkdir(parents=True, exist_ok=True)

    files = os.listdir(dir)

    for i, audio_file in enumerate(files):
        # Test the real extension; a substring check would also accept names
        # that merely contain "mp3", and str.replace would mangle them.
        stem, extension = os.path.splitext(audio_file)
        if extension != '.mp3':
            continue

        mfcc = sample_from_mfcc(read_mfcc(f'{dir}/{audio_file}', SAMPLE_RATE), NUM_FRAMES)
        predict = model.m.predict(np.expand_dims(mfcc, axis=0))
        np.save(f'{mfcc_dir}/{stem}.npy', predict)

        print(f'[{i + 1}/{len(files)}] - {audio_file}')

# Embed the mp3 files that sit directly inside audio/split/Accents.
cheat_create_and_store_mfcc('audio/split/Accents')

man22.mp3
[1419/2139] - nepali5.mp3
[1420/2139] - uzbek2.mp3
[1421/2139] - russian15.mp3
[1422/2139] - armenian3.mp3
[1423/2139] - russian29.mp3
[1424/2139] - english96.mp3
[1425/2139] - spanish2.mp3
[1426/2139] - turkish5.mp3
[1427/2139] - english82.mp3
[1428/2139] - kazakh1.mp3
[1429/2139] - igbo1.mp3
[1430/2139] - portuguese3.mp3
[1431/2139] - english55.mp3
[1432/2139] - norwegian3.mp3
[1433/2139] - english41.mp3
[1434/2139] - english69.mp3
[1435/2139] - vietnamese6.mp3
[1436/2139] - oromo3.mp3
[1437/2139] - khmer1.mp3
[1438/2139] - english264.mp3
[1439/2139] - japanese2.mp3
[1440/2139] - kurdish6.mp3
[1441/2139] - english502.mp3
[1442/2139] - dutch46.mp3
[1443/2139] - english516.mp3
[1444/2139] - english270.mp3
[1445/2139] - korean38.mp3
[1446/2139] - english258.mp3
[1447/2139] - korean10.mp3
[1448/2139] - mongolian3.mp3
[1449/2139] - satawalese2.mp3
[1450/2139] - german6.mp3
[1451/2139] - teochew1.mp3
[1452/2139] - spanish16.mp3
[1453/2139] - jola1.mp3
[1454/2139] - spanish17.mp3


In [20]:
def create_and_store_mfcc(dir):
    """Embed every ``.wav`` clip of every speaker folder under ``dir``.

    ``dir`` is expected to contain one sub-directory per speaker. For each wav
    clip the MFCC is read with ``read_mfcc``, sampled to ``NUM_FRAMES`` frames,
    passed to ``model.m.predict`` with a leading batch axis, and the
    prediction is saved as ``<output>/<speaker>/<clip name>.npy``. One
    progress line is printed per speaker.

    Args:
        dir: directory of speaker folders. The output directory is ``dir``
            with every occurrence of ``'audio'`` replaced by ``'mfcc'``.
    """
    mfcc_dir = dir.replace('audio', 'mfcc')
    Path(mfcc_dir).mkdir(parents=True, exist_ok=True)

    # Keep only sub-directories: stray files (e.g. .DS_Store or loose clips)
    # would make the per-speaker os.listdir below fail.
    speakers = [f for f in os.listdir(dir) if os.path.isdir(f'{dir}/{f}')]

    for i, speaker in enumerate(speakers):
        Path(f'{mfcc_dir}/{speaker}').mkdir(parents=True, exist_ok=True)

        for clip in os.listdir(f'{dir}/{speaker}'):
            # Test the real extension; a substring check plus str.replace
            # would mangle names that contain "wav" elsewhere.
            stem, extension = os.path.splitext(clip)
            if extension != '.wav':
                continue

            mfcc = sample_from_mfcc(read_mfcc(f'{dir}/{speaker}/{clip}', SAMPLE_RATE), NUM_FRAMES)
            predict = model.m.predict(np.expand_dims(mfcc, axis=0))
            np.save(f'{mfcc_dir}/{speaker}/{stem}.npy', predict)

        print(f'[{i + 1}/{len(speakers)}] - {speaker}')

# Embed the clips of every speaker folder under audio/split/Accents.
create_and_store_mfcc('audio/split/Accents')

 - pashto5
[1301/2138] - romanian15
[1302/2138] - english78
[1303/2138] - spanish33
[1304/2138] - arabic43
[1305/2138] - macedonian5
[1306/2138] - english129
[1307/2138] - english575
[1308/2138] - cantonese12
[1309/2138] - mandarin33
[1310/2138] - pashto2
[1311/2138] - tatar1
[1312/2138] - english323
[1313/2138] - english111
[1314/2138] - macedonian11
[1315/2138] - baga1
[1316/2138] - urdu4
[1317/2138] - portuguese13
[1318/2138] - english40
[1319/2138] - macedonian2
[1320/2138] - spanish34
[1321/2138] - arabic44
[1322/2138] - romanian12
[1323/2138] - english315
[1324/2138] - english127
[1325/2138] - english543
[1326/2138] - english82
[1327/2138] - zulu1
[1328/2138] - bai1
[1329/2138] - english76
[1330/2138] - portuguese25
[1331/2138] - arabic86
[1332/2138] - polish3
[1333/2138] - greek7
[1334/2138] - english118
[1335/2138] - macedonian18
[1336/2138] - arabic72
[1337/2138] - english49
[1338/2138] - arabic9
[1339/2138] - turkish6
[1340/2138] - macedonian20
[1341/2138] - english71
[1342/2

In [22]:
def combine_audio(first_path, second_path, destination_path, background_noise_amplitude=None):
    """Overlay one audio file on another and export the mix as WAV.

    Args:
        first_path: path of the base recording.
        second_path: path of the recording overlaid on top of the base.
        destination_path: where the combined WAV file is written.
        background_noise_amplitude: if given, the second recording is first
            adjusted to this loudness (dBFS) via ``match_target_amplitude``.
            ``None`` leaves its level untouched.
    """
    sound_1 = AudioSegment.from_file(first_path)
    sound_2 = AudioSegment.from_file(second_path)

    # Identity comparison: 0 is a valid amplitude and must not be confused
    # with "not provided".
    if background_noise_amplitude is not None:
        sound_2 = match_target_amplitude(sound_2, background_noise_amplitude)

    # NOTE(review): per pydub's overlay documentation the result keeps the
    # length of sound_1 -- confirm if longer overlays matter here.
    combined = sound_1.overlay(sound_2)

    combined.export(destination_path, format='wav')

In [23]:
import random

def add_noise(path, output_path, background_noise_amplitude=-20,
              noise_path='audio/split/BackgroundNoise/kitchen/BackgroundNoise_1.wav'):
    """Mix a background-noise clip into an audio file and write the result.

    Args:
        path: path of the clean recording.
        output_path: where the noisy WAV file is written.
        background_noise_amplitude: loudness (dBFS) the noise clip is adjusted
            to before mixing.
        noise_path: background-noise clip to mix in; defaults to the same
            kitchen-noise clip for every call.
    """
    combine_audio(path,
                  noise_path,
                  output_path,
                  background_noise_amplitude=background_noise_amplitude)

In [24]:
def create_noisy(input_subject):
    """Generate noisy copies of every clean clip of one SherlockHolmes subject.

    For each ``SherlockHolmes_<n>.wav`` in
    ``audio/split/SherlockHolmes/<input_subject>``, one copy is written per
    background-noise level from -30 dB to -12 dB inclusive, in steps of 2, as
    ``noisy_<level>_<n>.wav`` in the same folder.

    Failures on individual clips are printed and skipped so that one bad file
    does not stop the whole batch.

    Args:
        input_subject: name of the subject's folder under
            ``audio/split/SherlockHolmes``.
    """
    subject_path = f'audio/split/SherlockHolmes/{input_subject}'

    prefix, suffix = 'SherlockHolmes_', '.wav'

    # Collect the indices of the clean clips that actually exist. Sizing the
    # loop by the number of directory entries would also count the noisy_*
    # files written by an earlier run and try clips that do not exist.
    clip_indices = [
        f[len(prefix):-len(suffix)]
        for f in os.listdir(subject_path)
        if f.startswith(prefix) and f.endswith(suffix)
    ]
    clip_indices = sorted((c for c in clip_indices if c.isdecimal()), key=int)

    for clip_index in clip_indices:
        source_path = f'{subject_path}/SherlockHolmes_{clip_index}.wav'

        for amplitude in range(-30, -10, 2):
            output_path = f'{subject_path}/noisy_{amplitude}_{clip_index}.wav'

            try:
                add_noise(source_path, output_path, background_noise_amplitude=amplitude)
            except Exception as e:
                # Best effort: report which clip failed and keep going.
                print(f'{source_path}: {e}')


In [25]:
# Generate noisy clip variants for every subject folder under SherlockHolmes.
# NOTE(review): the module-level name `dir` shadows the built-in dir() for the
# rest of the session.
dir = 'audio/split/SherlockHolmes'

# Every directory entry except the .DS_Store file is treated as a subject.
classes = [f for f in os.listdir(dir) if f != '.DS_Store']

for subject in classes:
    print(subject)
    create_noisy(subject)

mikie
hailey
crystal
mei
changhan
daphne
swadhin
stephanie
cheryl
chad
ethan


In [26]:
def combine_two_speakers(subject_one, subject_two, clip_numbers=range(1, 8)):
    """Overlay the matching clips of two SherlockHolmes subjects.

    For every ``n`` in ``clip_numbers``, ``<n>.wav`` of each subject is read
    from ``audio/split/SherlockHolmes/<subject>`` and the overlaid result is
    written to
    ``audio/combined/SherlockHolmes/<subject_one>_<subject_two>/<n>.wav``.

    NOTE(review): ``split_audio`` in this notebook names its output
    ``<dir>_<n>.wav`` (counting from 0), whereas this function reads
    ``<n>.wav`` -- confirm which file layout is expected.

    Args:
        subject_one: folder name of the first subject (the base recording).
        subject_two: folder name of the second subject (overlaid on the first).
        clip_numbers: clip numbers to combine; defaults to 1 through 7.
    """
    subject_one_path = f'audio/split/SherlockHolmes/{subject_one}'
    subject_two_path = f'audio/split/SherlockHolmes/{subject_two}'

    combined_path = f'audio/combined/SherlockHolmes/{subject_one}_{subject_two}'
    Path(combined_path).mkdir(parents=True, exist_ok=True)

    for i in clip_numbers:
        path_one = f'{subject_one_path}/{i}.wav'
        path_two = f'{subject_two_path}/{i}.wav'

        combined = f'{combined_path}/{i}.wav'

        combine_audio(path_one, path_two, combined)