In [1]:
"""



To use this script, make the following preparations:
- Create a working directory
- Inside the working directory, have a single ..._clips.npy file. This file will correspond to which
  character's model the generated data will be used to train.
- Inside the working directory, have a single folder named 'overlay_clips'
- Inside the overlay_clips directory, include one or more ..._clips.npy files of other characters. These will
  be used to generate data that lets the known character's model differentiate it from other characters.
- Inside the working directory, create an empty subdirectory named 'output'
"""

import numpy as np
import librosa
import os
import pydub
import random
import math
import pickle

from scipy.io.wavfile import write



In [2]:
"""
Script parameters -- adjust these before running the remaining cells
"""
# working directory; the path must be written with '/' separators rather than '\'
wd = 'F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/'

# dataset split: 70% training and 15% validation; whatever remains (15%) is used for testing
train_proportion = 0.7
val_proportion = 0.15

# audio sampling rate; 22050 is the default
sr = 22050

# MFCC framing, in audio frames (2048 and 512 are the respective defaults)
win_length = 2048
hop_length = 512

# how many MFCC features to extract per frame:
#   20 -> default of the librosa .wav->MFCC feature extraction
#   39 -> seems to be normal for speech recognition (but not reconstruction)
#   64 -> seems to be the minimum for good speech reconstruction, though reconstruction can be lengthy
n_mfcc = 64

In [3]:
# Load the clip arrays from the working directory:
#   - exactly one ..._clips.npy file directly inside wd  -> the known character
#   - every ..._clips.npy file inside wd/overlay_clips   -> the other characters
# Anything that is not a ..._clips.npy file (sub-folders, OS metadata files, ...) is ignored.

# file-name suffix that identifies a clips file
clips_suffix = '_clips.npy'
overlay_dir = os.path.join(wd, 'overlay_clips')

# this will store the known character's clips
known_char_clips = None

# this will store the clips for all other characters
other_char_clips_array = []

# navigate the specified working directory to determine known character and other character clips arrays
# (os.listdir() returns entries in arbitrary order, so sort to keep the load order repeatable)
wd_contents = sorted(os.listdir(wd))
wd_overlays_contents = sorted(os.listdir(overlay_dir))

known_char_files = [item for item in wd_contents
                    if item.endswith(clips_suffix) and os.path.isfile(os.path.join(wd, item))]

# the rest of the notebook assumes a single known character; fail early with a clear
# message instead of silently keeping whichever file happened to be listed last
if len(known_char_files) != 1:
    raise ValueError('Expected exactly one *%s file in %s but found %i: %s'
                     % (clips_suffix, wd, len(known_char_files), known_char_files))

print('For known character data: %s\n' % (known_char_files[0]))

# load the known character's ...clips.npy file
known_char_clips = np.load(os.path.join(wd, known_char_files[0]))

print('For other character data:')
for item in wd_overlays_contents:
    item_path = os.path.join(overlay_dir, item)
    if not (item.endswith(clips_suffix) and os.path.isfile(item_path)):
        continue

    # load the current other character's ...clips.npy file
    other_char_clips_array.append(np.load(item_path))
    print('\t%s' % (item))

# when every file holds an array of the same shape this stacks them along a new
# leading axis (one entry per other character)
other_char_clips_array = np.array(other_char_clips_array)

For known character data: id10016_merged_cleaned.wav_clips.npy

For other character data:
	id10130_merged_cleaned.wav_clips.npy
	id10168_merged_cleaned.wav_clips.npy
	id10484_merged_cleaned.wav_clips.npy


In [4]:
# report the dimensions of the loaded arrays
print(f'known_char_clips.shape: {known_char_clips.shape}')
print(f'other_char_clips_array.shape: {other_char_clips_array.shape}')

known_char_clips.shape: (1497, 88200)
other_char_clips_array.shape: (3, 1497, 88200)


In [5]:
"""
Shuffle all clips for all characters and set up the clips to be overlaid.
Because clip overlaying is based on averaging values, overlaying a known char clip
with itself returns itself.

There are n+1 overlay options that are selected randomly, where n is the number of
other characters to choose from. The +1 is the choice of no overlay (by overlaying with
itself).
"""

# we need to shuffle the order of the clips (rows), in place
#
# NOTE: random.shuffle() must NOT be used on a multi-dimensional NumPy array. It swaps
# elements with `x[i], x[j] = x[j], x[i]`, and for a 2-D array x[i] / x[j] are views, so
# the "swap" copies row j over row i and the original row i is lost. The result is an
# array with duplicated clips and missing clips. np.random.shuffle() permutes the rows
# along the first axis correctly.

# shuffle known character
np.random.shuffle(known_char_clips)
# shuffle other characters (each character's clips are shuffled independently)
for char_clips in other_char_clips_array:
    np.random.shuffle(char_clips)


# overlay_clips stores the clips to overlay the known character
overlay_clips = []

n_other_chars = len(other_char_clips_array)

# randomly select clips from the other characters to append to overlay clips
# can also do the same clip as the known character (which represents no overlay, just the normal voice)
# all scenarios have equal chance
for clip_index in range(len(known_char_clips)):
    # determine random index
    # randint() is inclusive; the extra value n_other_chars is interpreted as "no overlay"
    rand_index = random.randint(0, n_other_chars)

    # no overlay
    if rand_index == n_other_chars:
        # just duplicate the clip from the known char at the same index
        overlay_clips.append(known_char_clips[clip_index])

    # other character's clip overlay
    else:
        # uses the same clip index as the known character, so the chosen character
        # needs at least as many clips as the known character
        overlay_clips.append(other_char_clips_array[rand_index][clip_index])

In [6]:
"""
Note: the known character's clip is overlaid with at most one other clip here.
Stacking more than one overlay is not currently done.
"""

# the arrays that get overlaid, i.e. averaged element by element
list_to_overlay = [known_char_clips, overlay_clips]

# add everything together first, then divide by the number of layers
overlay_total = sum(list_to_overlay)
output_clips = np.array(overlay_total / len(list_to_overlay))
print(f'output_clips.shape: {output_clips.shape}')

output_clips.shape: (1497, 88200)


In [7]:
# # TMP TESTING

# for i in range(0, 5): 
#     loc = 'F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/output/test_overlay'+str(i)+'.wav'
#     write(loc, 22050, output_clips[i])
    
#     # okay, or overlay clips are not generated correctly
#     loc2 = 'F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/output/just_overlay_clip'+str(i)+'.wav'
#     write(loc2, 22050, overlay_clips[i])
    
    
# # confirmed they are not all the same
# # write('F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/output/known_char_4.wav', 
# #       22050, known_char_clips[4])
# # write('F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/output/known_char_1004.wav', 
# #       22050, known_char_clips[1004])

In [17]:
"""
Up to this point every clip is its own row. Join all clips end-to-end so that each
array becomes 1D.
"""

# flatten() returns a new 1D copy of each array
known_char_clips, output_clips = (
    clips.flatten() for clips in (known_char_clips, output_clips)
)

In [18]:
"""
The goal is a 70%/15%/15% split into training, validation, and testing data, with the
three brackets kept disjoint: nothing that lands in one bracket should land in another,
even if we are shuffling things.

The proportions themselves are set in the parameter section above.
"""

# work out where each bracket ends, measured in audio frames of the flattened arrays
# (these boundaries are computed from proportions of the total frame count, so they are
# not necessarily aligned with the edges of the original clips)
total_frames = len(output_clips)
end_frame_train = math.ceil(train_proportion * total_frames)
end_frame_val = end_frame_train + math.ceil(val_proportion * total_frames)

# one slice per bracket; the test bracket takes everything that is left
train_span = slice(0, end_frame_train)
val_span = slice(end_frame_train, end_frame_val)
test_span = slice(end_frame_val, None)

# cut up the known character clips
known_char_clips_for_train = known_char_clips[train_span]
known_char_clips_for_val = known_char_clips[val_span]
known_char_clips_for_test = known_char_clips[test_span]

# cut up the output clips with the very same boundaries
output_clips_for_train = output_clips[train_span]
output_clips_for_val = output_clips[val_span]
output_clips_for_test = output_clips[test_span]

In [19]:
# Summarise the split: size and inclusive frame-index range of each bracket,
# followed by the proportion of all frames that each bracket holds.

# every "through" value is the last index INSIDE the bracket, so the test bracket
# ends at len(output_clips)-1 (len(output_clips) itself is one past the end)
print('Training frames: %i (0 through %i) \nValidation frames: %i (%i through %i) \nTest frames: %i (%i through %i)'
     % (len(output_clips_for_train), end_frame_train-1, len(output_clips_for_val), end_frame_train, end_frame_val-1,
       len(output_clips_for_test), end_frame_val, len(output_clips)-1))

print()
print('Training proportion: %i/%i = %f \nValidation proportion: %i/%i = %f \nTesting proportion: %i/%i = %f'
     % (len(output_clips_for_train), len(output_clips), len(output_clips_for_train)/len(output_clips),
        len(output_clips_for_val), len(output_clips), len(output_clips_for_val)/len(output_clips),
        len(output_clips_for_test), len(output_clips), len(output_clips_for_test)/len(output_clips)))

Training frames: 92424780 (0 through 92424779) 
Validation frames: 19805310 (92424780 through 112230089) 
Test frames: 19805310 (112230090 through 132035400)

Training proportion: 92424780/132035400 = 0.700000 
Validation proportion: 19805310/132035400 = 0.150000 
Testing proportion: 19805310/132035400 = 0.150000


In [20]:
"""
Write the six arrays (the overlapping audio clips) to the output directory, bundled
together as a single dict.
"""

# dict keys, listed in the same order as the arrays below
names_to_save = ['train_input', 'val_input', 'test_input', 'train_labels', 'val_labels', 'test_labels']
# inputs are the overlaid audio; labels are the known character's clean audio
arrays_to_save = [output_clips_for_train, output_clips_for_val, output_clips_for_test,
                  known_char_clips_for_train, known_char_clips_for_val, known_char_clips_for_test]

# pair every name with its array
audio_output_dict = dict(zip(names_to_save, arrays_to_save))

# pickle the audio dict into the output directory
with open(wd+'output/audio_training_data_dict.pickle', 'wb') as f:
    pickle.dump(audio_output_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
# Compute MFCCs for each of the six saved arrays and pickle them as a dict that uses
# the same keys as the audio dict. Each array is passed to librosa as one continuous
# 1D signal (not clip by clip).
mfcc_output_dict = {}

# extraction settings shared by all six arrays
mfcc_settings = dict(
    sr=sr,
    S=None,
    n_mfcc=n_mfcc,
    dct_type=2,
    norm='ortho',
    lifter=0,
    win_length=win_length,  # default is 2048; with sr of 22050 audio frames/sec, this corresponds to ~93ms
    hop_length=hop_length,  # default is 512; with sr of 22050 audio frames/sec, this corresponds to ~23ms
)

# generate MFCCs
for split_name, split_audio in zip(names_to_save, arrays_to_save):
    mfcc_array = librosa.feature.mfcc(y=split_audio, **mfcc_settings)

    # swap the two axes so that the stored array is indexed (frame, coefficient)
    mfcc_output_dict[split_name] = np.swapaxes(np.array(mfcc_array), 0, 1)


# save the mfcc dict to file
with open(wd+'output/mfcc_training_data_dict.pickle', 'wb') as f:
    pickle.dump(mfcc_output_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
# Sanity check: read both pickles back in and show the shape of each 'train_input' entry.
files_to_check = (
    (wd+'output/audio_training_data_dict.pickle', 'audio_training_data_dict.shape:'),
    (wd+'output/mfcc_training_data_dict.pickle', 'mfcc_training_data_dict.shape:'),
)

for pickle_path, label in files_to_check:
    with open(pickle_path, 'rb') as f:
        reloaded_dict = pickle.load(f)
        print(label, reloaded_dict.get('train_input').shape)

audio_training_data_dict.shape: (92424780,)
mfcc_training_data_dict.shape: (180518, 64)
