In [1]:
"""



To use this script, make the following preparations:
- Create a working directory
- Inside the working directory, have a single ..._clips.npy file. This file will correspond to which
  character's model the generated data will be used to train.
- Inside the working directory, have a single folder named 'overlay_clips'
- Inside the overlay_clips directory, include one or more ..._clips.npy files of other characters. These will
  be used to generate data that teaches the known character's model to differentiate it from other characters.
- Inside the working directory, create an empty subdirectory named 'output'


03/26/22
Let's split the known character's training data into 4 second clips with 0.5 second jumps in between.
We can reuse each of these 4 second clips 4 times?

Then, we randomly select one of the overlay clips, and then randomly select a 4 second segment from that clip,
overlaying that with the 4 second clip from the known character.

This time, instead of pickling everything, we need to store it as raw .wav files and iteratively construct
a metadata file (.csv I think?).
We will need different metadata for training, validation, and testing.
I am unsure how to split these. I think we will generate all audio, and then randomly determine which goes where.
I think we can have three folders, "overlay", "char", and "noise", and then the three metadata files can use
the same folders but access different files.

To avoid overloading/wasting system resources, we cannot open all of the overlay clips at once. We can iterate
over the directory to collect their names, and then open and close them as needed.
"""

import numpy as np
import librosa
import os
import pydub
import random
import math
# import pickle

from scipy.io.wavfile import write

In [19]:
"""
Specify script parameters
"""
# Working directory for this run. Use forward slashes ('/'), never backslashes, and keep the
# trailing '/': later cells build file paths from it by plain string concatenation.
wd = 'F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id20/'

# Dataset split: 70% training and 15% validation (initial choice for a 70%/15%/15% split).
train_proportion = 0.7
val_proportion = 0.15

# Audio sampling rate in frames per second; 22050 is the default.
sr = 22050

# How many samples to generate, and how long each one is.
desired_samples = 100
sample_length = 4  # seconds
sample_framecount = sample_length * sr  # frames per generated sample

In [3]:
# Locate and load the audio used by the rest of the notebook:
#   - `known_char_wav`: samples of the single audio file that sits directly in `wd`
#   - `other_char_clips_array`: paths (not audio) of every file in `wd`/overlay_clips
#
# `sr` is the sampling rate chosen in the parameters cell. It is passed to librosa explicitly so the
# loaded audio always matches `sample_framecount` (which was derived from that same `sr`); it is no
# longer reset to None here.

# this will store the known character's audio samples
known_char_wav = None

# this will store the file paths of the clips for all other characters
other_char_clips_array = []

# navigate the specified working directory; sorted so the listing order is deterministic
wd_contents = sorted(os.listdir(wd))
wd_overlays_contents = sorted(os.listdir(wd+'overlay_clips'))

# only plain files count as known-character data (this skips 'overlay_clips' and 'output')
known_char_files = [item for item in wd_contents if os.path.isfile(os.path.join(wd, item))]
if len(known_char_files) != 1:
    # zero files would leave known_char_wav as None; several files would silently keep only the last
    raise ValueError('Expected exactly one known character file in %s, found %i: %s'
                     % (wd, len(known_char_files), known_char_files))

item = known_char_files[0]
print('For known character data: %s\n' % (item))

# load the known character's audio file, resampled to the configured sampling rate
known_char_wav, sr = librosa.load(wd+item, sr=sr)

print('For other character data:')
for item in wd_overlays_contents:
    overlay_path = wd+'overlay_clips/'+item

    # ignore anything that is not a regular file (e.g. stray sub-directories)
    if not os.path.isfile(overlay_path):
        continue

    # only the path is stored; the audio itself is opened later, one file at a time
    other_char_clips_array.append(overlay_path)
    print('\t%s' % (item))

For known character data: id10020_merged.wav

For other character data:
	id10016_merged.wav
	id10018_merged.wav
	id10045_merged.wav
	id10104_merged.wav
	id10107_merged.wav
	id10130_merged.wav
	id10148_merged.wav
	id10168_merged.wav
	id10254_merged.wav
	id10321_merged.wav
	id10343_merged.wav
	id10352_merged.wav
	id10353_merged.wav
	id10356_merged.wav
	id10376_merged.wav
	id10449_merged.wav
	id10484_merged.wav
	id10486_merged.wav
	id10525_merged.wav
	id10535_merged.wav
	id10715_merged.wav
	id10756_merged.wav
	id10758_merged.wav
	id10786_merged.wav
	id10856_merged.wav
	id10873_merged.wav
	id10918_merged.wav
	id10921_merged.wav
	id10929_merged.wav
	id10931_merged.wav
	id10983_merged.wav
	id10996_merged.wav
	id10997_merged.wav
	id11022_merged.wav
	id11027_merged.wav
	id11045_merged.wav
	id11072_merged.wav
	id11088_merged.wav
	id11105_merged.wav
	id11120_merged.wav
	id11166_merged.wav
	id11182_merged.wav
	id11211_merged.wav
	id11234_merged.wav


In [17]:
# Report how much known-character audio was loaded, plus the number of overlay files found.
# The total duration (seconds) is peeled apart into hours, then minutes, leaving the
# remaining seconds in `duration`.
duration = librosa.get_duration(y=known_char_wav, sr=sr)
hours, duration = divmod(duration, 3600)
minutes, duration = divmod(duration, 60)
print('known_char_wav duration: %i:%i:%i' % (hours, minutes, duration))

# NOTE(review): `.shape` is used directly as the format arguments, so this only formats
# correctly while the array is one-dimensional - confirm the audio is always loaded as mono.
print('known_char_wav frame count: %i' % (known_char_wav.shape))
print('other_char_clips_array length:', len(other_char_clips_array))

known_char_wav duration: 1:15:9
known_char_wav frame count: 99441902
other_char_clips_array length: 44
(99441902,)


In [None]:
def _wrapped_clip(wav, start_frame, framecount):
    """Return `framecount` consecutive frames of `wav` starting at `start_frame`.

    Frames that would fall past the end of `wav` are taken from its beginning instead
    (the index wraps around), so the result always has exactly `framecount` frames, even
    when `wav` itself is shorter than `framecount`.
    """
    frame_indices = np.arange(start_frame, start_frame + framecount)
    return np.take(wav, frame_indices, axis=0, mode='wrap')


# One record per generated sample; the loop below runs until enough records exist.
metadata_array = []
char_framecount = known_char_wav.shape[0]

# NOTE(review): this overrides the `desired_samples` value from the parameters cell;
# it looks like a small debugging value - confirm before a full run.
desired_samples = 2

while len(metadata_array) < desired_samples:
    # randomly pick where the known character's clip starts; randrange excludes the upper
    # bound, so the start is always a valid frame index
    char_start_frame = random.randrange(char_framecount)

    # slice out the clip for the known character; if the starting frame is close to the
    # end of the .wav file, loop back to the beginning
    char_clip = _wrapped_clip(known_char_wav, char_start_frame, sample_framecount)

    # randomly determine which other character's .wav file to use; random.choice cannot
    # produce an out-of-range index (random.randint(0, len(...)) could, as it is inclusive)
    noise_path = random.choice(other_char_clips_array)

    # only one overlay file is held in memory at a time; load it at the same sampling rate
    # as the known character's audio
    other_char_wav, _ = librosa.load(noise_path, sr=sr)

    noise_framecount = other_char_wav.shape[0]
    noise_start_frame = random.randrange(noise_framecount)
    noise_clip = _wrapped_clip(other_char_wav, noise_start_frame, sample_framecount)

    print(char_clip.shape)
    print(noise_clip.shape)

    # record how this sample was built; without this append the loop would never end
    # TODO: extend the record (e.g. output file names) once the clips are written to disk
    metadata_array.append({
        'char_start_frame': char_start_frame,
        'noise_file': noise_path,
        'noise_start_frame': noise_start_frame,
    })


(88200,)
(88200,)


In [6]:
## REFERENCE CODE BELOW ##

In [None]:
"""
Shuffle all clips for all characters and set up the clips to be overlayed.
Because clip overlaying is based on averaging values, overlaying a known char clip
with itself returns itself.

There are n+1 overlay options that are selected randomly, where n is the number of
other characters to choose from. The +1 is the choice of no overlay (by overlaying with
itself).
"""

# NOTE(review): legacy reference code. It expects `known_char_clips` and every entry of
# `other_char_clips_array` to be a collection of clips; nothing visible in this notebook
# defines them that way any more - confirm before running.

# we need to shuffle the order of the clips
# np.random.shuffle is used instead of random.shuffle: random.shuffle swaps items with
# `x[i], x[j] = x[j], x[i]`, and on a multi-dimensional ndarray those items are views, so the
# swap overwrites one row with the other and clips end up duplicated/lost.
# np.random.shuffle reorders along the first axis and also accepts plain lists.
# shuffle known character
np.random.shuffle(known_char_clips)
# shuffle other characters
for char_clips in other_char_clips_array:
    np.random.shuffle(char_clips)


# overlay_clips stores the clips to overlay the known character
overlay_clips = []

# randint() is inclusive, so this one-past-the-end index can be drawn; it means "no overlay"
no_overlay_index = len(other_char_clips_array)

# randomly select clips from the other characters to append to overlay clips
# can also do the same clip as the known character (which represents no overlay, just the normal voice)
# all scenarios have equal chance
while len(overlay_clips) < len(known_char_clips):
    # position of the clip being chosen; the overlay is taken from this same position
    clip_index = len(overlay_clips)
    rand_index = random.randint(0, no_overlay_index)

    # no overlay
    if rand_index == no_overlay_index:
        # just duplicate the clip from the known char at the same index
        overlay_clips.append(known_char_clips[clip_index])

    # other character's clip overlay
    else:
        # NOTE(review): assumes every other character has at least as many clips as the
        # known character; a shorter collection would raise IndexError here - confirm.
        overlay_clips.append(other_char_clips_array[rand_index][clip_index])

In [None]:
"""
It should be noted that this only overlays the main character's clip with up to one other clip. 
It does not currently stack more than that.
"""

# Turn the list of selected overlay clips into a single ndarray so it can be combined
# element-wise with the known character's clips.
overlay_clips = np.array(overlay_clips)

# the arrays to be overlayed; the overlay is their element-wise average
list_to_overlay = [known_char_clips, overlay_clips]
overlay_count = len(list_to_overlay)
overlay_total = sum(list_to_overlay)

output_clips = np.array(overlay_total / overlay_count)
print('output_clips.shape:', output_clips.shape)

In [None]:
# # TMP TESTING

# for i in range(0, 5): 
#     loc = 'F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/output/test_overlay'+str(i)+'.wav'
#     write(loc, 22050, output_clips[i])
    
#     # okay, or overlay clips are not generated correctly
#     loc2 = 'F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/output/just_overlay_clip'+str(i)+'.wav'
#     write(loc2, 22050, overlay_clips[i])
    
    
# # confirmed they are not all the same
# # write('F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/output/known_char_4.wav', 
# #       22050, known_char_clips[4])
# # write('F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/output/known_char_1004.wav', 
# #       22050, known_char_clips[1004])

In [None]:
"""
Currently the clips are separated. Let's append them all end-to-end such that the arrays
are 1D.
"""

# When `flatten` is set, collapse each clip array to one dimension.
# NOTE(review): `flatten` is not defined anywhere in the visible notebook - confirm where it is set.
if flatten:
    known_char_clips, overlay_clips, output_clips = (
        known_char_clips.flatten(),
        overlay_clips.flatten(),
        output_clips.flatten(),
    )

In [None]:
"""
I think we should try to maintain a 70%/15%/15% split for training, validation, and testing data.
Toward this end, I think that no clips that appear in one bracket should appear in another, even if
we are shuffling things.

These proportions can be specified above in the parameter section.
"""

# Work out where the training and validation portions end. Both sizes are rounded up;
# the test portion is whatever remains after them.
clip_total = len(output_clips)
end_frame_train = math.ceil(train_proportion * clip_total)
end_frame_val = end_frame_train + math.ceil(val_proportion * clip_total)

# The same three index ranges are applied to every array, so entries stay aligned.
train_span = slice(0, end_frame_train)
val_span = slice(end_frame_train, end_frame_val)
test_span = slice(end_frame_val, None)
all_spans = (train_span, val_span, test_span)

# known character clips
known_char_clips_for_train, known_char_clips_for_val, known_char_clips_for_test = (
    known_char_clips[span] for span in all_spans
)

# overlay (noise) clips
noise_clips_for_train, noise_clips_for_val, noise_clips_for_test = (
    overlay_clips[span] for span in all_spans
)

# output (overlayed) clips
output_clips_for_train, output_clips_for_val, output_clips_for_test = (
    output_clips[span] for span in all_spans
)

In [None]:
# Summarise the train/validation/test split: the size of each portion, the inclusive index
# range it covers, and its share of the whole.
total_frames = len(output_clips)

# Every range is printed with an inclusive last index, so the test range ends at
# total_frames-1 (it used to print total_frames, one past the final element).
print('Training frames: %i (0 through %i) \nValidation frames: %i (%i through %i) \nTest frames: %i (%i through %i)'
     % (len(output_clips_for_train), end_frame_train-1,
        len(output_clips_for_val), end_frame_train, end_frame_val-1,
        len(output_clips_for_test), end_frame_val, total_frames-1))

print()
print('Training proportion: %i/%i = %f \nValidation proportion: %i/%i = %f \nTesting proportion: %i/%i = %f'
     % (len(output_clips_for_train), total_frames, len(output_clips_for_train)/total_frames,
        len(output_clips_for_val), total_frames, len(output_clips_for_val)/total_frames,
        len(output_clips_for_test), total_frames, len(output_clips_for_test)/total_frames))

In [None]:
"""
Save the nine arrays (input, target, and noise clips for each of train/val/test) to file in the form of a dict in the output directory. These are the overlapping audio clips.
"""

# Dict keys and the arrays stored under them, listed in matching order:
# inputs are the overlayed clips, targets the known character's clips, noise the overlay clips.
names_to_save = [
    'train_input', 'val_input', 'test_input',
    'train_targets', 'val_targets', 'test_targets',
    'train_noise', 'val_noise', 'test_noise',
]
arrays_to_save = [
    output_clips_for_train, output_clips_for_val, output_clips_for_test,
    known_char_clips_for_train, known_char_clips_for_val, known_char_clips_for_test,
    noise_clips_for_train, noise_clips_for_val, noise_clips_for_test,
]

# pair every name with its array to build the dict that gets stored
audio_output_dict = dict(zip(names_to_save, arrays_to_save))

# save the audio dict to file; flattened data gets a 'flat_' filename prefix
# NOTE(review): this needs `pickle`, whose import is commented out in the imports cell.
savename = ('flat_' if flatten else '') + 'audio_training_data_dict.pickle'
with open(wd+'output/'+savename, 'wb') as f:
    pickle.dump(audio_output_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Compute MFCCs for every array in `arrays_to_save` and store them under the matching name
# from `names_to_save`, then pickle the resulting dict into the output directory.
# NOTE(review): `n_mfcc`, `win_length`, `hop_length` and `flatten` are not defined in the visible
# notebook, and the `pickle` import is commented out in the imports cell - confirm before running.
mfcc_output_dict = {}

# keyword arguments shared by every MFCC computation below
mfcc_settings = dict(
    sr=sr,
    S=None,
    n_mfcc=n_mfcc,
    dct_type=2,
    norm='ortho',
    lifter=0,
    win_length=win_length, # default is 2048; with sr of 22050 audio frames/sec, this corresponds to ~93ms
    hop_length=hop_length, # default is 512; with sr of 22050 audio frames/sec, this corresponds to ~23ms
)

# generate MFCCs
for save_name, save_array in zip(names_to_save, arrays_to_save):
    if flatten:
        # flattened data: the whole array is treated as one continuous signal
        mfcc_output_dict[save_name] = np.array(
            librosa.feature.mfcc(y=save_array, **mfcc_settings)
        )
    else:
        # separated data: one MFCC matrix per clip, stacked into a single array
        mfcc_output_dict[save_name] = np.array(
            [librosa.feature.mfcc(y=clip, **mfcc_settings) for clip in save_array]
        )


# save the mfcc dict to file; flattened data gets a 'flat_' filename prefix
savename = ('flat_' if flatten else '') + 'mfcc_training_data_dict.pickle'
with open(wd+'output/'+savename, 'wb') as f:
    pickle.dump(mfcc_output_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Sanity check: reload both saved dicts and print the shape of their 'train_input' entry.
#
# Each file name is rebuilt explicitly. Reusing `savename` opened the same file twice (whichever
# was saved last), so the line labelled "audio" reported the MFCC dict instead.
#
# NOTE(review): this needs `pickle`, whose import is commented out in the imports cell.
# pickle.load can execute arbitrary code; only use it on files this notebook wrote itself.
name_prefix = 'flat_' if flatten else ''

for dict_label in ('audio_training_data_dict', 'mfcc_training_data_dict'):
    with open(wd+'output/'+name_prefix+dict_label+'.pickle', 'rb') as f:
        reloaded_dict = pickle.load(f)
    print(dict_label+'.shape:', reloaded_dict.get('train_input').shape)