In [23]:
"""



To use this script, make the following preparations:
- Create a working directory
- Inside the working directory, have a single ..._clips.npy file. This file will correspond to which
  character's model the generated data will be used to train.
- Inside the working directory, have a single folder named 'overlay_clips'
- Inside the overlay_clips directory, include one or more ..._clips.npy files of other characters. This will
  be used generate data for the known character's model to differentiate from other characters.
- Inside the working directory, create an empty subdirectory named 'output'
"""

import numpy as np
import librosa
import os
import pydub
import random
import math
import pickle

from scipy.io.wavfile import write

In [2]:
"""
Specify script parameters
"""
# specify the working directory; note that directory notation must use '/' rather than '\'
wd = 'F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/'

# Initially using 70%/15%/15% proportions for training, validation, and testing
train_proportion = 0.7
val_proportion = 0.15

In [4]:
# this will store the known character's clips
known_char_clips = None

# this will store the clips for all other characters
other_char_clips_array = []

# navigate the specified working directory to determine known character and other character clips arrays
wd_contents = os.listdir(wd)
wd_overlays_contents = os.listdir(wd+'overlay_clips')

for item in wd_contents:
    if os.path.isfile(os.path.join(wd, item)):
        print('For known character data: %s\n' % (item))
        
        # load the known character's ...clips.npy file
        with open(wd+item, 'rb') as f:
            known_char_clips = np.load(f)
            
print('For other character data:')
for item in wd_overlays_contents:
    # load the current other character's ...clips.npy file
        with open(wd+'overlay_clips/'+item, 'rb') as f:
            other_char_clips_array.append(np.load(f))
            
        print('\t%s' % (item))
    
            
other_char_clips_array = np.array(other_char_clips_array)

For known character data: id10016_merged_cleaned.wav_clips.npy

For other character data:
	id10130_merged_cleaned.wav_clips.npy
	id10168_merged_cleaned.wav_clips.npy
	id10484_merged_cleaned.wav_clips.npy


In [5]:
print('known_char_clips.shape:', known_char_clips.shape)
print('other_char_clips_array.shape:', other_char_clips_array.shape)

known__char_clips.shape: (1497, 88200)
other_char_clips_array.shape: (3, 1497, 88200)


In [6]:
"""
Shuffle all clips for all characters and set up the clips to be overlayed.
Because clip overlaying is based on averaging values, overlaying a known char clip
with itself returns itself.

There are n+1 overlay options that are selected randomly, where n is the number of
other characters to choose from. The +1 is the choice of no overlay (by overlaying with
itself).
"""

# we need to shuffle order of the clips
# shuffle known character
random.shuffle(known_char_clips)
# shuffle other characters
for index in range(0, len(other_char_clips_array)):
    random.shuffle(other_char_clips_array[index])
    

# overlay_clips stores the clips to overlay the known character
overlay_clips = []

# randomly select clips from the other characters to append to overlay clips
# can also do the same clip as the known character (which represents no overlay, just the normal voice)
# all scenarios have equal chance
while len(overlay_clips) < len(known_char_clips):
    # determine random index
    # randint() is inclusive; if the int is out of bounds we interpret this as no overlay
    rand_index = random.randint(0, len(other_char_clips_array))
    
    # no overlay
    if rand_index == len(other_char_clips_array):
        # just duplicate the clip from the known char at the same index
        overlay_clips.append( known_char_clips[len(overlay_clips)])
        
    # other character's clip overlay
    else:
        overlay_clips.append( other_char_clips_array[rand_index][len(overlay_clips)])

length is now: 1
length is now: 2
length is now: 3
length is now: 4
length is now: 5
length is now: 6
length is now: 7
length is now: 8
length is now: 9
length is now: 10
length is now: 11
length is now: 12
length is now: 13
length is now: 14
length is now: 15
length is now: 16
length is now: 17
length is now: 18
length is now: 19
length is now: 20
length is now: 21
length is now: 22
length is now: 23
length is now: 24
length is now: 25
length is now: 26
length is now: 27
length is now: 28
length is now: 29
length is now: 30
length is now: 31
length is now: 32
length is now: 33
length is now: 34
length is now: 35
length is now: 36
length is now: 37
length is now: 38
length is now: 39
length is now: 40
length is now: 41
length is now: 42
length is now: 43
length is now: 44
length is now: 45
length is now: 46
length is now: 47
length is now: 48
length is now: 49
length is now: 50
length is now: 51
length is now: 52
length is now: 53
length is now: 54
length is now: 55
length is now: 56
l

length is now: 555
length is now: 556
length is now: 557
length is now: 558
length is now: 559
length is now: 560
length is now: 561
length is now: 562
length is now: 563
length is now: 564
length is now: 565
length is now: 566
length is now: 567
length is now: 568
length is now: 569
length is now: 570
length is now: 571
length is now: 572
length is now: 573
length is now: 574
length is now: 575
length is now: 576
length is now: 577
length is now: 578
length is now: 579
length is now: 580
length is now: 581
length is now: 582
length is now: 583
length is now: 584
length is now: 585
length is now: 586
length is now: 587
length is now: 588
length is now: 589
length is now: 590
length is now: 591
length is now: 592
length is now: 593
length is now: 594
length is now: 595
length is now: 596
length is now: 597
length is now: 598
length is now: 599
length is now: 600
length is now: 601
length is now: 602
length is now: 603
length is now: 604
length is now: 605
length is now: 606
length is no

length is now: 1113
length is now: 1114
length is now: 1115
length is now: 1116
length is now: 1117
length is now: 1118
length is now: 1119
length is now: 1120
length is now: 1121
length is now: 1122
length is now: 1123
length is now: 1124
length is now: 1125
length is now: 1126
length is now: 1127
length is now: 1128
length is now: 1129
length is now: 1130
length is now: 1131
length is now: 1132
length is now: 1133
length is now: 1134
length is now: 1135
length is now: 1136
length is now: 1137
length is now: 1138
length is now: 1139
length is now: 1140
length is now: 1141
length is now: 1142
length is now: 1143
length is now: 1144
length is now: 1145
length is now: 1146
length is now: 1147
length is now: 1148
length is now: 1149
length is now: 1150
length is now: 1151
length is now: 1152
length is now: 1153
length is now: 1154
length is now: 1155
length is now: 1156
length is now: 1157
length is now: 1158
length is now: 1159
length is now: 1160
length is now: 1161
length is now: 1162


In [9]:
"""
It should be noted that this only overlays the main character's clip with up to one other clip. 
It does not currently stack more than that.
"""

# the list of arrays to be overlayed (averaged)
list_to_overlay = [known_char_clips, overlay_clips]

output_clips = np.array(sum(list_to_overlay)/len(list_to_overlay))
print('output_clips.shape:', output_clips.shape)

(1497, 88200)


In [10]:
# # TMP TESTING

# for i in range(0, 5): 
#     loc = 'F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/output/test_overlay'+str(i)+'.wav'
#     write(loc, 22050, output_clips[i])
    
#     # okay, or overlay clips are not generated correctly
#     loc2 = 'F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/output/just_overlay_clip'+str(i)+'.wav'
#     write(loc2, 22050, overlay_clips[i])
    
    
# # confirmed they are not all the same
# # write('F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/output/known_char_4.wav', 
# #       22050, known_char_clips[4])
# # write('F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/output/known_char_1004.wav', 
# #       22050, known_char_clips[1004])

In [18]:
"""
I think we should try to maintain a 70%/15%/15% split for training, validation, and testing data.
Toward this end, I think that no clips that appear in one bracket should appear in another, even if
we are shuffling things.

These proportions can be specified above in the parameter section.
"""

# determine clip array slicing
end_frame_train = math.ceil(train_proportion * len(output_clips))
end_frame_val = end_frame_train + math.ceil(val_proportion * len(output_clips))

# slice the known character clips
known_char_clips_for_train = known_char_clips[0:end_frame_train]
known_char_clips_for_val = known_char_clips[end_frame_train:end_frame_val]
known_char_clips_for_test = known_char_clips[end_frame_val:]

# slice the output clips
output_clips_for_train = output_clips[0:end_frame_train]
output_clips_for_val = output_clips[end_frame_train:end_frame_val]
output_clips_for_test = output_clips[end_frame_val:]

In [22]:
print('Training frames: %i (0 through %i) \nValidation frames: %i (%i through %i) \nTest frames: %i (%i through %i)'
     % (len(output_clips_for_train), end_frame_train-1, len(output_clips_for_val), end_frame_train, end_frame_val-1,
       len(output_clips_for_test), end_frame_val, len(output_clips)))

print()
print('Training proportion: %i/%i = %f \nValidation proportion: %i/%i = %f \nTesting proportion: %i/%i = %f'
     % (len(output_clips_for_train), len(output_clips), len(output_clips_for_train)/len(output_clips),
        len(output_clips_for_val), len(output_clips), len(output_clips_for_val)/len(output_clips),
        len(output_clips_for_test), len(output_clips), len(output_clips_for_test)/len(output_clips)))

Training frames: 1048 (0 through 1047) 
Validation frames: 225 (1048 through 1272) 
Test frames: 224 (1273 through 1497)

Training proportion: 1048/1497 = 0.700067 
Validation proportion: 225/1497 = 0.150301 
Testing proportion: 224/1497 = 0.149633


In [24]:
"""
Save the six arrays to file in the form of a dict in the output directory.
"""

names_to_save = ['train_input', 'val_input', 'test_input', 'train_labels', 'val_labels', 'test_labels']
arrays_to_save = [output_clips_for_train, output_clips_for_val, output_clips_for_test,
                  known_char_clips_for_train, known_char_clips_for_val, known_char_clips_for_test]

audio_output_dict = {}
for index in range(0, len(names_to_save)):
    audio_output_dict[names_to_save[index]] = arrays_to_save[index]
    
mfcc_output_dict = {}
for index in range(0, len(names_to_save)):
    mfcc_output_dict[names_to_save[index]] = arrays_to_save[index]
    
with open(wd+'output/audio_training_data_dict.pickle', 'wb') as f:
    pickle.dump(output_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [64]:
# # https://stackoverflow.com/questions/42492246/how-to-normalize-the-volume-of-an-audio-file-in-python


<_io.BufferedRandom name='F:/ZaknafeinII_Backup_02-02-22/daea/training_data_generation/id16/output/test_overlay_normalized.wav'>

In [25]:
with open(wd+'output/training_data_dict.pickle', 'rb') as f:
    reloaded_dict = pickle.load(f)
    
print(reloaded_dict)
print(reloaded_dict.get('train_input').shape)

{'train_input': array([[ 1.4683174e-04, -2.3227781e-03, -8.5170730e-04, ...,
        -1.7538968e-02, -6.8547260e-03,  2.7653810e-03],
       [ 8.4838867e-03,  7.2326660e-03,  7.3242188e-03, ...,
        -2.6721191e-01, -2.8335571e-01, -2.8576660e-01],
       [ 1.4463626e-03, -8.1922405e-04,  2.7401600e-04, ...,
        -1.9984670e-02, -9.3052043e-03,  2.7387752e-04],
       ...,
       [ 4.5013428e-02,  4.5349121e-02,  4.5288086e-02, ...,
        -3.9672852e-03, -1.6479492e-03, -1.6174316e-03],
       [-2.0271394e-02, -1.8607082e-02, -1.7110532e-02, ...,
         9.9735454e-02,  7.0735320e-02,  6.3908428e-02],
       [-7.1490137e-04,  2.1752482e-04,  3.8339134e-04, ...,
        -9.8149432e-03, -1.1438089e-02, -1.1844619e-02]], dtype=float32), 'val_input': array([[ 0.00798617,  0.00540884,  0.00205768, ..., -0.00775404,
        -0.00725825, -0.00787746],
       [-0.01556396, -0.01248169, -0.0085144 , ...,  0.00726318,
         0.00616455,  0.00463867],
       [ 0.0013793 ,  0.00156799, 