In [5]:
from glob import iglob, glob
import os
import sys
from python_speech_features import mfcc
import numpy as np
import random
import progressbar
import librosa

def data_split(folder, partition_dict=None, seed=78):
    """Split VCTK data into train, dev, test sets by moving files on disk.

    For every speaker directory under ``<folder>/wav`` the wav files are
    shuffled with a seeded generator and moved to
    ``<folder>/<split>/wav/<speaker>/``. The transcript of each moved
    utterance is moved from ``<folder>/txt/<speaker>/<stem>.txt`` to
    ``<folder>/<split>/txt/<speaker>/<stem>.txt``.

    The number of files per split is ``round(ratio * n_files)``; files left
    over after rounding stay where they are, and if rounding asks for more
    files than a speaker has, the later splits simply receive fewer.

        Args:
            folder: the folder path to the data (string)
            partition_dict: dictionary mapping split name to ratio
                (default 0.8/0.1/0.1 for train/dev/test); must sum to 1
            seed: seed of the per-speaker shuffle
        Returns:
            None
        Raises:
            AssertionError: if the ratios do not sum to 1.
    """
    if partition_dict is None:
        partition_dict = {'train': 0.8, 'dev': 0.1, 'test': 0.1}
    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 is not exactly 1.0 in
    # binary floating point, so an exact == would reject a valid partition.
    assert abs(sum(partition_dict.values()) - 1.0) < 1e-9, \
        'partition ratios must sum to 1'

    for speaker_folder in glob(os.path.join(folder, 'wav', '*')):
        speaker = os.path.basename(speaker_folder)
        # glob order depends on the filesystem; sort first so that the seed
        # alone determines the split.
        wav_files = sorted(glob(os.path.join(speaker_folder, '*.wav')))
        # A private generator gives the same shuffle as random.seed(seed)
        # followed by random.shuffle, without touching the global RNG state.
        random.Random(seed).shuffle(wav_files)
        total = len(wav_files)

        for name, ratio in partition_dict.items():
            for _ in range(round(ratio * total)):
                if not wav_files:
                    # Rounding requested more files than this speaker has.
                    break
                audio = wav_files.pop()
                filename = os.path.basename(audio)
                stem = os.path.splitext(filename)[0]
                os.renames(audio,
                           os.path.join(folder, name, 'wav', speaker, filename))
                # Build the transcript paths from their components instead of
                # replacing every "wav" substring, which would corrupt paths
                # whose corpus root or file name also contains "wav".
                os.renames(os.path.join(folder, 'txt', speaker, stem + '.txt'),
                           os.path.join(folder, name, 'txt', speaker, stem + '.txt'))

def find_files(directory, pattern='**/*.wav'):
    """Return the sorted list of paths under `directory` that match `pattern`.

    The pattern is joined onto `directory` and expanded as a recursive glob,
    so ``**`` may span any number of sub-directories.
    """
    search_expr = os.path.join(directory, pattern)
    matches = list(iglob(search_expr, recursive=True))
    matches.sort()
    return matches

def read_audio_from_filename(filename, sample_rate):
    """Load an audio file as mono at `sample_rate` and return it as a column.

    The samples returned by ``librosa.load`` are reshaped to ``(-1, 1)``,
    i.e. one sample per row; the sample rate reported by librosa is dropped.
    """
    samples = librosa.load(filename, sr=sample_rate, mono=True)[0]
    return samples.reshape(-1, 1)

def convert_txt_index(target_txt):
    """Encode a transcript as an array of integer labels.

    The text is stripped, lower-cased and has the characters
    ``. ? , ' ! - TAB ) "`` removed. Each space becomes SPACE_INDEX and every
    other character ``c`` becomes ``ord(c) - FIRST_INDEX`` (SPACE_TOKEN,
    SPACE_INDEX and FIRST_INDEX are module-level constants).

    Returns:
        (targets, original): the label array and the cleaned text it encodes.
    """
    # One deletion table instead of a chain of single-character replaces.
    dropped = str.maketrans('', '', '.?,\'!-\t)"')
    original = target_txt.strip().lower().translate(dropped)

    # Doubling every space makes split(' ') yield an empty string wherever a
    # space stood; those empty strings are what mark the space label.
    pieces = original.replace(' ', '  ').split(' ')
    symbols = []
    for piece in pieces:
        symbols.append(list(piece) if piece != '' else SPACE_TOKEN)
    targets = np.hstack(symbols)

    # Transform each symbol into its index
    indices = []
    for symbol in targets:
        if symbol == SPACE_TOKEN:
            indices.append(SPACE_INDEX)
        else:
            indices.append(ord(symbol) - FIRST_INDEX)
    targets = np.asarray(indices)
    return targets, original

def return_txt_path(wav_path):
    """Return the corresponding txt location for VCTK data set.

    Maps ``.../wav/<speaker>/<utt>.wav`` to ``.../txt/<speaker>/<utt>.txt``.
    Only the ``.wav`` extension and the last directory component named
    exactly ``wav`` are rewritten, so other occurrences of "wav" in the path
    (a corpus root such as ``wavdata`` or a file stem such as ``wave_01``)
    are left untouched.

        Args:
            wav_path: '/'-separated path of a wav file (string)
        Returns:
            The path of the matching transcript (string).
    """
    directory, sep, filename = wav_path.rpartition('/')
    if filename.endswith('.wav'):
        filename = filename[:-len('.wav')] + '.txt'
    parts = directory.split('/')
    # Walk from the file towards the root: the 'wav' folder closest to the
    # file is the one that has a sibling 'txt' folder.
    for idx in range(len(parts) - 1, -1, -1):
        if parts[idx] == 'wav':
            parts[idx] = 'txt'
            break
    return '/'.join(parts) + sep + filename

def find_speaker_ID(wav_path):
    """Return the file name of a wav path up to its first dot.

    For ``.../p225/p225_001.wav`` this is ``'p225_001'``, the identifier the
    callers in this notebook use as the npz file name. The directory part is
    removed before looking for the dot, so dots in directory names
    (``VCTK-Corpus-0.92``, ``./data``) no longer truncate the result.

        Args:
            wav_path: path of a wav file (string)
        Returns:
            The file name without directories and without anything from the
            first '.' onwards (string).
    """
    return os.path.basename(wav_path).split('.')[0]

def pack_data_npz(DIR, input_mfcc, target, speaker_wav_ID, original):
    """Save one utterance to ``<DIR>/<speaker_wav_ID>.npz`` with np.savez.

    Stored arrays: ``data_in`` (the features), ``target`` (the labels),
    ``seq_len`` (one-element array holding ``len(input_mfcc)``) and
    ``original`` (one-element array holding the transcript text).
    """
    destination = os.path.join(DIR, speaker_wav_ID)
    payload = {
        'data_in': input_mfcc,
        'target': target,
        'seq_len': np.array([len(input_mfcc)]),
        'original': np.array([original]),
    }
    np.savez(destination, **payload)

def convert_wav_mfcc(file, fs, numcep=39, nfilt=40, winlen=0.025, winstep=0.01):
    """Turn raw audio data into MFCC with sample rate=fs.

    The defaults match the settings used in this notebook when the
    normalization statistics are computed (numcep=39, nfilt=40, 25 ms
    window, 10 ms step), so the result has one column per entry of the
    39-element mean/std arrays. Calling ``mfcc`` with its library defaults
    instead would give a different number of coefficients and the
    normalization ``(inputs - m) / s`` could not be applied.

        Args:
            file: path of the audio file (string)
            fs: sample rate used both for loading and for the MFCC
            numcep: number of cepstral coefficients to return
            nfilt: number of filters in the filterbank
            winlen: analysis window length in seconds
            winstep: step between successive windows in seconds
        Returns:
            The array returned by ``python_speech_features.mfcc``.
    """
    signal = read_audio_from_filename(file, fs)
    return mfcc(signal, samplerate=fs, winlen=winlen, winstep=winstep,
                numcep=numcep, nfilt=nfilt)

In [2]:
# Corpus location: root directory of the VCTK corpus, passed to data_split.
Corpus_dir = '/home/jovyan/work/VCTK-Corpus/'

In [None]:
# Split VCTK data into train/dev/test sets.
# NOTE: data_split moves the files with os.renames, so this cell changes the
# corpus directory on disk.
data_split(Corpus_dir)

In [30]:
# Find the per-coefficient mean and variance of the training data for
# normalization, accumulating sums file by file.
directory = '/home/jovyan/work/VCTK-Corpus/train/wav'

bar = progressbar.ProgressBar()
n = 0                       # number of files processed
sum_mfcc = np.zeros(39)     # running sum of every MFCC coefficient
sumsq_mfcc = np.zeros(39)   # running sum of squares of every coefficient
total_len = 0               # total number of MFCC rows seen so far
for file in bar(find_files(directory, pattern='**/*.wav')):
    audio = mfcc(read_audio_from_filename(file, 16000), samplerate=16000,
                 winlen=0.025, winstep=0.01, numcep=39, nfilt=40)

    sum_mfcc += np.sum(audio, axis=0)
    sumsq_mfcc += np.sum(audio*audio, axis=0)
    total_len += len(audio)
    n += 1

m = sum_mfcc/total_len
# Unbiased sample variance: (sum(x^2) - N*mean^2) / (N - 1).
# Dividing only the sum of squares by N-1 and then subtracting mean^2 mixes
# the N-1 and N denominators and is neither the sample nor the population
# variance.
v = (sumsq_mfcc - total_len*m*m)/(total_len-1)
s = np.sqrt(v)

print(m)
print(v)
print(s)


100% (34503 of 34503) |###################| Elapsed Time: 1:47:12 Time: 1:47:12


[ -7.51332611e+00  -4.00169070e+00   7.40849740e-01   9.76145029e+00
  -1.65544151e+00   1.59567083e+00  -1.26322944e+00  -1.38907894e+00
   4.64418496e+00   4.55213115e+00   3.82397182e-01   1.79470392e+00
  -2.83868887e+00  -1.21074392e+00  -4.97693218e+00  -1.56840539e-01
  -1.03099064e+00   1.28082167e+00  -1.26283074e+00   1.01066514e+00
   8.03630297e-02   5.38606156e-01  -8.57827423e-02   3.32506741e-03
   1.97867405e-01  -5.21774920e-01   1.38001732e-02  -1.08547029e+00
  -2.26743751e-02  -2.90453150e-01   4.93080491e-01   1.72272966e-01
   8.09385140e-01  -5.41125582e-01   2.79285963e-01  -6.73249785e-01
  -5.44239050e-04  -9.24394111e-02   3.01032922e-02]
[  1.84350572e+01   3.27742824e+02   4.52100067e+02   2.59120249e+02
   4.55738787e+02   4.61610212e+02   4.11192739e+02   4.66312878e+02
   3.13548502e+02   2.97562939e+02   2.33614791e+02   2.50528736e+02
   1.98591050e+02   1.58439708e+02   1.36510094e+02   1.01707235e+02
   6.63688379e+01   4.50270724e+01   3.17288357e+0

In [31]:
# Total number of MFCC rows accumulated over all training files
print(total_len)

11916812


In [12]:
# Label conventions read by convert_txt_index: a space is represented by
# SPACE_TOKEN and encoded as SPACE_INDEX; any other character c is encoded as
# ord(c) - FIRST_INDEX, so 'a' maps to 1.
SPACE_TOKEN = '<space>'
SPACE_INDEX = 0
FIRST_INDEX = ord('a') - 1 

In [13]:
# Train_DIR: output folder for the training .npz files.
# directory: root of the training split that is searched for wav files.
Train_DIR = '/home/jovyan/work/MFCC_39_16khz/train/'
directory = '/home/jovyan/work/VCTK-Corpus/train/'

In [14]:
# Hard-coded normalization statistics, one entry per MFCC coefficient (39).
# m repeats the mean vector printed by the statistics cell of this notebook.
# s: the entries that can be checked against the (truncated) printed variance
# are its square roots; presumably the whole vector is the printed standard
# deviation -- TODO confirm against a full run.
m = np.array([ -7.51332611e+00,  -4.00169070e+00,   7.40849740e-01,   9.76145029e+00,
  -1.65544151e+00,   1.59567083e+00,  -1.26322944e+00,  -1.38907894e+00,
   4.64418496e+00,   4.55213115e+00,   3.82397182e-01,   1.79470392e+00,
  -2.83868887e+00,  -1.21074392e+00,  -4.97693218e+00,  -1.56840539e-01,
  -1.03099064e+00,   1.28082167e+00,  -1.26283074e+00,   1.01066514e+00,
   8.03630297e-02,   5.38606156e-01,  -8.57827423e-02,   3.32506741e-03,
   1.97867405e-01,  -5.21774920e-01,   1.38001732e-02,  -1.08547029e+00,
  -2.26743751e-02,  -2.90453150e-01,   4.93080491e-01,   1.72272966e-01,
   8.09385140e-01,  -5.41125582e-01,   2.79285963e-01,  -6.73249785e-01,
  -5.44239050e-04,  -9.24394111e-02,   3.01032922e-02])
s = np.array([  4.29360655,  18.10366881,  21.26264487,  16.09721247,  21.34803941,
  21.48511605,  20.27788793,  21.59427883,  17.7073008,   17.25001272,
  15.28446241,  15.82809957,  14.09223369,  12.58728358,  11.68375343,
  10.08500048,   8.14670718,   6.71022149,   5.63283549,   4.23743571,
   3.0205074,    1.90752181,   0.71521341,   0.37545172,   1.37520613,
   2.24487373,   3.03946431,   3.7460076,    4.17240994,   4.56583527,
   4.83715023,   4.99518288,   5.02659311,   4.78371701,   4.3470543,
   4.0579483,    3.85892456,   3.39243375,   2.9300941 ])

In [15]:
# Pickle Training data: for every wav file under `directory`, encode the
# transcript, compute the MFCC features, normalize them with m and s, and
# save the result as one .npz file in Train_DIR.

# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(Train_DIR, exist_ok=True)

bar = progressbar.ProgressBar()
for wav_path in bar(find_files(directory, pattern='**/*.wav')):
    speaker_ID = find_speaker_ID(wav_path)
    txt_path = return_txt_path(wav_path)
    # Close every transcript as soon as it is read; the loop visits tens of
    # thousands of files and should not rely on garbage collection to
    # release the handles.
    with open(txt_path) as txt_file:
        target, original = convert_txt_index(txt_file.read().strip())
    inputs = convert_wav_mfcc(wav_path, 16000)
    normalize_inputs = (inputs - m)/s
    pack_data_npz(Train_DIR, normalize_inputs, target, speaker_ID, original)


100% (34503 of 34503) |###################| Elapsed Time: 1:23:04 Time: 1:23:04


In [19]:
# Pickle Dev data: same processing as for the training data, using the
# normalization statistics m and s.

DIR = '/home/jovyan/work/MFCC_39_16khz/dev/'
directory = '/home/jovyan/work/VCTK-Corpus/dev/'

# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(DIR, exist_ok=True)

bar = progressbar.ProgressBar()
for wav_path in bar(find_files(directory, pattern='**/*.wav')):
    speaker_ID = find_speaker_ID(wav_path)
    txt_path = return_txt_path(wav_path)
    # Close every transcript as soon as it is read instead of leaving the
    # handle to the garbage collector.
    with open(txt_path) as txt_file:
        target, original = convert_txt_index(txt_file.read().strip())
    inputs = convert_wav_mfcc(wav_path, 16000)
    normalize_inputs = (inputs - m)/s
    pack_data_npz(DIR, normalize_inputs, target, speaker_ID, original)

100% (4312 of 4312) |#####################| Elapsed Time: 0:13:35 Time: 0:13:35


In [20]:
# Pickle Test data: same processing as for the training data, using the
# normalization statistics m and s.

DIR = '/home/jovyan/work/MFCC_39_16khz/test/'
directory = '/home/jovyan/work/VCTK-Corpus/test/'

# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(DIR, exist_ok=True)

bar = progressbar.ProgressBar()
for wav_path in bar(find_files(directory, pattern='**/*.wav')):
    speaker_ID = find_speaker_ID(wav_path)
    txt_path = return_txt_path(wav_path)
    # Close every transcript as soon as it is read instead of leaving the
    # handle to the garbage collector.
    with open(txt_path) as txt_file:
        target, original = convert_txt_index(txt_file.read().strip())
    inputs = convert_wav_mfcc(wav_path, 16000)
    normalize_inputs = (inputs - m)/s
    pack_data_npz(DIR, normalize_inputs, target, speaker_ID, original)

100% (4283 of 4283) |#####################| Elapsed Time: 0:10:39 Time: 0:10:39
