In [1]:
import numpy as np
import math
import scipy.stats as stats
from scipy.io import wavfile
import time

In [2]:
"""Real audio data"""

# Path of the utterance to load, relative to the notebook's working directory.
wav_file = "../data/wav48/p225/p225_001.wav"

# wavfile.read returns the sample frequency (frames sampled per second)
# together with the array of samples.
sampFreq, snd = wavfile.read(wav_file)
frame_n = len(snd)                 # length of the first axis of snd
snd_len = 1. * frame_n / sampFreq  # frames / (frames per second) -> seconds

# Timestamp of every frame: index -> seconds -> milliseconds.  Same operations
# in the same order as before, written as one expression instead of three
# successive rebindings of the same name.
timeArray = (1. * np.arange(0, frame_n, 1) / sampFreq) * 1000

# Call form of print: identical output on Python 2, and it also parses on
# Python 3 (the bare `print sampFreq` statement does not).
print(sampFreq)

48000


In [5]:
"""Simulated data"""

# Simulated phoneme durations
phoneme_n = 10
d_lower, d_upper = 0., 100.  # clipping bounds of the duration distribution
d_mu, d_sigma = 40., 20.     # loc / scale handed to the truncated normal

# truncnorm takes its clipping bounds in units of sigma around the mean.
samples = stats.truncnorm.rvs(
    (d_lower - d_mu) / d_sigma,
    (d_upper - d_mu) / d_sigma,
    loc=d_mu,
    scale=d_sigma,
    size=phoneme_n,
)

# Share of the whole utterance taken by each phoneme (the shares sum to 1).
norm = samples / sum(samples)

# Two identical rows: the same sentence repeated twice.
# Seconds that each phoneme lasts.
duration_per_phoneme = np.array((norm * snd_len,) * 2)
# Whole frames that each phoneme contains (fractions are truncated by int()).
frames_per_phoneme = np.array((list(map(int, norm * frame_n)),) * 2)


In [6]:
"""Parameters"""

# Bucket-assignment parameters.
# Each frame is ensured to be 10 milliseconds.
min_frame_len = 10
num_buckets = 256

# Bucket boundaries are laid out in log space between these two limits.
asn_lower = np.log(min_frame_len / 1000.)
asn_upper = np.log(.95 * d_upper)
# Log-space step between consecutive boundaries; the span is divided into
# num_buckets - 2 steps.
inc = (asn_upper - asn_lower) / (num_buckets - 2)

# Wave file params
#samp_freq = 10

# Voiced-tag parameter: sample values strictly above this count as voiced.
voiced_thresh = 1000.

In [None]:
"""Simulating Functions"""
def generate_phonemes(max_len, vocab_size):
    """Draw a random phoneme-id sequence and zero-pad it to ``max_len``.

    Args:
        max_len: Length of the returned array.  The number of real phonemes is
            drawn uniformly from ``[0, max_len)``, so it may be 0.
        vocab_size: Phoneme ids are drawn uniformly from ``[0, vocab_size)``.

    Returns:
        Tuple ``(phonemes, phoneme_n)``: an integer array of length ``max_len``
        whose first ``phoneme_n`` entries are the sampled ids and whose
        remaining entries are 0, and the count ``phoneme_n`` itself.  Note that
        0 is also a valid sampled id, so padding can only be told apart from
        data through ``phoneme_n``.
    """
    phoneme_n = np.random.randint(max_len)
    phonemes = np.random.randint(vocab_size, size=phoneme_n)
    # Builtin int instead of the np.int alias: the alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    padding = np.zeros(max_len - phoneme_n, dtype=int)
    return np.concatenate((phonemes, padding)), phoneme_n

def generate_durations(max_len, phoneme_n, d_lower, d_upper, d_mu, d_sigma):
    """Sample ``phoneme_n`` truncated-normal durations, zero-padded to ``max_len``.

    The durations come from a normal distribution with mean ``d_mu`` and
    standard deviation ``d_sigma`` clipped to ``[d_lower, d_upper]``.  The
    returned float array has ``max_len`` entries; everything after the first
    ``phoneme_n`` is 0.
    """
    # truncnorm wants the clipping bounds in units of sigma around the mean.
    low_std = (d_lower - d_mu) / d_sigma
    high_std = (d_upper - d_mu) / d_sigma
    drawn = stats.truncnorm.rvs(low_std, high_std, loc=d_mu, scale=d_sigma, size=phoneme_n)

    padding = np.zeros((max_len - phoneme_n))
    return np.concatenate((drawn, padding))



def create_sentence(max_len,input_vocab_size,d_lower, d_upper, d_mu, d_sigma, asn_upper, asn_lower, num_buckets, batch_size):
    """Build one simulated sentence and replicate it ``batch_size`` times.

    Args:
        max_len: Padded sentence length, forwarded to ``generate_phonemes`` and
            ``generate_durations``.
        input_vocab_size: Size of the phoneme vocabulary to sample ids from.
        d_lower, d_upper, d_mu, d_sigma: Truncated-normal duration parameters,
            forwarded to ``generate_durations``.
        asn_upper, asn_lower, num_buckets: Bucket grid forwarded to
            ``assign_bucket``.
        batch_size: How many copies of the sentence the batch holds.

    Returns:
        dict with keys ``'phonemes'`` (list of ``batch_size`` padded id
        arrays), ``'phonemes_seq_len'`` (list of ``batch_size`` real lengths),
        ``'speaker_ids'`` (float array of ones, i.e. speaker id 1 for every
        row) and ``'durations'`` (bucket indices from ``assign_bucket``; the
        zero-padded tail of each row lands in bucket 0).
    """
    # speaker id = 1
    phonemes, phoneme_n = generate_phonemes(max_len, input_vocab_size)
    duration_per_phoneme = generate_durations(max_len, phoneme_n, d_lower, d_upper, d_mu, d_sigma)
    # The original call and dict literal each contained a stray ",," which made
    # the whole cell a SyntaxError; single commas restore the intended code.
    bucketed_durations = assign_bucket([duration_per_phoneme] * batch_size, asn_upper, asn_lower, num_buckets)
    sentence_dict = {'phonemes': [phonemes] * batch_size,
                     'phonemes_seq_len': [phoneme_n] * batch_size,
                     'speaker_ids': 1 * np.ones((batch_size)),
                     'durations': bucketed_durations
                     }
    return sentence_dict

In [44]:
"""Preprocessing Functions"""
def generate_phonemes1(max_len, vocab_size):
    """Draw exactly ten random phoneme ids from ``[0, vocab_size)``.

    ``max_len`` is accepted but not used: the count is fixed at 10 and, unlike
    a padded sequence, nothing is appended after the sampled ids.

    Returns:
        Tuple ``(ids, 10)`` where ``ids`` is the array of sampled phoneme ids.
    """
    fixed_count = 10
    ids = np.random.randint(vocab_size, size=fixed_count)
    return ids, fixed_count

def assign_bucket(durations, asn_upper, asn_lower, num_buckets):
    """Quantise durations into log-spaced bucket indices.

    The log-duration range ``[asn_lower, asn_upper]`` is split into
    ``num_buckets - 2`` steps.  Indices are laid out as:

    * ``0``                    - log-duration at or below ``asn_lower``
      (including zero / negative durations such as padding),
    * ``1 .. num_buckets - 2`` - in-range values, rounded up to the next step,
    * ``num_buckets - 1``      - log-duration above ``asn_upper``.

    Args:
        durations: Nested sequence of shape (batch_n, sentence_len).
        asn_upper: Upper limit of the grid, in log-duration.
        asn_lower: Lower limit of the grid, in log-duration.
        num_buckets: Total number of bucket indices; every returned index is in
            ``range(num_buckets)``.

    Returns:
        List of lists of int bucket indices with the same nesting as
        ``durations``.
    """
    inc = (asn_upper - asn_lower) / (num_buckets - 2)
    top_in_range = num_buckets - 2
    # The overflow bucket is the last valid index.  Returning num_buckets here
    # (as the code used to) is one past the end and leaves index
    # num_buckets - 1 unused.
    overflow_bucket = num_buckets - 1

    def assign(duration):
        # Zero / negative durations have no finite logarithm: send them to the
        # underflow bucket directly instead of through np.log, which would warn
        # on 0 and produce NaN (then a ValueError in int()) on negatives.
        if duration <= 0:
            return 0
        log_duration = np.log(duration)
        if log_duration < asn_lower:
            return 0
        if log_duration > asn_upper:
            return overflow_bucket
        bucket = int(math.ceil((log_duration - asn_lower) / inc))
        # Guard against floating-point round-off pushing a value that sits on
        # asn_upper into the overflow index.
        return min(bucket, top_in_range)

    return [[assign(d) for d in sentence] for sentence in durations]

def get_durations(bucket_durations, log_lower=None, log_inc=None):
    """Map bucket indices back to durations.

    Each index ``k`` is decoded as ``e ** (log_lower + k * log_inc)``.

    Args:
        bucket_durations: Nested sequence (batch_n, sentence_len) of bucket
            indices.
        log_lower: Log-duration that index 0 decodes to.  Defaults to the
            notebook-level ``asn_lower``.
        log_inc: Log-space step per bucket index.  Defaults to the
            notebook-level ``inc``.

    Returns:
        List of lists of durations with the same nesting as the input.
    """
    # Fall back to the notebook globals so existing one-argument calls behave
    # exactly as before; passing the grid explicitly removes the hidden
    # dependency on cell execution order.
    if log_lower is None:
        log_lower = asn_lower
    if log_inc is None:
        log_inc = inc

    def get(bucket):
        return np.e ** (log_lower + bucket * log_inc)

    return [[get(b) for b in sentence] for sentence in bucket_durations]


def tag_target_voiced(snd, threshold=None):
    """Tag every frame of ``snd`` as voiced (1) or unvoiced (0).

    A frame is voiced when its value is strictly greater than the threshold.
    The comparison is signed, so large negative values are tagged 0.

    Args:
        snd: Iterable of scalar frame values.
        threshold: Voicing threshold.  Defaults to the notebook-level
            ``voiced_thresh``.

    Returns:
        List of ints (0 or 1), one per frame.
    """
    # Fall back to the notebook global so existing one-argument calls behave
    # exactly as before.
    if threshold is None:
        threshold = voiced_thresh
    return [1 if frame > threshold else 0 for frame in snd]


def phonemes_to_frames(phonemes,frames_per_phoneme):
    """Upsample phoneme ids to frame level by repeating each id.

    ``frames_per_phoneme`` is given when training and is inferred during
    prediction.

    Args:
        phonemes: Phoneme ids, shape (batch_n, sentence_len); a single 1-D
            sentence is also accepted.
        frames_per_phoneme: Number of frames for each phoneme, same shape as
            ``phonemes``.  Values are converted to int (truncating).

    Returns:
        One flat list covering all sentences in order, in which every phoneme
        id appears as many times as its frame count.

    Raises:
        ValueError: If the two inputs do not have matching shapes.
    """
    ids = np.atleast_2d(np.asarray(phonemes))
    counts = np.atleast_2d(np.asarray(frames_per_phoneme))
    if ids.shape != counts.shape:
        raise ValueError(
            "phonemes and frames_per_phoneme must have the same shape, got "
            "%s and %s" % (ids.shape, counts.shape))
    # np.repeat does the per-phoneme expansion in one call.  It replaces the
    # np.ones(n, dtype=np.int) * phoneme loop, whose np.int alias was removed
    # in NumPy 1.24.  Row-major ravel keeps the sentence-by-sentence order.
    return list(np.repeat(ids.ravel(), counts.ravel().astype(int)))


"""
def tag_target_voiced(snd, frames_per_phoneme):
    # Check whether each frame is voiced
    # Arg durations should be real time durations
    snd = np.array(snd)
    start_f = 0
    for f_n in frames_per_phoneme:
        end_f = start_f + f_n
        phoneme = snd[start_f:end_f]
        if max(phoneme) >= voiced_thresh:
            snd[start_f:end_f] = 1
        else:
            snd[start_f:end_f] = 0
        start_f = end_f
    return snd
"""

'\ndef tag_target_voiced(snd, frames_per_phoneme):\n    # Check whether each frame is voiced\n    # Arg durations should be real time durations\n    snd = np.array(snd)\n    start_f = 0\n    for f_n in frames_per_phoneme:\n        end_f = start_f + f_n\n        phoneme = snd[start_f:end_f]\n        if max(phoneme) >= voiced_thresh:\n            snd[start_f:end_f] = 1\n        else:\n            snd[start_f:end_f] = 0\n        start_f = end_f\n    return snd\n'

In [45]:
# generate_phonemes1 returns exactly 10 ids, one per column of
# frames_per_phoneme.  generate_phonemes(200, 50) would return a 200-long
# zero-padded array that cannot be paired with the 10 frame counts.
phonemes = generate_phonemes1(200, 50)[0]
# The same sentence twice, matching the two rows of frames_per_phoneme.
frames = np.array(phonemes_to_frames([list(phonemes)] * 2, frames_per_phoneme))
# Call form of print: identical output on Python 2, and it also parses on Python 3.
print(frames)


[44 44 44 ...,  4  4  4]


[array([ 0.,  0.,  0.,  0.,  0.]), array([ 0.,  0.,  0.,  0.,  0.])]
