In [12]:
import sys
import numpy as np
from python_speech_features import mfcc
from python_speech_features import logfbank
import python_speech_features
import scipy.io.wavfile as wav
import os
import re

In [13]:
# Phone label table. A phone's class id is its position in this array, so the
# order and contents must stay stable for previously exported label files.
# NOTE(review): 'tck' does not look like a TIMIT symbol (the closures are
# bcl/dcl/gcl/pcl/tcl/kcl) -- probably a typo that now occupies an unused slot;
# removing it would shift every later class id, so it is left in place.
# NOTE(review): the trailing 'h' is presumably what np.loadtxt yields for
# TIMIT's 'h#' marker ('#' starts a comment by default) -- confirm. The
# meaning of '1' and '2' cannot be determined from this notebook.
phone_classes = np.array(['b', 'd', 'g', 'p', 't', 'k', 'dx', 'q', 
                 'jh', 'ch', 's', 'sh', 'z', 'zh', 'f', 'th', 'v', 'dh',
                 'm', 'n', 'ng', 'em', 'en', 'eng', 'nx',
                 'l', 'r', 'w', 'y', 'hh', 'hv', 'el',
                 'iy', 'ih', 'eh', 'ey', 'ae', 'aa', 'aw', 
                 'ay', 'ah', 'ao', 'oy', 'ow', 'uh', 'uw',
                 'ux', 'er', 'ax', 'ix', 'axr', 'ax-h',
                 'pau', 'epi',  
                 'bcl', 'dcl', 'gcl', 'pcl', 'tck', 'kcl', 'tcl',
                 'h', '1', '2'])
# Parenthesised single-argument form prints the same on Python 2 and 3.
print(len(phone_classes))

# Constants for the character-level transcription targets.
SPACE_TOKEN = '<space>'
SPACE_INDEX = 0
FIRST_INDEX = ord('a') - 1  # 0 is reserved for space, so 'a' maps to 1

64


In [21]:
def normalize_mfcc(mfcc):
    """Standardise each feature column to zero mean and unit variance.

    :param mfcc: array of features; statistics are taken along axis 0.
    :returns: array of the same shape, ``(mfcc - mean) / std`` per column.
        Columns with zero standard deviation come back as all zeros instead
        of NaN/inf.
    """
    means = np.mean(mfcc, 0)
    stds = np.std(mfcc, 0)
    # A constant column has std == 0; dividing by 1 instead leaves the
    # already-centred zeros untouched rather than producing NaN.
    safe_stds = np.where(stds == 0, 1.0, stds)
    return (mfcc - means) / safe_stds

def add_noise(v, scale=0.6):
    """Return ``v`` plus zero-mean Gaussian noise; the input is not modified.

    :param v: array-like of values to perturb.
    :param scale: standard deviation of the noise (default 0.6, the value
        this notebook has always used).
    :returns: new array with the same shape as ``v``.

    The noise comes from numpy's global random state, so call
    ``np.random.seed`` beforehand if reproducible features are needed.
    """
    v = np.asarray(v)
    # Drawing directly in the target shape consumes the random stream in the
    # same order as drawing v.size values and reshaping.
    return v + np.random.normal(scale=scale, size=v.shape)

def delta(feat, N):
    """Compute delta features from a feature vector sequence.

    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
    :param N: For each frame, calculate delta features based on preceding and following N frames. Must be >= 1.
    :returns: A float numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
    :raises ValueError: if N < 1 (the normalising denominator would be zero).
    """
    if N < 1:
        raise ValueError('N must be an integer >= 1')
    # Work in float so integer input is not floor-divided under Python 2.
    feat = np.asarray(feat, dtype=float)
    NUMFRAMES = len(feat)
    if NUMFRAMES == 0:
        return feat

    denom = 2.0 * sum(i * i for i in range(1, N + 1))
    # Repeat the first/last frame N times so every frame has N neighbours on
    # each side; only the frame axis is padded.
    pad_width = [(N, N)] + [(0, 0)] * (feat.ndim - 1)
    padded = np.pad(feat, pad_width, mode='edge')

    # sum_{n=-N..N} n * x[t+n]  ==  sum_{n=1..N} n * (x[t+n] - x[t-n]),
    # evaluated for all frames at once with shifted slices.
    dfeat = np.zeros_like(feat)
    for n in range(1, N + 1):
        ahead = padded[N + n:N + n + NUMFRAMES]
        behind = padded[N - n:N - n + NUMFRAMES]
        dfeat += n * (ahead - behind)
    return dfeat / denom

def convert_mfcc(filename):
    """Load a wav file and build its noisy, normalised acoustic features.

    MFCCs and log filterbank energies are concatenated per frame, each column
    is standardised with ``normalize_mfcc``, and Gaussian noise is added with
    ``add_noise``, so the result differs between calls unless numpy's random
    state is seeded.

    :param filename: path of the wav file to read.
    :returns: array of shape (frames, 39).
    :raises AssertionError: if the combined feature width is not 39.
    """
    (rate, sig) = wav.read(filename)
    mfcc_feat = mfcc(sig, samplerate=rate)

    # Pass the file's real sample rate here too; without it logfbank falls
    # back to its own default rate and the two feature sets only line up for
    # files that happen to be recorded at that rate.
    fbank_feat = logfbank(sig, samplerate=rate)

    features = np.concatenate((mfcc_feat, fbank_feat), axis=1)
    features = normalize_mfcc(features)
    features = add_noise(features)

    # Expected width with the library defaults: 13 cepstra + 26 filterbank
    # energies.
    assert features.shape[1] == 39, \
        "expected 39 features per frame, got %d" % features.shape[1]
    return features

def read_phn(f):
    """Read a phone transcription file and return its class-id sequence.

    Each row of the file is parsed as ``start end phone``. A phone's class id
    is its index in the module-level ``phone_classes`` array; ids above 60
    (the last three table entries) are dropped from the result.

    :param f: file name or file object accepted by ``np.loadtxt``.
    :returns: 1-D ``np.uint8`` array of class ids in file order.
    :raises IndexError: if a phone is not listed in ``phone_classes``.
    """
    # Highest class id that is kept in the output.
    max_kept_class = 60

    # atleast_1d: loadtxt returns a 0-d array for a single-row file, which
    # cannot be iterated.
    rows = np.atleast_1d(np.loadtxt(
        f, dtype={'names': ('start', 'end', 'phone'),
                  'formats': (np.int32, np.int32, 'S4')}))

    # First occurrence wins, matching np.where(...)[0][0] on the table.
    class_index = {}
    for idx, name in enumerate(phone_classes.tolist()):
        class_index.setdefault(name, idx)

    labels = []
    for _, _, phone in rows:
        # 'S4' fields are bytes on Python 3 while the table holds str.
        if isinstance(phone, bytes) and not isinstance(phone, str):
            phone = phone.decode('ascii')
        if phone not in class_index:
            raise IndexError("phone %r is not listed in phone_classes" % (phone,))
        class_id = class_index[phone]
        if class_id <= max_kept_class:
            labels.append(class_id)
    return np.asarray(labels).astype(np.uint8)

def read_txt(target_filename):
    """Read a transcription file and encode it as character indices.

    Lines starting with ';' are skipped. Only runs of letters a-z (after
    lower-casing) are kept, so digits, punctuation and apostrophes act as
    word separators. If several lines contain words, the last one is used.

    :param target_filename: path of the text file to read.
    :returns: 1-D integer array where each letter is ``ord(c) - FIRST_INDEX``
        and words are separated by a single ``SPACE_INDEX``.
    :raises ValueError: if no line of the file contains any letters.
    """
    words = None
    with open(target_filename, 'r') as f:
        for line in f:
            if line.startswith(';'):
                continue
            found = re.findall("[a-z]+", line.strip().lower())
            # Ignore blank/word-less lines so a trailing empty line cannot
            # replace the real transcription with a lone space label.
            if found:
                words = found

    if words is None:
        raise ValueError("no transcription text found in %r" % (target_filename,))

    targets = []
    for position, word in enumerate(words):
        if position:
            targets.append(SPACE_INDEX)
        targets.extend(ord(ch) - FIRST_INDEX for ch in word)
    return np.asarray(targets)

In [22]:
# Dataset locations. The defaults are the original author's paths; set the
# TIMIT_ROOT / TIMIT_FEATURE_ROOT environment variables to run elsewhere
# without editing the notebook. Each path keeps its trailing separator.
_timit_root = os.environ.get("TIMIT_ROOT", "/home/zhihaol/TIMIT")
_feature_root = os.environ.get("TIMIT_FEATURE_ROOT", "/home/zhihaol/807/TIMIT")

train_root = os.path.join(_timit_root, "TRAIN", "")
test_root = os.path.join(_timit_root, "TEST", "")
train_target = os.path.join(_feature_root, "train", "")
test_target = os.path.join(_feature_root, "test", "")

In [23]:
# Export features and character targets for the training split.
root = train_root
target = train_target

# When True, only the first .PHN file that os.walk lists in each directory is
# exported, i.e. one utterance per directory, chosen in arbitrary listing
# order. This is what the loop has always done; set to False to export every
# utterance.
FIRST_UTTERANCE_ONLY = True

# np.save does not create missing directories.
for kind in ('mfcc', 'txt'):
    out_dir = os.path.join(target, kind)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

for subdir, dirs, files in os.walk(root):
    for f in files:
        if not f.endswith(".PHN"):
            continue
        abs_fn = os.path.join(subdir, f)
        print(abs_fn)

        # Sibling files share the stem: <stem>.wav (lower case) and <stem>.TXT.
        stem = os.path.splitext(abs_fn)[0]
        mfcc_feat = convert_mfcc(stem + ".wav")
        txt_target = read_txt(stem + ".TXT")

        # Output name is built from the last three path components:
        # <grandparent dir>_<parent dir>_<file stem>.
        parent_dir, utterance = os.path.split(stem)
        grandparent_dir, parent = os.path.split(parent_dir)
        new_name = '_'.join((os.path.basename(grandparent_dir), parent, utterance))

        # Features are stored transposed, i.e. (features, frames).
        np.save(os.path.join(target, 'mfcc', new_name), mfcc_feat.transpose())
        np.save(os.path.join(target, 'txt', new_name), txt_target)

        if FIRST_UTTERANCE_ONLY:
            break

/home/zhihaol/TIMIT/TRAIN/DR6/MBMA1/SI2207.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/MRMB0/SX51.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/FRJB0/SA2.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/MRXB0/SA1.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/MTXS0/SI1690.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/MESJ0/SA1.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/MSDS0/SX357.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/FSBK0/SX79.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/MSMR0/SX325.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/FHXS0/SX355.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/MKLN0/SI2228.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/MAJP0/SX354.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/FLAG0/SX294.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/MDRD0/SX32.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/FPAD0/SX446.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/MSAT1/SX353.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/FSGF0/SI1557.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/FTAJ0/SX249.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/MSVS0/SX308.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/FJDM2/SI2212.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/MMDB0/SX357.PHN
/home/zhihaol/TIMIT/TRAIN/DR6/FAPB0/SX

In [24]:
# Export a single utterance (DR2/FDNC0/SA1) from the training split.
abs_fn = os.path.join(train_root, "DR2", "FDNC0", "SA1.wav")
target = train_target
print(abs_fn)

# The wav and its transcription share the same stem.
stem = os.path.splitext(abs_fn)[0]
mfcc_feat = convert_mfcc(stem + ".wav")
txt_target = read_txt(stem + ".TXT")

# Output name is built from the last three path components:
# <grandparent dir>_<parent dir>_<file stem>.
parent_dir, utterance = os.path.split(stem)
grandparent_dir, parent = os.path.split(parent_dir)
new_name = '_'.join((os.path.basename(grandparent_dir), parent, utterance))

# np.save does not create missing directories.
for kind in ('mfcc', 'txt'):
    out_dir = os.path.join(target, kind)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

# Features are stored transposed, i.e. (features, frames).
np.save(os.path.join(target, 'mfcc', new_name), mfcc_feat.transpose())
np.save(os.path.join(target, 'txt', new_name), txt_target)

/home/zhihaol/TIMIT/TRAIN/DR2/FDNC0/SA1.wav


In [27]:
# Sanity check: np.mean(x, 0) averages over rows, giving one mean per column
# (the reduction normalize_mfcc relies on).
a = np.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(np.mean(a, 0))

[ 4.  5.  6.]
