In [96]:
import numpy as np
import tensorflow as tf
import scipy.io
from sklearn import preprocessing
from sklearn import cross_validation

In [37]:
fileidxJW11 = scipy.io.loadmat("XRMB/DATA/FILEIDX/fileidxJW11.mat")
fileidxJW13 = scipy.io.loadmat("XRMB/DATA/FILEIDX/fileidxJW13.mat")
fileidxJW24 = scipy.io.loadmat("XRMB/DATA/FILEIDX/fileidxJW24.mat")
fileidxJW30 = scipy.io.loadmat("XRMB/DATA/FILEIDX/fileidxJW30.mat")

JW11 = scipy.io.loadmat("XRMB/DATA/MAT/JW11[numfr1=7,numfr2=7].mat")
JW13 = scipy.io.loadmat("XRMB/DATA/MAT/JW13[numfr1=7,numfr2=7].mat")
JW24 = scipy.io.loadmat("XRMB/DATA/MAT/JW24[numfr1=7,numfr2=7].mat")
JW30 = scipy.io.loadmat("XRMB/DATA/MAT/JW30[numfr1=7,numfr2=7].mat")

In [195]:
print JW11.keys()

scaler = preprocessing.StandardScaler()
mfcc_features = scaler.fit_transform(preprocessing.normalize(np.transpose(JW11['MFCC'])))
articulatory_features = scaler.fit_transform(preprocessing.normalize(np.transpose(JW11['X']).astype(float)))
phone_labels = np.transpose(JW11['P'][0])

lb = preprocessing.LabelBinarizer()
lb.fit(phone_labels)
binarized_labels = lb.transform(phone_labels)

n_samples = mfcc_features.shape[0]
n_mfcc_features = mfcc_features.shape[1]
n_articulatory_features = articulatory_features.shape[1]

permutation = np.random.permutation(n_samples)
X1 = np.asarray([mfcc_features[i] for i in permutation])
X2 = np.asarray([articulatory_features[i] for i in permutation])
Y = np.asarray([binarized_labels[i] for i in permutation])

train, dev, test = 15948, 25948, 40948 #use 25948, 40948, 50948

X1_tr = X1[1:train, :]
X1_dev = X1[train+1:dev, :]
X1_test = X1[dev+1:test, :]

X2_tr = X2[1:train, :]

Y_tr = Y[1:train, :]
Y_dev = Y[train+1:dev, :]
Y_test = Y[dev+1:test, :]

baseline_acoustic_tr = X1_tr[:, 118:156]
baseline_acoustic_dev = X1_dev[:, 118:156]
baseline_acoustic_test = X1_test[:, 118:156]
    
print X1_tr.shape
print X2_tr.shape
print Y_tr.shape
print baseline_acoustic_tr.shape

['bigindices', 'Valid_Files', 'indices', 'MFCC', 'Phones', 'bigP', '__header__', '__globals__', 'P', 'bigPhones', 'Words', 'frame_locs', 'Frames', 'X', '__version__', 'SpeakerID', 'Times']
(15947, 273)
(15947, 112)
(15947, 39)
(15947, 38)


In [None]:
def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01))

def model(X, w):
    return tf.matmul(X, w)

X = tf.placeholder("float", [None, 273])
Y = tf.placeholder("float", [None, 39])

w = init_weights([273, 39])

py_x = model(X, w)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(py_x, Y))
train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
predict_op = tf.argmax(py_x, 1)

sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)

for i in range(100):
    for start, end in zip(range(0, len(X1_tr), 100), range(n_mfcc_features, len(X1_tr), 100)):
        sess.run(train_op, feed_dict = {X: X1_tr[start:end], Y: Y_tr[start:end]})
    print i, np.mean(np.argmax(Y_dev, axis=1) == sess.run(predict_op, feed_dict = {X: X1_dev, Y: Y_dev}))

In [205]:
def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01))

def lr_model(X, baseline, w_h, w_o):
    final = tf.concat(1, [X, baseline])
    h = tf.nn.sigmoid(tf.matmul(final, w_h))
    return tf.matmul(h, w_o)

def encoder(X, enc1_w, enc1_b, enc2_w, enc2_b, enc3_w, enc3_b, enc4_w, enc4_b, p_keep_input, p_keep_hidden):
    X = tf.nn.dropout(X, p_keep_input)
    h1 = tf.nn.relu(tf.matmul(X, enc1_w) + enc1_b)
    
    h1 = tf.nn.dropout(h1, p_keep_hidden)
    h2 = tf.nn.relu(tf.matmul(h1, enc2_w) + enc2_b)
    
    h2 = tf.nn.dropout(h2, p_keep_hidden)
    h3 = tf.nn.relu(tf.matmul(h2, enc3_w) + enc3_b)
    
    h3 = tf.nn.dropout(h3, p_keep_hidden)
    return tf.matmul(h3, enc4_w) + enc4_b
    
def decoder_X1(h4, dec1_w, dec1_b, dec2_w, dec2_b, dec3_w, dec3_b, dec4_w, dec4_b, p_keep_hidden):
    h4 = tf.nn.dropout(h4, p_keep_hidden)
    h3 = tf.nn.relu(tf.matmul(h4, dec4_w) + dec4_b)
    
    h3 = tf.nn.dropout(h3, p_keep_hidden)
    h2 = tf.nn.relu(tf.matmul(h3, dec3_w) + dec3_b)
    
    h2 = tf.nn.dropout(h2, p_keep_hidden)
    h1 = tf.nn.relu(tf.matmul(h2, dec2_w) + dec2_b)
    
    h1 = tf.nn.dropout(h1, p_keep_hidden)
    return tf.matmul(h1, dec1_w) + dec1_b

def decoder_X2(h4, dec4_w, dec4_b, p_keep_hidden):
    h4 = tf.nn.dropout(h4, p_keep_hidden)
    return tf.matmul(h4, dec4_w) + dec4_b

# def decoder_X2(h4, dec1_w, dec1_b, dec2_w, dec2_b, dec3_w, dec3_b, dec4_w, dec4_b, p_keep_hidden):
#     h4 = tf.nn.dropout(h4, p_keep_hidden)
#     h3 = tf.nn.sigmoid(tf.matmul(h4, dec4_w) + dec4_b)
    
#     h3 = tf.nn.dropout(h3, p_keep_hidden)
#     h2 = tf.nn.sigmoid(tf.matmul(h3, dec3_w) + dec3_b)
    
#     h2 = tf.nn.dropout(h2, p_keep_hidden)
#     h1 = tf.nn.sigmoid(tf.matmul(h2, dec2_w) + dec2_b)
    
#     h1 = tf.nn.dropout(h1, p_keep_hidden)
#     return tf.nn.sigmoid(tf.matmul(h1, dec1_w) + dec1_b)

X1 = tf.placeholder("float", [None, 273])
X2 = tf.placeholder("float", [None, 112])
Y = tf.placeholder("float", [None, 39])

enc1_w = init_weights([273, 300])
enc2_w = init_weights([300, 150])
enc3_w = init_weights([150, 100])
enc4_w = init_weights([100, 50])

enc1_b = init_weights([1, 300])
enc2_b = init_weights([1, 150])
enc3_b = init_weights([1, 100])
enc4_b = init_weights([1, 50])

dec1_w_x1 = init_weights([300, 273])
dec2_w_x1 = init_weights([150, 300])
dec3_w_x1 = init_weights([100, 150])
dec4_w_x1 = init_weights([50, 100])

# dec1_w_x2 = init_weights([90, 112])
# dec2_w_x2 = init_weights([65, 90])
# dec3_w_x2 = init_weights([65, 65])
dec4_w_x2 = init_weights([50, 112])

dec1_b_x1 = init_weights([1, 273])
dec2_b_x1 = init_weights([1, 300])
dec3_b_x1 = init_weights([1, 150])
dec4_b_x1 = init_weights([1, 100])

# dec1_b_x2 = init_weights([1, 112])
# dec2_b_x2 = init_weights([1, 90])
# dec3_b_x2 = init_weights([1, 65])
dec4_b_x2 = init_weights([1, 112])

lr_w = init_weights([88, 50])
lr_w_o = init_weights([50, 39])
# lr_b = init_weights([1, 50])
# lr_b_o = init_weights([1, 39])

p_keep_input = tf.placeholder("float")
p_keep_hidden = tf.placeholder("float")

construction_x1 = encoder(X1, enc1_w, enc1_b, enc2_w, enc2_b, enc3_w, enc3_b, enc4_w, enc4_b, p_keep_input, p_keep_hidden)
reconstruction_x1 = decoder_X1(construction_x1, dec1_w_x1, dec1_b_x1, dec2_w_x1, dec2_b_x1, dec3_w_x1, dec3_b_x1, dec4_w_x1, dec4_b_x1, p_keep_hidden)
# reconstruction_x2 = decoder_X2(construction_x1, dec1_w_x2, dec1_b_x2, dec2_w_x2, dec2_b_x2, dec3_w_x2, dec3_b_x2, dec4_w_x2, dec4_b_x2, p_keep_hidden)
reconstruction_x2 = decoder_X2(construction_x1, dec4_w_x2, dec4_b_x2, p_keep_hidden)

ae_cost = tf.nn.l2_loss(reconstruction_x1 - X1) + tf.nn.l2_loss(reconstruction_x2 - X2)
ae_train_op = tf.train.RMSPropOptimizer(1e-4, 0.9).minimize(ae_cost)
#ae_train_op = tf.train.AdamOptimizer(0.001).minimize(ae_cost)

baseline = tf.placeholder("float", [None, 38])
py_x = lr_model(construction_x1, baseline, lr_w, lr_w_o)
lr_cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(py_x, Y))
lr_train_op = tf.train.AdamOptimizer(0.001).minimize(lr_cost)

predict_op = tf.argmax(py_x, 1)

sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)

num_epochs = 500

print "autoencoder training: "
for i in range(num_epochs):
    for start, end in zip(range(0, len(X1_tr), 100), range(100, len(X1_tr), 100)):
        sess.run(ae_train_op, feed_dict = {X1: X1_tr[start:end], X2: X2_tr[start:end], Y: Y_tr[start:end], p_keep_input: 0.8, p_keep_hidden: 0.5})
    #print "Epoch ", i, ". Cost = ", sess.run(ae_cost, feed_dict = {X1: X1_tr, X2: X2_tr, Y: Y_tr, p_keep_input: 1.0, p_keep_hidden: 1.0})
    #print "" + str(i) + ", ",
print "done training autoencoder"
        
print "logistic regression training: "
for i in range(500):
    for start, end in zip(range(0, len(X1_tr), 100), range(100, len(X1_tr), 100)):
        sess.run(lr_train_op, feed_dict = {X1: X1_tr[start:end], Y: Y_tr[start:end], baseline: baseline_acoustic_tr[start:end], p_keep_input: 1.0, p_keep_hidden: 1.0})
    #print "Epoch ", i, ". cost = ", sess.run(lr_cost, feed_dict = {X1: X1_tr[start:end], Y: Y_tr[start:end], baseline: baseline_acoustic_tr[start:end], p_keep_input: 1.0, p_keep_hidden: 1.0})
    print i, np.mean(np.argmax(Y_dev, axis=1) == sess.run(predict_op, feed_dict = {X1: X1_dev, Y: Y_dev, baseline: baseline_acoustic_dev, p_keep_input: 1.0, p_keep_hidden: 1.0}))

 autoencoder training: 
done training autoencoder
logistic regression training: 
0 0.268126812681
1 0.343834383438
2 0.425442544254
3 0.46904690469
4 0.500050005001
5 0.5199519952
6 0.539453945395
7 0.552055205521
8 0.57095709571
9 0.579657965797
10 0.603860386039
11 0.614561456146
12 0.624862486249
13 0.640164016402
14 0.637063706371
15 0.641664166417
16 0.641864186419
17 0.663666366637
18 0.661666166617
19 0.670067006701
20 0.67596759676
21 0.686668666867
22 0.687068706871
23 0.687268726873
24 0.688768876888
25 0.686668666867
26 0.705370537054
27 0.699669966997
28 0.693069306931
29 0.694769476948
30 0.700670067007
31 0.69496949695
32 0.706170617062
33 0.703870387039
34 0.705870587059
35 0.708770877088
36 0.711771177118
37 0.704370437044
38 0.707170717072
39 0.708370837084
40 0.712871287129
41 0.701270127013
42 0.708470847085
43 0.707170717072
44 0.704170417042
45 0.716471647165
46 0.712571257126
47 0.719671967197
48 0.701270127013
49 0.70797079708
50 0.725472547255
51 0.715171517152
