In [1]:
import os, logging
import time
import random
import tensorflow as tf
import numpy as np
from datetime import datetime
from configuration import get_config
from tensorflow.contrib import rnn

# Parsed run configuration; later cells override individual fields on this object.
config = get_config()

# Route all log records (DEBUG and above) to a file next to the notebook's working directory.
log_file = os.path.abspath("model-training.logs")
logging.basicConfig(filename=log_file, level=logging.DEBUG, format="%(asctime)s:%(levelname)s:%(message)s")
print(f'Log path: {log_file}')


Namespace(M=5, N=4, beta1=0.5, beta2=0.9, comment='', hidden=768, hop=0.01, iteration=100000, loss='softmax', lr=0.001, max_batch_utterances=1000, model_num=6, model_path='./tisv_model', nfft=512, noise_filenum=16, noise_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/noise', num_layer=3, optim='sgd', proj=256, restore=False, sr=8000, tdsv=False, tdsv_frame=80, test_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/test', tisv_frame=50, train=False, train_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/train', window=0.025)


In [4]:
# Output directory for this run: the training cell creates "Check_Point" and "logs" beneath it.
path = "/datadrive2/dalon/diarization-experiments/Speaker_Verification/tisv-complete"

# Directory listed and loaded by random_batch().
config.train_path = "/datadrive2/dalon/diarization-experiments/complete-dataset-vox12-libri-vtck-1000-utter/dataset"

# Batch geometry: N speakers per batch, M utterances per speaker.
config.N = 5
config.M = 5

# Optimisation schedule: base learning rate and total number of training iterations.
config.lr = 1e-3
config.iteration = 50_000_000

logging.info(f'N={config.N}, M={config.M}')

# Data batch

In [6]:
class GlobalVar:
    """Shared, mutable bookkeeping for sequential batch generation.

    The attributes live on the class itself (no instances are created) and are
    read and updated by random_batch() between calls.
    """
    # Cached listing of the training directory; filled on first use.
    dataset_file_list = []
    # Number of entries in dataset_file_list; 0 means "not listed yet".
    dataset_size = 0
    # Index into dataset_file_list of the first file of the next batch.
    start = 0
    # Count of completed passes over the file list.
    epoch = 0

In [7]:

def random_batch(speaker_num=config.N, utter_num=config.M, shuffle=True, noise_filenum=None, utter_start=0):
    """ Generate 1 batch by walking sequentially through the speaker files.

    The file list of config.train_path is cached in GlobalVar and consumed
    speaker_num files at a time. When the next batch would no longer fit, the
    epoch counter is incremented, the cursor is reset and the directory is
    re-listed in random order.

    speaker_num   : number of speaker files per batch (N)
    utter_num     : number of utterances taken from each speaker file (M)
    shuffle       : True  -> draw utter_num utterance indices at random (with replacement)
                    False -> take the contiguous slice starting at utter_start
    noise_filenum : unused; kept so existing callers keep working
    utter_start   : first utterance index when shuffle is False
    :return: 1 numpy batch (frames x batch(NM) x n_mels)
    :raises ValueError: if the training directory holds fewer than speaker_num files
    """
    path = config.train_path
    if GlobalVar.dataset_size == 0:
        GlobalVar.dataset_file_list = os.listdir(path)
        GlobalVar.dataset_size = len(GlobalVar.dataset_file_list)

    # Fail early with a clear message instead of producing a short batch that
    # only surfaces later as a shape mismatch when it is fed to the graph.
    if GlobalVar.dataset_size < speaker_num:
        raise ValueError(
            f'Need at least {speaker_num} speaker files in {path}, found {GlobalVar.dataset_size}')

    selected_files = GlobalVar.dataset_file_list[GlobalVar.start:GlobalVar.start+speaker_num]
    GlobalVar.start += speaker_num

    # Wrap only when the NEXT batch would run past the end of the list. A batch
    # ending exactly at the last file is still a full batch, so the comparison
    # is strict; with ">=" that final batch was dropped every epoch.
    if GlobalVar.start + speaker_num > GlobalVar.dataset_size:
        logging.info(f'Epoch {GlobalVar.epoch} completed at {str(datetime.utcnow().isoformat()[:-3])}!')
        GlobalVar.epoch += 1
        GlobalVar.start = 0
        # Re-list and shuffle; refresh the size so files added or removed
        # during training cannot make the sample size invalid.
        files = os.listdir(path)
        GlobalVar.dataset_file_list = random.sample(files, len(files))
        GlobalVar.dataset_size = len(files)

    utter_batch = []
    for file in selected_files:
        utters = np.load(os.path.join(path, file))        # load utterance spectrogram of selected speaker
        if shuffle:
            utter_index = np.random.randint(0, utters.shape[0], utter_num)   # select M utterances per speaker
            utter_batch.append(utters[utter_index])       # each speakers utterance [M, n_mels, frames] is appended
        else:
            utter_batch.append(utters[utter_start: utter_start+utter_num])

    utter_batch = np.concatenate(utter_batch, axis=0)     # utterance batch [batch(NM), n_mels, frames]

    # for train session, random slicing of input batch:
    # keep between tisv_frame-10 and tisv_frame-2 leading frames (upper bound is exclusive)
    frame_slice = np.random.randint(config.tisv_frame-10, config.tisv_frame-1)
    utter_batch = utter_batch[:,:,:frame_slice]

    utter_batch = np.transpose(utter_batch, axes=(2,0,1))     # transpose [frames, batch, n_mels]

    return utter_batch


In [8]:
def similarity(embedded, w, b, N=config.N, M=config.M, P=config.proj, center=None):
    """ Calculate similarity matrix from embedded utterance batch (NM x embed_dim) eq. (9)

    embedded : embedded utterances, reshaped here to [N, M, P]
    w, b     : scale and offset applied to the raw similarities (the absolute value of w is used)
    N, M, P  : speakers per batch, utterances per speaker, embedding size
    center   : optional enrollment centers; when given, every utterance is compared
               against center[i] for all i and no leave-one-out center is computed
    :return: tf similarity matrix (NM x N)
    """
    embedded_split = tf.reshape(embedded, shape=[N, M, P])

    def rowwise_dot(lhs, rhs):
        # Dot product along the embedding axis, kept as a column so the N
        # columns can be concatenated side by side.
        return tf.reduce_sum(lhs * rhs, axis=1, keepdims=True)

    if center is None:
        center = normalize(tf.reduce_mean(embedded_split, axis=1))              # [N,P] normalized center vectors eq.(1)
        # For each utterance: sum of its speaker's embeddings minus the utterance itself.
        center_except = normalize(tf.reshape(tf.reduce_sum(embedded_split, axis=1, keepdims=True)
                                             - embedded_split, shape=[N*M,P]))  # [NM,P] center vectors eq.(8)

        def column(i, j):
            # Own-speaker block uses the center that excludes the utterance itself;
            # every other block uses the plain speaker center.
            if i == j:
                return rowwise_dot(center_except[i*M:(i+1)*M, :], embedded_split[j, :, :])
            return rowwise_dot(center[i:(i+1), :], embedded_split[j, :, :])
    else:
        # If center(enrollment) exist, use it.
        def column(i, j):
            return rowwise_dot(center[i:(i+1), :], embedded_split[j, :, :])

    # make similarity matrix eq.(9): rows grouped by speaker j, one column per center i
    S = tf.concat([tf.concat([column(i, j) for i in range(N)], axis=1) for j in range(N)], axis=0)

    S = tf.abs(w)*S+b   # rescaling

    return S

def loss_cal(S, type="softmax", N=config.N, M=config.M):
    """ calculate loss with similarity matrix(S) eq.(6) (7)

    S    : similarity matrix (NM x N); rows are grouped by speaker, M rows per speaker
    type : "softmax" or "contrast"
    N, M : speakers per batch, utterances per speaker
    :return: scalar loss summed over all NM utterances
    :raises AssertionError: if type is neither "softmax" nor "contrast"
    """
    # Similarity of each utterance to its own speaker: column i of row block i.
    S_correct = tf.concat([S[i*M:(i+1)*M, i:(i+1)] for i in range(N)], axis=0)  # colored entries in Fig.1

    if type == "softmax":
        # 1e-6 keeps the logarithm's argument strictly positive.
        total = -tf.reduce_sum(S_correct-tf.log(tf.reduce_sum(tf.exp(S), axis=1, keepdims=True) + 1e-6))
    elif type == "contrast":
        S_sig = tf.sigmoid(S)
        # Zero out the own-speaker entries so the row maximum below is taken
        # over the other speakers only.
        S_sig = tf.concat([tf.concat([0*S_sig[i*M:(i+1)*M, j:(j+1)] if i==j
                              else S_sig[i*M:(i+1)*M, j:(j+1)] for j in range(N)], axis=1)
                             for i in range(N)], axis=0)
        total = tf.reduce_sum(1-tf.sigmoid(S_correct)+tf.reduce_max(S_sig, axis=1, keepdims=True))
    else:
        raise AssertionError("loss type should be softmax or contrast !")

    return total

def normalize(x):
    """ normalize the last dimension vector of the input matrix

    Divides x by the square root of (sum of squares along the last axis + 1e-6);
    the constant avoids a division by zero for all-zero vectors.
    :return: normalized input, same shape as x
    """
    return x/tf.sqrt(tf.reduce_sum(x**2, axis=-1, keepdims=True)+1e-6)

def optim(lr):
    """ Build the optimizer named by config.optim.

    lr : learning rate passed to the optimizer
    :return: tf optimizer ("sgd", "rmsprop" or "adam"; adam also takes config.beta1 / config.beta2)
    :raises AssertionError: for any other value of config.optim
    """
    kind = config.optim
    if kind == "sgd":
        return tf.train.GradientDescentOptimizer(lr)
    if kind == "rmsprop":
        return tf.train.RMSPropOptimizer(lr)
    if kind == "adam":
        return tf.train.AdamOptimizer(lr, beta1=config.beta1, beta2=config.beta2)
    raise AssertionError("Wrong optimizer type!")

# Model init done here

In [9]:


tf.reset_default_graph()    # reset graph so re-running this cell starts from an empty graph

# draw graph
# 40 is the hard-coded feature size of each frame (n_mel in the layout below).
batch = tf.placeholder(shape= [None, config.N*config.M, 40], dtype=tf.float32)  # input batch (time x batch x n_mel)
lr = tf.placeholder(dtype= tf.float32)  # learning rate
global_step = tf.Variable(0, name='global_step', trainable=False)
# Scale and offset applied to the similarity matrix.
w = tf.get_variable("w", initializer= np.array([10], dtype=np.float32))
b = tf.get_variable("b", initializer= np.array([-5], dtype=np.float32))

# embedding lstm (3-layer default)
with tf.variable_scope("lstm"):
    lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
    lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)    # define lstm op and variables
    outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)   # for TI-VS must use dynamic rnn
    embedded = outputs[-1]                            # the last output is the embedded d-vector
    embedded = normalize(embedded)                    # normalize
logging.info(f'embedded size: {embedded.shape}')

# loss
sim_matrix = similarity(embedded, w, b)
logging.info(f"similarity matrix size: {sim_matrix.shape}")
loss = loss_cal(sim_matrix, type=config.loss)

# optimizer operation
trainable_vars= tf.trainable_variables()                # get variable list
optimizer= optim(lr)                                    # get optimizer (type is determined by configuration)
# "grad_vars" rather than "vars", which would shadow the Python builtin for the rest of the notebook.
grads, grad_vars= zip(*optimizer.compute_gradients(loss))    # compute gradients of variables with respect to loss
grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)      # l2 norm clipping by 3
# smaller gradient scale for w, b -- selected by variable name so the rescaling
# does not silently hit other variables if the creation order ever changes
slow_var_names = {w.name, b.name}
grads_rescale= [0.01*grad if var.name in slow_var_names else grad
                for grad, var in zip(grads_clip, grad_vars)]
train_op= optimizer.apply_gradients(zip(grads_rescale, grad_vars), global_step= global_step)   # gradient update operation

# check variables memory
variable_count = np.sum(np.array([np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars]))
logging.info(f"total variables : {variable_count}")

# record loss
loss_summary = tf.summary.scalar("loss", loss)
merged = tf.summary.merge_all()
# Checkpoint retention is a constructor option: None keeps every checkpoint,
# whereas the default would retain only the 5 most recent ones.
saver = tf.train.Saver(max_to_keep=None)


Instructions for updating:
keep_dims is deprecated, use keepdims instead


# Training starts here

In [None]:
try:
    #___________Debug________________
    # config.iteration = 100000
    #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    logging.info(f'Training started at: {str(datetime.utcnow().isoformat()[:-3])}')
    # training session
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        # NOTE(review): exist_ok is disabled here, so this raises if a previous run already
        # created the folder -- presumably to avoid overwriting old checkpoints; confirm.
        os.makedirs(os.path.join(path, "Check_Point"))#, exist_ok=True)  # make folder to save model
        os.makedirs(os.path.join(path, "logs"), exist_ok=True)          # make folder to save log
        writer = tf.summary.FileWriter(os.path.join(path, "logs"), sess.graph)
        try:
            epoch = 0       # not used by the loop; the epoch that gets logged is GlobalVar.epoch
            lr_factor = 1   # lr decay factor ( 1/2 per 10000 iteration)
            loss_acc = 0    # accumulated loss ( for running average of loss)

            # "step" rather than "iter", which would shadow the Python builtin.
            for step in range(config.iteration):
                # run forward and backward propagation and update parameters
                _, loss_cur, summary = sess.run([train_op, loss, merged],
                                      feed_dict={batch: random_batch(), lr: config.lr*lr_factor})

                loss_acc += loss_cur    # accumulated loss for each 100 iteration

                if step % 10 == 0:
                    writer.add_summary(summary, step)   # write at tensorboard
                if (step+1) % 100 == 0:
                    logging.info("(epoch : %d) (iter : %d) loss: %.4f" % (GlobalVar.epoch, (step+1),loss_acc/100))
                    loss_acc = 0                        # reset accumulated loss
                if (step+1) % 10000 == 0: # decay at 10k
                    # Halve the learning rate unless that would take it below 1e-4.
                    # The value is passed through a %s placeholder: logging formats
                    # extra arguments into the message, and without a placeholder
                    # the record fails to format and is never written.
                    if config.lr*(lr_factor / 2) < 1e-4:
                        logging.info("learning rate not decaying : %s", config.lr*lr_factor)
                    else:
                        lr_factor /= 2                      # lr decay
                        logging.info("learning rate is decayed! current lr : %s", config.lr*lr_factor)
                if (step+1) % 10000 == 0:
                    # Saver.save() has no max_to_keep argument (passing it raises TypeError);
                    # how many checkpoints are retained is decided when the Saver is constructed.
                    saver.save(sess, os.path.join(path, "./Check_Point/model.ckpt"), global_step=step//10000)
                    logging.info("model is saved!")
        finally:
            # Flush pending summaries and release the event file even if training fails.
            writer.close()
    logging.info(f'Training ended at: {str(datetime.utcnow().isoformat()[:-3])}')
except Exception as e:
    # Record the traceback in the log file, then re-raise so the failure is also
    # visible in the notebook instead of the cell appearing to finish normally.
    logging.exception(e)
    raise