In [4]:
import os
import math
import glob
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

In [11]:
import tools.processing as pre

# --- Hyperparameters and paths ----------------------------------------------
EPOCHS = 5                        # number of passes of the training loop
batch_size = 256                  # skip-gram pairs fed per training step
negative_samples = 32             # classes sampled by the NCE loss
embedding_dimension = 3           # width of every embedding vector
LOG_DIR = "logs/phone2vec-test"   # summaries, checkpoints and metadata go here

# --- Corpus -----------------------------------------------------------------
# The whole corpus is kept as a single "sentence"; line breaks become ";".
text = pre.get_text("data/phonem-rap-lyrics/phonem_all.txt")
sentences = [text.replace("\n", ";")]

# --- Vocabulary -------------------------------------------------------------
vocab = pre.Vocabulary(sentences[0])

# Lookup tables between tokens and their integer ids
index2word, word2index = vocab.index2word, vocab.word2index

vocabulary_size = len(index2word)
print("vocab_size: {} \n".format(vocabulary_size))

vocab_size: 85 



## Generate skip-gram pairs for all phonetics
### We have chosen a window size of 1 so that the embeddings do not pick up topical meaning. <br/> Each target word is therefore paired with each of its two neighbouring context words, giving two (target, context) tuples.

In [6]:
# Build [target_id, context_id] pairs with a context window of one token
skip_gram_pairs = []
for sentence in sentences:
    tokens = sentence.split()
    # Slide a three-token window over the sentence; the middle token is the
    # target, its left and right neighbours are the two contexts.
    for left, target, right in zip(tokens, tokens[1:], tokens[2:]):
        left_id = word2index[left]
        right_id = word2index[right]
        target_id = word2index[target]
        skip_gram_pairs.append([target_id, left_id])
        skip_gram_pairs.append([target_id, right_id])

def get_skipgram_batch(batch_size):
    """Draw a random mini-batch of skip-gram training pairs.

    Picks up to ``batch_size`` distinct entries of the module-level
    ``skip_gram_pairs`` list uniformly at random (fewer if the list is
    shorter than ``batch_size``).

    Args:
        batch_size: Maximum number of pairs to return.

    Returns:
        A tuple ``(x, y)``: ``x`` is a list holding the first element of each
        chosen pair, ``y`` is a list of single-element lists holding the
        second element of the same pair.
    """
    # Permuting an integer range gives a NumPy array that is shuffled in C.
    # Building and shuffling a Python list of every index on each call is
    # much slower and is repeated for every training step.
    batch = np.random.permutation(len(skip_gram_pairs))[:batch_size]
    x = [skip_gram_pairs[i][0] for i in batch]
    y = [[skip_gram_pairs[i][1]] for i in batch]
    return x, y

## Building the model

### We need to reset the default graph every time we want to retrain our model

In [None]:
# Clear the default graph so that rebuilding the model does not stack a second
# copy of every op and variable on top of the previous one
tf.reset_default_graph()

In [8]:
# Input data, labels: a batch of target ids and one context id per target
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

# Embedding matrix: one row of `embedding_dimension` values per vocabulary entry
with tf.name_scope("embeddings"):
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_dimension],
                          -1.0, 1.0), name='embedding')
    # This is essentially a lookup table: picks the rows for the batch's ids
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# Create variables for the NCE loss
nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_dimension],
                            stddev=1.0 / math.sqrt(embedding_dimension)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))


# Mean NCE loss over the batch, using `negative_samples` sampled classes
loss = tf.reduce_mean(
  tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, inputs=embed, labels=train_labels,
                 num_sampled=negative_samples, num_classes=vocabulary_size))
tf.summary.scalar("NCE_loss", loss)

# Learning rate decay: multiply the rate by 0.95 after every 1000 updates
global_step = tf.Variable(0, trainable=False)
learningRate = tf.train.exponential_decay(learning_rate=0.1,
                                          global_step=global_step,
                                          decay_steps=1000,
                                          decay_rate=0.95,
                                          staircase=True)
# global_step must be handed to minimize() so that every update increments it.
# Without it the counter stays at 0 and the learning rate never decays.
train_step = tf.train.GradientDescentOptimizer(learningRate).minimize(
    loss, global_step=global_step)

merged = tf.summary.merge_all()

saver = tf.train.Saver(keep_checkpoint_every_n_hours=0.5)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


## Starting the training

### In order to visualize our output we need to create the following: <br/> <ul><li>metadata</li><li>projectorConfiguration</li></ul>
#### We save two kinds of checkpoints. An "embeddings.ckpt" checkpoint is written after every epoch, so that progress is not lost if the run is forcibly stopped. <br/> The "final_embeddings.ckpt" checkpoint is written once, after all epochs have finished.

In [13]:
TRAIN = True

# Create the log directory before anything is written into it.
os.makedirs(LOG_DIR, exist_ok=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    train_writer = tf.summary.FileWriter(LOG_DIR, graph=tf.get_default_graph())
    try:
        # Labels for the TensorBoard projector: one row per vocabulary entry
        with open(os.path.join(LOG_DIR, 'metadata.tsv'), "w") as metadata:
            metadata.write('Name\tClass\n')
            for k, v in index2word.items():
                metadata.write('%s\t%d\n' % (v, k))

        # Resume from the most recent checkpoint in LOG_DIR, if there is one.
        # The restore goes through the `saver` built together with the graph,
        # so the values land in the very variables that are trained below.
        # Importing the .meta file instead would add a second copy of the
        # graph and leave the trained variables at their random init.
        latest_checkpoint = tf.train.latest_checkpoint(LOG_DIR)
        if latest_checkpoint:
            TRAIN = True # set this value to false, if you don't want to retrain the model
            saver.restore(sess, latest_checkpoint)
            print("Restoring an old model and training it further..")
        else:
            print("Building model from scratch!")

        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = embeddings.name
        # Link this tensor to its metadata file (e.g. labels).
        embedding.metadata_path = 'metadata.tsv'
        projector.visualize_embeddings(train_writer, config)

        if TRAIN:
            # epoch_steps = (int(len(skip_gram_pairs)/batch_size))
            epoch_steps = 1000
            for epoch in range(EPOCHS):
                print(f"\n\nepoch: {epoch}\n")

                for step in range(epoch_steps):
                    x_batch, y_batch = get_skipgram_batch(batch_size)
                    # Fetch the loss in the same run as the update; it is
                    # already computed for the summary, so this is free.
                    summary, loss_value, _ = sess.run(
                        [merged, loss, train_step],
                        feed_dict={train_inputs: x_batch,
                                   train_labels: y_batch})
                    # Count steps across epochs so the curves of later epochs
                    # do not overlap the first one in TensorBoard.
                    # TODO steps still restart from zero when a later run
                    #      resumes from a checkpoint.
                    train_writer.add_summary(summary, epoch * epoch_steps + step)

                    if step % 100 == 0:
                        print("Loss at %d/%d: %.5f" % (step, epoch_steps, loss_value))

                # Per-epoch checkpoint; the epoch number is appended to the name
                saver.save(sess, os.path.join(LOG_DIR, "embeddings.ckpt"), epoch)

        saver.save(sess, os.path.join(LOG_DIR, "final_embeddings.ckpt"))
    finally:
        # Flush pending summaries to disk even if training is interrupted
        train_writer.close()

    # Normalize embeddings before using: divide every row by its L2 norm
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    normalized_embeddings_matrix = sess.run(normalized_embeddings)

Building model from scratch!


epoch: 0

Loss at 0/1000: 41.19302
Loss at 100/1000: 10.40371
Loss at 200/1000: 4.51695
Loss at 300/1000: 3.83374
Loss at 400/1000: 3.72875
Loss at 500/1000: 3.82324
Loss at 600/1000: 4.29863
Loss at 700/1000: 3.78456
Loss at 800/1000: 3.39517
Loss at 900/1000: 3.93464


epoch: 1

Loss at 0/1000: 4.12607
Loss at 100/1000: 3.55881
Loss at 200/1000: 3.67067
Loss at 300/1000: 3.41221
Loss at 400/1000: 3.25622
Loss at 500/1000: 3.28257
Loss at 600/1000: 3.39348
Loss at 700/1000: 3.27252
Loss at 800/1000: 3.21166
Loss at 900/1000: 3.63068


epoch: 2

Loss at 0/1000: 3.74564
Loss at 100/1000: 3.19109
Loss at 200/1000: 3.24582
Loss at 300/1000: 3.28715
Loss at 400/1000: 3.30601
Loss at 500/1000: 2.87213
Loss at 600/1000: 3.46432
Loss at 700/1000: 3.20118
Loss at 800/1000: 3.81336
Loss at 900/1000: 3.28137


epoch: 3

Loss at 0/1000: 3.30725
Loss at 100/1000: 3.55253
Loss at 200/1000: 3.44056
Loss at 300/1000: 3.39185
Loss at 400/1000: 3.30659
Loss at 500/1000: 3

## Test the similarity of a given word using cosine similarity
### The 10 most similar words are displayed

In [17]:
# Row of the normalized embedding matrix that belongs to the query token
ref_word = normalized_embeddings_matrix[word2index["AY0"]]

# Every row was divided by its L2 norm, so the dot product with the query row
# is the cosine similarity to the query
cosine_dists = normalized_embeddings_matrix.dot(ref_word)

# Walk the ascending argsort backwards: the ten highest-scoring ids come first
ff = np.argsort(cosine_dists)[:-11:-1]
for f in ff:
    print(index2word[f], "\t", cosine_dists[f])

AY0 	 0.99999994
UW 	 0.9812705
AO2 	 0.9131582
EH 	 0.7681473
EY 	 0.7136153
N 	 0.713245
NG 	 0.6829777
G 	 0.672083
ER2 	 0.6545277
K 	 0.6018664


## Finally, the normalized embedding matrix is saved as a text file

In [19]:
# saving embedding matrix to file
# Each line has the form "<token> <v0> <v1> ..." (normalized vector components)
with open(os.path.join(LOG_DIR, "embedding.txt"), 'w') as f:
    for i in range(vocabulary_size):
        # Deliberately not called `embed`: that name is the embedding-lookup
        # tensor of the model graph and must not be overwritten here.
        vector = normalized_embeddings_matrix[i, :]
        word = index2word[i]
        f.write('%s %s\n' % (word, ' '.join(map(str, vector))))