# Word2vec code notes using FastText

General notes on how to use fastText through its Python wrapper, found here: https://github.com/facebookresearch/fastText/tree/master/python. The command-line implementation can also be used; see https://fasttext.cc/docs/en/support.html. The command-line version also creates a .vec file, which is needed to view the embeddings in TensorBoard.

In [3]:
import fastText as ft

### Build a model

In [16]:
# Train an unsupervised skip-gram model on the text in 'test.txt'.
model = ft.FastText.train_unsupervised('test.txt', model='skipgram')

#### Get words used in model

In [21]:
# Display the list of words held by the trained model (see the output below).
model.get_words()

['</s>',
 'the',
 'is',
 'of',
 'a',
 '{\\partial',
 'Jacobian',
 '{\\displaystyle',
 '\\varphi',
 'function',
 '\\mathbf',
 'and',
 '{f}',
 'to',
 '\\theta',
 '{J}',
 'in',
 'matrix',
 '_{\\mathbf',
 'determinant',
 'f',
 'point',
 'The',
 '{x}',
 'that',
 'at',
 'if',
 '{p}',
 'as',
 'near',
 '=',
 'then',
 'by',
 'with',
 '→',
 '2',
 'Example',
 'x}{\\partial',
 'y}{\\partial',
 'r}}&{\\dfrac',
 'an',
 'its',
 'for',
 'this',
 'be',
 'p',
 'ℝn',
 'f_{1}}{\\partial',
 'This',
 'If',
 '{F}',
 '\\sin',
 'or',
 '}(\\mathbf',
 'derivative',
 'x_{1}}}&{\\dfrac',
 'linear',
 '{R}',
 '\\mathbb',
 '\\cos',
 'x_{2}}}&{\\dfrac',
 'In',
 'are',
 'given',
 'differentiable',
 'not',
 'coordinates',
 'transformation',
 'x',
 '1',
 'inverse',
 'z}{\\partial',
 'which',
 '}}&{\\dfrac',
 'y_{1}}{\\partial',
 ':',
 '×',
 'can',
 '&r\\cos',
 'x_{3}}}\\\\[1em]{\\dfrac',
 'y_{2}}{\\partial',
 '{\\mathbf',
 'square',
 'also',
 'y_{3}}{\\partial',
 '&-r\\sin',
 'y_{4}}{\\partial',
 'where',
 'invertible',


#### Get the vector representation of a word - the word doesn't have to be in the original model's vocabulary because of fastText's use of subwords

In [23]:
# Look up the embedding for a single word; the result is displayed as an array below.
model.get_word_vector('king')

array([-8.9638030e-05, -7.0428365e-04, -1.1684634e-03, -1.4596151e-03,
       -1.2494313e-03,  2.4219018e-03, -2.0084984e-03, -3.2501675e-03,
        1.6478845e-05,  1.4669150e-03, -2.8544320e-03, -2.0823495e-03,
       -5.5171753e-04, -2.6905767e-03, -1.4973406e-03,  2.5300994e-03,
        5.9868972e-04,  4.9233111e-04, -3.6145106e-03, -1.6231959e-03,
        1.0269432e-03,  1.6680023e-03,  7.2455866e-04, -1.2140825e-03,
        1.2267560e-03, -1.4175571e-03,  8.7878871e-04, -7.0040917e-04,
       -2.4098430e-03, -8.7106618e-04,  3.1986047e-04,  2.9065218e-04,
        7.3973241e-04, -1.8543201e-04, -3.6539481e-04, -8.5424253e-04,
        2.8047115e-03, -1.6290577e-03, -1.4359256e-03,  1.1007186e-03,
        4.5971574e-05,  1.7826334e-03, -3.4032650e-03, -5.1183969e-04,
       -2.5152395e-04,  1.5845661e-03,  1.7632304e-03,  4.3029650e-04,
        4.5361611e-04,  2.9376587e-03, -2.4226611e-03,  5.3946197e-04,
       -3.1688255e-03,  2.1293397e-04,  2.5133782e-03,  4.4310442e-04,
      

#### Save model

In [24]:
# Write the trained model to 'test_model.bin' so it can be reloaded in the next step.
model.save_model("test_model.bin")

#### Load model

In [25]:
# Reload the model from 'test_model.bin', rebinding `model` to the loaded copy.
model = ft.FastText.load_model('test_model.bin')

The vector returned by the reloaded model should be the same as the one shown above.

In [28]:
# Sanity check: query the reloaded model for the same word and compare with the earlier output.
model.get_word_vector('king')

array([-8.9638030e-05, -7.0428365e-04, -1.1684634e-03, -1.4596151e-03,
       -1.2494313e-03,  2.4219018e-03, -2.0084984e-03, -3.2501675e-03,
        1.6478845e-05,  1.4669150e-03, -2.8544320e-03, -2.0823495e-03,
       -5.5171753e-04, -2.6905767e-03, -1.4973406e-03,  2.5300994e-03,
        5.9868972e-04,  4.9233111e-04, -3.6145106e-03, -1.6231959e-03,
        1.0269432e-03,  1.6680023e-03,  7.2455866e-04, -1.2140825e-03,
        1.2267560e-03, -1.4175571e-03,  8.7878871e-04, -7.0040917e-04,
       -2.4098430e-03, -8.7106618e-04,  3.1986047e-04,  2.9065218e-04,
        7.3973241e-04, -1.8543201e-04, -3.6539481e-04, -8.5424253e-04,
        2.8047115e-03, -1.6290577e-03, -1.4359256e-03,  1.1007186e-03,
        4.5971574e-05,  1.7826334e-03, -3.4032650e-03, -5.1183969e-04,
       -2.5152395e-04,  1.5845661e-03,  1.7632304e-03,  4.3029650e-04,
        4.5361611e-04,  2.9376587e-03, -2.4226611e-03,  5.3946197e-04,
       -3.1688255e-03,  2.1293397e-04,  2.5133782e-03,  4.4310442e-04,
      

### View embeddings

In [29]:
from tensorflow.contrib.tensorboard.plugins import projector
import tensorflow as tf
import numpy as np
import os


In [37]:
# From https://gist.github.com/nlothian/0cd4540389f7091717ece6f4b89b6604
#
# Convert a text embedding file into the artefacts the TensorBoard embedding
# projector reads:
#   * a metadata TSV with one label per embedding row, and
#   * a TensorFlow checkpoint holding the embedding matrix.
#
# Expected input layout (as parsed below): the first line is a header and is
# skipped; every following line is a label followed by whitespace-separated
# vector components.

meta_file = "g2x_metadata.tsv"
output_path = "projections"
vec_file = 'model.vec'

# The metadata file and the checkpoint are written inside output_path, so the
# directory has to exist before anything is opened for writing.
os.makedirs(output_path, exist_ok=True)

# Read the embedding file into a list of stripped lines. The encoding is
# explicit because the vocabulary can contain non-ASCII tokens and the
# platform default encoding is not guaranteed to be UTF-8.
with open(vec_file, 'r', encoding='utf-8') as embedding_file:
    embedding_content = [x.strip() for x in embedding_file]

num_lines = len(embedding_content) - 1  # skip the header
num_dims = len(embedding_content[1].split()) - 1  # -1 because of the label column
print("Detected dimensions:", num_lines, " X ", num_dims)

# One row per embedding line; rows for blank lines stay all-zero.
placeholder = np.zeros((num_lines, num_dims))

print(placeholder.shape)

# Debug counter kept from the original gist: row index + column index of the
# last component written into the matrix.
z = 0
with open(os.path.join(output_path, meta_file), 'w', encoding='utf-8') as file_metadata:
    for i, line in enumerate(embedding_content[1:]):  # skip the header line
        values = line.split()

        if not values:
            # A blank line has no label and no components. Emit a stand-in
            # label so metadata rows stay aligned with matrix rows.
            file_metadata.write("<Empty Line>\n")
            continue

        vector = values[1:]  # skip the label
        if vector:
            # Parse the whole row at once instead of element by element.
            placeholder[i, :len(vector)] = np.array(vector, dtype=float)
            z = i + len(vector) - 1

        file_metadata.write(values[0] + "\n")

print("z = ", z)

# define the model without training
sess = tf.InteractiveSession()

embedding = tf.Variable(placeholder, trainable=False, name='g2x_metadata')
tf.global_variables_initializer().run()

saver = tf.train.Saver()
writer = tf.summary.FileWriter(output_path, sess.graph)

# adding into projector
config = projector.ProjectorConfig()
embed = config.embeddings.add()
embed.tensor_name = 'g2x_metadata'
# Path is relative to the log directory, where the metadata file was written.
embed.metadata_path = meta_file

# Write the projector configuration next to the checkpoint so TensorBoard can
# associate the saved tensor with its metadata.
projector.visualize_embeddings(writer, config)
saver.save(sess, os.path.join(output_path, 'g2x_metadata.ckpt'))
# Flush the pending graph event to disk now instead of waiting for the
# writer's background flush.
writer.close()
print('Num nodes: {}'.format(num_lines))
print('Run `tensorboard --logdir={0} --port 8088 --host 0.0.0.0` to run visualize result on tensorboard'.format(output_path))


Detected dimensions: 11  X  100
(11, 100)
z =  109
Num nodes: 11
Run `tensorboard --logdir=projections --port 8088 --host 0.0.0.0` to run visualize result on tensorboard
