In [8]:
import os

import gensim
import numpy as np
import tensorflow as tf
from gensim.models import Word2Vec, KeyedVectors
from tensorflow.contrib.tensorboard.plugins import projector

Train a word embedding from scratch

In [2]:
# Read the movie-lines corpus into memory, one string per line.
# errors='ignore' silently drops bytes that are not valid UTF-8.
corpus_path = './Data/cornell movie-dialogs corpus/movie_lines.txt'
with open(corpus_path, encoding='utf-8', errors='ignore') as corpus_file:
    data = list(corpus_file)

In [4]:
# Every corpus line is a '+++$+++'-separated record; field 2 is the movie id,
# field 3 the character name and field 4 the utterance text. Only the
# utterance is needed for training, so nothing else is kept.
data_array = []
for line in data:
    fields = line.split('+++$+++')
    if len(fields) < 5:
        # Malformed or blank line: skip it instead of raising IndexError.
        continue
    # strip() removes the padding space after the separator and the line
    # terminator, without cutting a real character off a final line that
    # has no trailing newline.
    data_array.append(fields[4].strip())



In [9]:

def retrieve_data(data_array):
    """Lazily tokenize a collection of utterances.

    For each item of ``data_array``, in order, yields the result of
    ``gensim.utils.simple_preprocess`` applied to that item.
    """
    yield from (gensim.utils.simple_preprocess(text) for text in data_array)

In [10]:
# Materialise the generator: the token lists are iterated more than once below
# (model construction and the explicit train() calls).
utterances = [tokens for tokens in retrieve_data(data_array)]

In [11]:
# Build a 300-dimensional Word2Vec model from the tokenized utterances.
# Passing the corpus to the constructor builds the vocabulary and trains
# immediately.
model = Word2Vec(
    utterances,
    size=300,
    window=10,
    min_count=2,
    workers=10,
    iter=10,
)

In [12]:
# Number of words in the model's vocabulary. len() works on the vocab mapping
# directly, so there is no need to copy every key into a temporary list.
len(model.wv.vocab)

30326

Continue training the model for 10 more epochs (the constructor above already trained it once on the same corpus)

In [13]:
# Run ten more passes over the same tokenized corpus.
model.train(
    utterances,
    total_examples=len(utterances),
    epochs=10,
)

(22218640, 30251270)

Let's now check the similarity measure

In [24]:
# The ten vocabulary words whose vectors are closest to that of "good".
model.wv.most_similar(positive=['good'], topn=10)

[('bad', 0.5045223236083984),
 ('tough', 0.4259261190891266),
 ('beginner', 0.3719980716705322),
 ('quick', 0.3636906147003174),
 ('rough', 0.362427681684494),
 ('chipper', 0.3558504581451416),
 ('smart', 0.3500877022743225),
 ('nice', 0.34913361072540283),
 ('lucky', 0.34018972516059875),
 ('clever', 0.3382002115249634)]

Train a word embedding starting from a pretrained embedding, such as the Google News word2vec vectors

In [22]:
# The pretrained vectors are loaded as KeyedVectors rather than as a full
# Word2Vec model: when the full model state is no longer needed (no further
# training), the vectors and their vocabulary are all that has to be kept.
googl_model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [106]:
# Second model on the same corpus; this is the one that later receives the
# pretrained GoogleNews vectors.
model3 = Word2Vec(
    utterances,
    size=300,
    window=10,
    min_count=2,
    iter=10,
)

In [107]:
# Number of words in model3's vocabulary. len() works on the vocab mapping
# directly, so there is no need to copy every key into a temporary list.
len(model3.wv.vocab)

30326

In [111]:
# Example count currently recorded on model3.
total_examples = model3.corpus_count
# googl_model is already a KeyedVectors object, so its vocabulary is read
# directly. Going through the `.wv` alias on KeyedVectors is deprecated in
# gensim 3.x and emits a DeprecationWarning.
google_vocab = list(googl_model.vocab)

  from ipykernel import kernelapp as app


In [110]:
# Offer every word of the Google vocabulary to model3 as one single "sentence"
# so that the vocabulary can be extended (update=True).
# NOTE(review): the vocabulary size displayed before and after this call is
# identical (30326), so no word actually appears to be added -- presumably
# because each Google word occurs exactly once here while model3 was created
# with min_count=2. Confirm whether extending the vocabulary was intended.
model3.build_vocab([google_vocab], update=True)

In [112]:
# Vocabulary size after the update attempt. len() works on the vocab mapping
# directly, so there is no need to copy every key into a temporary list.
len(model3.wv.vocab)

30326

In [114]:
# For words model3 already has in its vocabulary, replace their vectors with
# the pretrained GoogleNews ones. lockf=1.0 keeps the imported vectors
# trainable in the fine-tuning step that follows.
model3.intersect_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    lockf=1.0,
    binary=True,
)

In [128]:
# Fine-tune on the movie-dialogue utterances for model3.epochs passes.
model3.train(
    utterances,
    total_examples=len(utterances),
    epochs=model3.epochs,
)

(22218367, 30251270)

In [129]:
# Persist the complete model object to disk.
model3.save("wordvec_from_pretrained_model")

In [130]:
# Persist only the word vectors (model3.wv) under a separate file name.
model3.wv.save("wordvec_from_pretrained_model_vectors")

In [148]:

# Number of words in model3's vocabulary.
max_size = len(model3.wv.vocab)

In [149]:
# Zero-filled matrix with one row per vocabulary word and one column per
# embedding dimension; the rows are filled in afterwards.
embedding_shape = (max_size, model3.trainables.layer1_size)
w2v = np.zeros(embedding_shape)

In [153]:
# Fill w2v row by row and write one word per line to metadata.tsv, both in
# index2word order, so that line i of the file labels row i of the matrix.
# UTF-8 is requested explicitly: the corpus was read as UTF-8 and the platform
# default encoding may not be able to represent every token. Plain 'w' is
# enough because the file is never read back here.
with open("metadata.tsv", 'w', encoding='utf-8') as file_metadata:
    for i, word in enumerate(model3.wv.index2word):
        w2v[i] = model3.wv[word]
        file_metadata.write(word + '\n')

In [154]:
# Display the filled embedding matrix as the cell output.
w2v

array([[ 4.65494841e-01,  7.94879913e-01,  5.16691506e-01, ...,
         3.32302094e-04,  4.19694543e-01,  6.39392316e-01],
       [-3.50675792e-01,  5.46539962e-01,  1.80310094e+00, ...,
         1.13130677e+00,  7.12191686e-02, -6.13567650e-01],
       [-4.86572176e-01,  1.24483027e-01,  1.95161760e+00, ...,
        -7.97282100e-01, -2.88278282e-01, -4.87563998e-01],
       ...,
       [-8.84071067e-02,  9.88474488e-03, -6.01203367e-03, ...,
        -5.69259096e-03,  2.21646484e-02, -1.79266959e-01],
       [ 7.37095550e-02, -7.65446872e-02, -3.03474814e-01, ...,
        -9.06900782e-03, -5.76125570e-02,  3.81703228e-01],
       [ 1.62402451e-01, -8.63411725e-02, -5.71385771e-03, ...,
         5.73954545e-02,  9.82717350e-02, -1.89005882e-01]])

In [162]:
# Open a TensorFlow session for the cells that follow.
sess = tf.InteractiveSession()

In [163]:
# Create a 2-D, non-trainable variable that holds the embeddings, pinned to
# the CPU.
with tf.device("/cpu:0"):
    embedding = tf.Variable(initial_value=w2v, name='embedding', trainable=False)

In [165]:
# Initialise all graph variables (here: the embedding) in the open session.
sess.run(tf.global_variables_initializer())

In [166]:
# Output directory for the TensorBoard files (relative to the working directory).
path = 'tensorboard'

In [167]:
# Saver used to write the checkpoint.
saver = tf.train.Saver()

In [168]:
# Event-file writer for the output directory, carrying the session's graph.
writer = tf.summary.FileWriter(logdir=path, graph=sess.graph)

In [171]:
# Describe the embedding to the TensorBoard projector: which tensor to show
# and where the per-row labels are stored.
config = projector.ProjectorConfig()
embed = config.embeddings.add()
embed.tensor_name = 'embedding'
# metadata.tsv is written to the working directory, not to the log directory.
# TensorBoard resolves a relative metadata_path against the directory that
# holds the projector config, so a bare file name would not be found there.
# An absolute path is unambiguous.
embed.metadata_path = os.path.abspath('metadata.tsv')

In [174]:
# Write the projector configuration into the writer's log directory.
projector.visualize_embeddings(summary_writer=writer, config=config)

# Checkpoint the session's variables; the vocabulary size is used as the
# step suffix of the checkpoint file name.
saver.save(sess, save_path=path + '/model.ckpt', global_step=max_size)

'tensorboard/model.ckpt-30326'