In [1]:
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import bz2
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import nltk # standard preprocessing
import operator # sorting items in dictionary by value
#nltk.download() #tokenizers/punkt/PY3/english.pickle
from math import ceil

  from ._conv import register_converters as _register_converters


In [2]:
from Generating_Batches_of_Data_Skip_Gram_ import *
from data_download import *
from read_data import *
from Build_dictionary import *

# Download Data

In [3]:
url = 'http://www.evanjones.ca/software/'

filename = maybe_download(url,'wikipedia2text-extracted.txt.bz2', 18377035)

Found and verified wikipedia2text-extracted.txt.bz2


# Read data

In [4]:
# Without preprocessing 

words = read_data_without_preprocess(filename=filename)
print('Data size %d' % len(words))
print('Example words (start): ',words[:10])
print('Example words (end): ',words[-10:])

Data size 11631723
Example words (start):  ['Propaganda', 'is', 'a', 'concerted', 'set', 'of', 'messages', 'aimed', 'at', 'influencing']
Example words (end):  ['useless', 'for', 'cultivation', '.', 'and', 'people', 'have', 'sex', 'there', '.']


In [5]:
# with preprocess
words = read_data_with_preprocess(filename=filename)

print('Data size %d' % len(words))
print('Example words (start): ',words[:10])
print('Example words (end): ',words[-10:])

Reading data...
Data size 3360286
Example words (start):  ['propaganda', 'is', 'a', 'concerted', 'set', 'of', 'messages', 'aimed', 'at', 'influencing']
Example words (end):  ['favorable', 'long-term', 'outcomes', 'for', 'around', 'half', 'of', 'those', 'diagnosed', 'with']


# Building Dictionary

#Builds the following. To understand each of these elements, let us also assume the text "I like to go to school"
    
#dictionary: maps a string word to an ID (e.g. {I:0, like:1, to:2, go:3, school:4})
#reverse_dictionary: maps an ID to a string word (e.g. {0:I, 1:like, 2:to, 3:go, 4:school}
#count: List of list of (word, frequency) elements (e.g. [(I,1),(like,1),(to,2),(go,1),(school,1)]
#data : Contain the string of text we read, where string words are replaced with word IDs (e.g. [0, 1, 2, 3, 2, 4])


In [6]:
# we restrict our vocabulary size to 50000
vocabulary_size = 50000 

In [7]:
# Dictionary build is made of most common words.

data, count, dictionary, reverse_dictionary = build_dataset(words,vocabulary_size=vocabulary_size)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
del words  # Hint to reduce memory.

Most common words (+UNK) [['UNK', 69215], ('the', 226881), (',', 184013), ('.', 120944), ('of', 116323)]
Sample data [1721, 9, 8, 16471, 223, 4, 5165, 4456, 26, 11590]


# Generating Batches of Data for Skip-Gram
Generates a batch or target words (batch) and a batch of corresponding context words (labels). It reads 2(  window_size)+1 words at a time (called a span) and create 2*(window_size) datapoints in a single span. The function continue in this manner until batch_size datapoints are created. Everytime we reach the end of the word sequence, we start from beginning

In [8]:
print('data:', [reverse_dictionary[di] for di in data[:8]])
for window_size in [1, 2]:
    data_index = 0
    batch, labels = generate_batch_skip_gram(batch_size=8, window_size=window_size,data=data)
    print('\nwith window_size = %d:' %window_size)
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])




data: ['propaganda', 'is', 'a', 'concerted', 'set', 'of', 'messages', 'aimed']

with window_size = 1:
    batch: ['is', 'is', 'a', 'a', 'concerted', 'concerted', 'set', 'set']
    labels: ['propaganda', 'a', 'is', 'concerted', 'a', 'set', 'concerted', 'of']

with window_size = 2:
    batch: ['influencing', 'influencing', 'influencing', 'influencing', 'the', 'the', 'the', 'the']
    labels: ['aimed', 'at', 'the', 'opinions', 'at', 'influencing', 'opinions', 'or']


# Skip Gram

In [9]:
Batch_size = 16 
embedding_size = 128
window_size = 5
vocabulary_size = 50000 


# We pick a random validation set to sample nearest neighbors
valid_size = 16 # Random set of words to evaluate similarity on.
# We sample valid datapoints randomly from a large window without always being deterministic

valid_window = 50
# When selecting valid examples, we select some of the most frequent words as well as
# some moderately rare words as well
valid_examples = np.array(random.sample(range(valid_window), valid_size))
valid_examples = np.append(valid_examples,random.sample(range(1000, 1000+valid_window), valid_size),axis=0)

num_sampled = 32 # Number of negative examples to sample.

# Defining Placeholders

In [10]:
tf.reset_default_graph()


train_dataset = tf.placeholder(tf.int32, shape=[Batch_size])
train_labels = tf.placeholder(tf.int32, shape=[Batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# Net Architecture

In [11]:

embedding_space = tf.Variable(tf.random_uniform(shape=[vocabulary_size,embedding_size],minval=-1.0,maxval=1.0,dtype=tf.float32,name='Embedding_layer/Embedding_space'))

softmax_weight = tf.Variable(tf.truncated_normal(shape=[vocabulary_size,embedding_size],stddev=0.5 / math.sqrt(embedding_size)))

softmax_bias = tf.Variable(tf.random_uniform(shape=[vocabulary_size],minval=0.0,maxval=0.01))



# Embedding Lookup Step
#Looking for a particular embedding in embedding space (V * D matrix)

We first defing a lookup function to fetch the corresponding embedding vectors for a set of given inputs. With that, we define negative sampling loss function tf.nn.sampled_softmax_loss which takes in the embedding vectors and previously defined neural network parameter

In [12]:
embed = tf.nn.embedding_lookup(embedding_space, train_dataset)

# Negative Sampling Loss

In [13]:
loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=softmax_weight,biases=softmax_bias,inputs=embed,labels=train_labels,num_sampled=num_sampled,num_classes=vocabulary_size))

# Optimizer

In [14]:
optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

# Running Skip Gram

In [16]:
num_steps = 100001
skip_losses = []
# ConfigProto is a way of providing various configuration settings
# Tensorflow Automatically chooses the device
# required to execute the graph

batch_size =16


with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as session:
    # Initialize the variables in the graph
    tf.global_variables_initializer().run()
    print('Initialized')
    average_loss = 0

  # Train the Word2vec model for num_step iterations
    for step in range(num_steps):
        
        # Generate a single batch of data
        batch_data, batch_labels = generate_batch_skip_gram(
          batch_size, window_size,data=data)

        # Populate the feed_dict and run the optimizer (minimize loss)
        # and compute the loss
        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)

        # Update the average loss variable
        average_loss += l

        if (step+1) % 2000== 0 and step>0 :
            average_loss = average_loss / 2000
            skip_losses.append(average_loss)
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step+1, average_loss))
            average_loss = 0
        
    saver = tf.train.Saver()
    

    saver.save(sess=session,save_path = 'C:\\Users\\Gaurav_Gola\\Desktop\\Practice\\Word_2_vec\\Model_saved\\word_2vec_trained')
        


Initialized
Average loss at step 2000: 4.646618
Average loss at step 4000: 4.248596
Average loss at step 6000: 4.181313
Average loss at step 8000: 4.347678
Average loss at step 10000: 4.164783
Average loss at step 12000: 4.231609
Average loss at step 14000: 4.359489
Average loss at step 16000: 4.292942
Average loss at step 18000: 4.376451
Average loss at step 20000: 4.279770
Average loss at step 22000: 4.536325
Average loss at step 24000: 4.191920
Average loss at step 26000: 4.287468
Average loss at step 28000: 4.266125
Average loss at step 30000: 4.280929
Average loss at step 32000: 4.377006
Average loss at step 34000: 4.304320
Average loss at step 36000: 4.424525
Average loss at step 38000: 4.289006
Average loss at step 40000: 4.380775
Average loss at step 42000: 4.403068
Average loss at step 44000: 4.369575
Average loss at step 46000: 4.376791
Average loss at step 48000: 4.317907
Average loss at step 50000: 4.419596
Average loss at step 52000: 4.454362
Average loss at step 54000: 4.