# Word2Vec Tensorflow

In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm as tqdm

  from ._conv import register_converters as _register_converters


In [2]:
corpus=['I love Banana juice','I love apple juice also','I love Orange juice also','Red is my fav color','I like black',
       'I like blue color','I hate fruits specially guava or mango and grapes','Pink is color for girls and black is color for boys',
       'green is color for Agriculture','white is color for peace','blue is color for hope']

In [3]:
def lower(corpus):
    corpus=[item.lower() for item in corpus]
    return corpus

In [4]:
corpus=lower(corpus)

In [5]:
def remove_stop_words(corpus):
    stop_words=['I','i','also','my','is','for','and','or']
    results = []
    for text in corpus:
        tmp = text.split(' ')
        for stop_word in stop_words:
            if stop_word in tmp:
                tmp.remove(stop_word)
        results.append(tmp)    
    return results

In [6]:
corpus=remove_stop_words(corpus)
corpus

[['love', 'banana', 'juice'],
 ['love', 'apple', 'juice'],
 ['love', 'orange', 'juice'],
 ['red', 'fav', 'color'],
 ['like', 'black'],
 ['like', 'blue', 'color'],
 ['hate', 'fruits', 'specially', 'guava', 'mango', 'grapes'],
 ['pink', 'color', 'girls', 'black', 'is', 'color', 'for', 'boys'],
 ['green', 'color', 'agriculture'],
 ['white', 'color', 'peace'],
 ['blue', 'color', 'hope']]

In [7]:
## Number of Unique words in Corpus
words={}
for wds in range(len(corpus)):
    for w in corpus[wds]:
        if w in words:
            words[w]=words[w]+1
        else:
            words[w]=1

In [8]:
## Creating List Related to the dictonary keys of words
merge_list=[]
for k,v in words.items():
    merge_list.append(k)
merge_list=sorted(merge_list)

In [9]:
## Dictionary with Words2Indexing
word2index={merge_list[i]:i for i in range(0,len(merge_list))}
##Index2Words
Index2Words = dict((v,k) for k,v in word2index.items())

In [10]:
### Change the character of strings into Number of index_words
text=[]
for i in range(len(corpus)):
    text.append([])
    for xx in corpus[i]:
        text[i].append(xx.replace(xx,str(word2index[xx])))
        
print('Integer Data',text[5],'\nCharater Data',corpus[5])   

Integer Data ['18', '4', '6'] 
Charater Data ['like', 'blue', 'color']


In [11]:
### Building N-Grams
mega_corpus=[]
for i in range(len(corpus)):
    for xx in corpus[i]:
        mega_corpus.append(xx)   

In [12]:
### Creating Only 2-Grams For Test purposes
n=2
N_GramModel=[mega_corpus[i:i+n] for i in range(len(mega_corpus)-1)]
## Only Considering First 5 Value
N_GramModel[0:5]

[['love', 'banana'],
 ['banana', 'juice'],
 ['juice', 'love'],
 ['love', 'apple'],
 ['apple', 'juice']]

In [13]:
data = []
WINDOW_SIZE = 2
for w in corpus:
    for word_index, word in enumerate(w):
        for nb_word in w[max(word_index - WINDOW_SIZE, 0) : min(word_index + WINDOW_SIZE, len(w)) + 1] : 
            if nb_word != word:
                data.append([word, nb_word])

In [14]:
df=pd.DataFrame(data,columns=['input','Label'])
df.head()

Unnamed: 0,input,Label
0,love,banana
1,love,juice
2,banana,love
3,banana,juice
4,juice,love


In [15]:
## Creating Input data 
Input=[]
for x in df.iloc[:,0]:
    Input.append(word2index[x])

X_train=[]
for k in Input:
    temp=np.zeros(len(word2index))
    temp[k]=1    
    X_train.append(temp)

In [16]:
## Creating label data 
label=[]
for x in df.iloc[:,1]:
    label.append(word2index[x])
    
Y_train=[]
for k in label:
    temp=np.zeros(len(word2index))
    temp[k]=1    
    Y_train.append(temp)

In [17]:
# convert them to numpy arrays
X_train = np.asarray(X_train)
Y_train = np.asarray(Y_train)

In [18]:
print('1)Both the shapes Array shold be Same','\n2)Column is the unique value present in the corpus','\n3)Rows are the length of dataframe after doing N_Gram')

print('Shape of Y_train is',Y_train.shape,'\nShape of X_Train is',X_train.shape)


1)Both the shapes Array shold be Same 
2)Column is the unique value present in the corpus 
3)Rows are the length of dataframe after doing N_Gram
Shape of Y_train is (94, 27) 
Shape of X_Train is (94, 27)


In [19]:
## None is their because we don't know the shape of X_train sometimes it us 1*vocab_size  or sometimes it is vocab_size*1
vocab_size=len(word2index) ## Always length of a Unique Dictionary
x=tf.placeholder(tf.float32, shape=(None,vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

In [20]:
## word-Embedding
Embed_dim=5
W1 = tf.Variable(tf.random_normal([vocab_size, Embed_dim],-1,1))
b1=tf.Variable(tf.random_normal([Embed_dim],-1,1))
hidden_matrix=tf.add((tf.matmul(x,W1)),b1)

## out-put layer after softmax
W2 = tf.Variable(tf.random_normal([Embed_dim,vocab_size],-1,1))
b2=tf.Variable(tf.random_normal([vocab_size],-1,1))

### Prediction using Loss Function
prediction=tf.nn.softmax((tf.add(tf.matmul(hidden_matrix,W2),b2)))


### Creating Cross-Entropy loss Function
loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), axis=[1]))

## Adam-optimizers
train_optimizer = tf.train.AdamOptimizer(0.0001).minimize(loss)

### Training of Data Begins !!!

In [21]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init) 

iteration = 25000
for i in range(iteration):
    # input is X_train which is one hot encoded word
    # label is Y_train which is one hot encoded neighbor word
    sess.run(train_optimizer, feed_dict={x: X_train, y_label: Y_train})
    if i % 3000 == 0:
        print('iteration '+str(i)+' loss is : ', sess.run(loss, feed_dict={x: X_train, y_label: Y_train}))

iteration 0 loss is :  8.03913
iteration 3000 loss is :  4.530759
iteration 6000 loss is :  3.1503572
iteration 9000 loss is :  2.472372
iteration 12000 loss is :  2.0291445
iteration 15000 loss is :  1.7289271
iteration 18000 loss is :  1.5495336
iteration 21000 loss is :  1.4450225
iteration 24000 loss is :  1.3871846


In [22]:
print(sess.run(W1))
print('----------')
print(sess.run(b1))
print('----------')

[[ 1.3700979  -0.97079873 -2.654842   -1.8231133  -2.0968208 ]
 [-2.331005    1.4998215   0.74571127 -2.5238714  -1.920799  ]
 [-2.397241    0.501312    0.05894314 -2.3700252  -1.2363259 ]
 [ 0.11282279  1.1350769  -1.6619006  -0.859465    0.15081091]
 [-0.12135387  0.31814733 -2.591023   -3.4166393  -0.6542811 ]
 [-3.160065   -2.0433376  -3.0914454  -0.53239715  0.50922614]
 [ 0.34005225 -2.0923467  -0.28079972 -0.37317604 -0.5394984 ]
 [ 0.8324868  -2.4231892  -2.8673067   1.3878247  -2.9991071 ]
 [ 0.72472584  1.0234027  -1.1969881   0.8705071  -1.3692926 ]
 [ 1.3087094  -1.2495974   1.0088114  -1.4078223   1.8136746 ]
 [-1.9975368  -1.5398446  -1.394702   -2.8620372  -2.0040293 ]
 [-3.3162386   1.2775453  -0.74179065 -1.9206384   1.903526  ]
 [-1.3848425  -2.4125936  -2.2718928   0.77027994 -1.089366  ]
 [-3.3436072  -2.5958495   2.001141   -1.7701273   1.360983  ]
 [-0.46836463 -3.1949122   1.8129276  -0.6182739   1.8762034 ]
 [-3.1148636  -4.203465   -3.0103023  -1.3113291  -2.51

In [23]:
vectors = sess.run(W1 + b1)
print(vectors)

[[ 1.4622468  -0.04778427 -2.8564005  -1.6020342  -2.0181818 ]
 [-2.238856    2.422836    0.5441526  -2.3027923  -1.8421601 ]
 [-2.305092    1.4243264  -0.14261554 -2.148946   -1.157687  ]
 [ 0.20497173  2.0580914  -1.8634593  -0.6383859   0.22944987]
 [-0.02920493  1.2411618  -2.7925816  -3.1955602  -0.5756421 ]
 [-3.067916   -1.1203232  -3.293004   -0.31131804  0.5878651 ]
 [ 0.43220118 -1.1693323  -0.4823584  -0.15209694 -0.46085942]
 [ 0.92463577 -1.5001748  -3.0688653   1.6089038  -2.920468  ]
 [ 0.8168748   1.9464171  -1.3985468   1.0915862  -1.2906537 ]
 [ 1.4008583  -0.32658297  0.80725265 -1.1867431   1.8923135 ]
 [-1.9053879  -0.61683017 -1.5962607  -2.640958   -1.9253904 ]
 [-3.2240896   2.2005599  -0.94334936 -1.6995593   1.9821649 ]
 [-1.2926936  -1.4895792  -2.4734514   0.99135906 -1.010727  ]
 [-3.2514582  -1.6728351   1.7995824  -1.5490482   1.4396219 ]
 [-0.3762157  -2.2718978   1.6113689  -0.3971948   1.9548423 ]
 [-3.0227146  -3.2804506  -3.211861   -1.09025    -2.43

In [24]:
print(vectors[ word2index['banana'] ])

[-2.305092    1.4243264  -0.14261554 -2.148946   -1.157687  ]
