In [2]:
import numpy as np
import tensorflow as tf

corpus = "बिहीबार ४ जनाले जिल्ला अदालत मोरङसमक्ष थुनछेक बहस गरेका थिए । थप ३ जनाले शुक्रबार बहस गर्दै उपलब्ध प्रमाणका आधारमा प्रतिवादीहरु कसुरदार देखिएको भन्दै थुनामै राखेर थप अनुसन्धान र कारबाही अघि "
# corpus_raw = corpus.lower() # converting to lowercase

In [3]:
print(corpus_raw.split())

['बिहीबार', '४', 'जनाले', 'जिल्ला', 'अदालत', 'मोरङसमक्ष', 'थुनछेक', 'बहस', 'गरेका', 'थिए', '।', 'थप', '३', 'जनाले', 'शुक्रबार', 'बहस', 'गर्दै', 'उपलब्ध', 'प्रमाणका', 'आधारमा', 'प्रतिवादीहरु', 'कसुरदार', 'देखिएको', 'भन्दै', 'थुनामै', 'राखेर', 'थप', 'अनुसन्धान', 'र', 'कारबाही', 'अघि']


In [4]:
# cleaning the raw corpus and removing the duplicates
words = []
for word in corpus_raw.split():
    if word != '।': # we dont want to treat . as a word
        words.append(word)

words = set(words) # remove all the duplicate words
print(words)

{'शुक्रबार', 'राखेर', 'प्रतिवादीहरु', 'आधारमा', 'उपलब्ध', 'थिए', 'बिहीबार', 'जिल्ला', 'अघि', 'थुनछेक', 'अनुसन्धान', 'थुनामै', 'अदालत', 'प्रमाणका', 'गरेका', 'जनाले', 'र', 'कसुरदार', '३', 'भन्दै', 'थप', 'मोरङसमक्ष', 'बहस', 'कारबाही', 'देखिएको', '४', 'गर्दै'}


In [5]:
word2int = {}
int2word = {}

vocab_size = len(words)

for i,word in enumerate(words):
    word2int[word] = i
    int2word[i] = word
    
print(word2int['शुक्रबार'])

# print(int2word[2])

0


In [12]:
# splitting the sentences into arrays
raw_sentences = corpus_raw.split('।')
sentences = []

for sentence in raw_sentences:
    sentences.append(sentence.split())
print(sentences)

[['बिहीबार', '४', 'जनाले', 'जिल्ला', 'अदालत', 'मोरङसमक्ष', 'थुनछेक', 'बहस', 'गरेका', 'थिए'], ['थप', '३', 'जनाले', 'शुक्रबार', 'बहस', 'गर्दै', 'उपलब्ध', 'प्रमाणका', 'आधारमा', 'प्रतिवादीहरु', 'कसुरदार', 'देखिएको', 'भन्दै', 'थुनामै', 'राखेर', 'थप', 'अनुसन्धान', 'र', 'कारबाही', 'अघि']]


In [13]:
data = []
WINDOW_SIZE = 2
for sentence in sentences:
    for word_index, word in enumerate(words):
        for nb_word in sentence[max(word_index- WINDOW_SIZE, 0) : min(word_index+WINDOW_SIZE, len(sentence)) + 1]:
            if nb_word != word:
                data.append([word, nb_word])
print('Generating the word pair: ')
print(data)

Generating the word pair: 
[['शुक्रबार', 'बिहीबार'], ['शुक्रबार', '४'], ['शुक्रबार', 'जनाले'], ['राखेर', 'बिहीबार'], ['राखेर', '४'], ['राखेर', 'जनाले'], ['राखेर', 'जिल्ला'], ['प्रतिवादीहरु', 'बिहीबार'], ['प्रतिवादीहरु', '४'], ['प्रतिवादीहरु', 'जनाले'], ['प्रतिवादीहरु', 'जिल्ला'], ['प्रतिवादीहरु', 'अदालत'], ['आधारमा', '४'], ['आधारमा', 'जनाले'], ['आधारमा', 'जिल्ला'], ['आधारमा', 'अदालत'], ['आधारमा', 'मोरङसमक्ष'], ['उपलब्ध', 'जनाले'], ['उपलब्ध', 'जिल्ला'], ['उपलब्ध', 'अदालत'], ['उपलब्ध', 'मोरङसमक्ष'], ['उपलब्ध', 'थुनछेक'], ['थिए', 'जिल्ला'], ['थिए', 'अदालत'], ['थिए', 'मोरङसमक्ष'], ['थिए', 'थुनछेक'], ['थिए', 'बहस'], ['बिहीबार', 'अदालत'], ['बिहीबार', 'मोरङसमक्ष'], ['बिहीबार', 'थुनछेक'], ['बिहीबार', 'बहस'], ['बिहीबार', 'गरेका'], ['जिल्ला', 'मोरङसमक्ष'], ['जिल्ला', 'थुनछेक'], ['जिल्ला', 'बहस'], ['जिल्ला', 'गरेका'], ['जिल्ला', 'थिए'], ['अघि', 'थुनछेक'], ['अघि', 'बहस'], ['अघि', 'गरेका'], ['अघि', 'थिए'], ['थुनछेक', 'बहस'], ['थुनछेक', 'गरेका'], ['थुनछेक', 'थिए'], ['अनुसन्धान', 'गरेका'], ['अनुसन्धा

In [14]:
# now convert all to one_hot_vectors
def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp

x_train = [] # input_word
y_train = [] # output_word

for data_word in data:
    x_train.append(to_one_hot( word2int[ data_word[0] ], vocab_size ) ) # convert to one_hot using the index returned from word2int
    y_train.append(to_one_hot( word2int[ data_word[1] ], vocab_size ) )
    
print(x_train)

[array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([

In [15]:
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
print('Converting to numpy array: ')
print(x_train)

Converting to numpy array: 
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [16]:
print(x_train.shape, y_train.shape)

(141, 27) (141, 27)


In [17]:
# we make tensorflow placeholders
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

In [18]:
# embedding networks weight and biases
EMBEDDING_DIM = 5

w1 = tf.Variable( tf.random_normal([vocab_size, EMBEDDING_DIM]) ) #weight
b1 = tf.Variable( tf.random_normal([EMBEDDING_DIM]) ) #bias

hidden_representation = tf.add( tf.matmul(x, w1), b1)

In [19]:
w2 = tf.Variable( tf.random_normal([EMBEDDING_DIM, vocab_size]) )
b2 = tf.Variable( tf.random_normal( [vocab_size] ))

prediction = tf.nn.softmax( tf.add( tf.matmul(hidden_representation, w2), b2) )

In [20]:
# now we train the neural network
sess = tf.Session()
init = tf.global_variables_initializer()

sess.run(init)

In [21]:
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]) )

train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)

In [22]:
n_iters = 2000

for _ in range(n_iters):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print('loss is: ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train} ) )

loss is:  6.3328037
loss is:  6.048159
loss is:  5.832963
loss is:  5.666474
loss is:  5.5345235
loss is:  5.427516
loss is:  5.338858
loss is:  5.263923
loss is:  5.199395
loss is:  5.1428595
loss is:  5.0925274
loss is:  5.047065
loss is:  5.0054646
loss is:  4.9669647
loss is:  4.930985
loss is:  4.897078
loss is:  4.8649
loss is:  4.8341856
loss is:  4.804729
loss is:  4.7763667
loss is:  4.7489705
loss is:  4.7224393
loss is:  4.6966906
loss is:  4.671658
loss is:  4.6472864
loss is:  4.6235313
loss is:  4.6003532
loss is:  4.57772
loss is:  4.555605
loss is:  4.533981
loss is:  4.512828
loss is:  4.492125
loss is:  4.4718556
loss is:  4.452002
loss is:  4.4325504
loss is:  4.4134865
loss is:  4.3947964
loss is:  4.3764687
loss is:  4.358491
loss is:  4.3408527
loss is:  4.3235426
loss is:  4.3065515
loss is:  4.2898693
loss is:  4.2734866
loss is:  4.257394
loss is:  4.241583
loss is:  4.2260466
loss is:  4.210776
loss is:  4.195763
loss is:  4.1810017
loss is:  4.1664844
loss is

loss is:  2.677271
loss is:  2.6760242
loss is:  2.6747797
loss is:  2.6735377
loss is:  2.6722987
loss is:  2.6710622
loss is:  2.6698284
loss is:  2.6685972
loss is:  2.6673687
loss is:  2.6661427
loss is:  2.6649191
loss is:  2.6636982
loss is:  2.6624799
loss is:  2.661264
loss is:  2.6600504
loss is:  2.6588395
loss is:  2.6576312
loss is:  2.656425
loss is:  2.6552215
loss is:  2.6540205
loss is:  2.6528218
loss is:  2.6516256
loss is:  2.6504314
loss is:  2.6492403
loss is:  2.6480508
loss is:  2.646864
loss is:  2.6456797
loss is:  2.6444972
loss is:  2.6433172
loss is:  2.6421397
loss is:  2.6409643
loss is:  2.6397913
loss is:  2.6386204
loss is:  2.637452
loss is:  2.6362855
loss is:  2.6351213
loss is:  2.6339593
loss is:  2.6327996
loss is:  2.6316419
loss is:  2.6304862
loss is:  2.629333
loss is:  2.628182
loss is:  2.627033
loss is:  2.6258862
loss is:  2.6247408
loss is:  2.6235983
loss is:  2.6224575
loss is:  2.6213188
loss is:  2.6201823
loss is:  2.6190479
loss is:

loss is:  2.2821229
loss is:  2.2814798
loss is:  2.2808378
loss is:  2.280197
loss is:  2.2795572
loss is:  2.2789192
loss is:  2.278282
loss is:  2.2776463
loss is:  2.2770116
loss is:  2.2763786
loss is:  2.2757466
loss is:  2.2751157
loss is:  2.2744865
loss is:  2.2738583
loss is:  2.2732313
loss is:  2.2726054
loss is:  2.2719812
loss is:  2.271358
loss is:  2.2707362
loss is:  2.2701154
loss is:  2.269496
loss is:  2.2688782
loss is:  2.268261
loss is:  2.2676456
loss is:  2.2670314
loss is:  2.2664185
loss is:  2.2658064
loss is:  2.265196
loss is:  2.2645867
loss is:  2.2639787
loss is:  2.2633722
loss is:  2.2627666
loss is:  2.2621622
loss is:  2.2615592
loss is:  2.260958
loss is:  2.2603574
loss is:  2.2597582
loss is:  2.2591603
loss is:  2.2585638
loss is:  2.2579684
loss is:  2.2573745
loss is:  2.2567813
loss is:  2.25619
loss is:  2.2555997
loss is:  2.2550108
loss is:  2.254423
loss is:  2.2538366
loss is:  2.2532513
loss is:  2.2526674
loss is:  2.2520845
loss is:  

loss is:  2.100655
loss is:  2.100452
loss is:  2.1002495
loss is:  2.1000473
loss is:  2.0998461
loss is:  2.0996451
loss is:  2.0994446
loss is:  2.0992446
loss is:  2.0990448
loss is:  2.098846
loss is:  2.098647
loss is:  2.098449
loss is:  2.0982513
loss is:  2.0980542
loss is:  2.097857
loss is:  2.0976613
loss is:  2.097465
loss is:  2.0972698
loss is:  2.097075
loss is:  2.0968802
loss is:  2.0966861
loss is:  2.0964928
loss is:  2.0962994
loss is:  2.0961068
loss is:  2.0959146
loss is:  2.095723
loss is:  2.0955315
loss is:  2.0953405
loss is:  2.0951502
loss is:  2.0949602
loss is:  2.0947702
loss is:  2.0945814
loss is:  2.0943928
loss is:  2.0942044
loss is:  2.0940166
loss is:  2.093829
loss is:  2.0936422
loss is:  2.0934553
loss is:  2.0932696
loss is:  2.0930839
loss is:  2.0928986
loss is:  2.0927136
loss is:  2.0925288
loss is:  2.092345
loss is:  2.0921614
loss is:  2.0919783
loss is:  2.0917957
loss is:  2.091613
loss is:  2.0914314
loss is:  2.0912497
loss is:  2.

loss is:  2.0387561
loss is:  2.0386748
loss is:  2.038594
loss is:  2.0385132
loss is:  2.0384321
loss is:  2.0383515
loss is:  2.0382707
loss is:  2.0381904
loss is:  2.03811
loss is:  2.0380297
loss is:  2.0379493
loss is:  2.0378697
loss is:  2.0377896
loss is:  2.03771
loss is:  2.0376303
loss is:  2.037551
loss is:  2.0374715
loss is:  2.0373921
loss is:  2.037313
loss is:  2.0372338
loss is:  2.037155
loss is:  2.037076
loss is:  2.0369976
loss is:  2.0369186
loss is:  2.0368404
loss is:  2.036762
loss is:  2.0366836
loss is:  2.0366056
loss is:  2.0365279
loss is:  2.0364494
loss is:  2.0363715
loss is:  2.036294
loss is:  2.0362165
loss is:  2.0361392
loss is:  2.0360615
loss is:  2.0359845
loss is:  2.0359075
loss is:  2.0358303
loss is:  2.0357537
loss is:  2.0356765
loss is:  2.0356
loss is:  2.0355234
loss is:  2.0354471
loss is:  2.0353706
loss is:  2.0352943
loss is:  2.0352182
loss is:  2.0351424
loss is:  2.0350661
loss is:  2.0349905
loss is:  2.034915
loss is:  2.034

In [23]:
print(sess.run(w1))

[[ 2.264247   -0.41725922 -1.4768288  -0.37130958  0.45540696]
 [ 1.520511   -1.0673778  -0.77568173  0.28071332  0.7481448 ]
 [ 1.3156799  -1.2172872   0.08095276  0.11522432  1.0912459 ]
 [ 0.78956795 -1.4058923   0.10220371  0.99333316  0.34635052]
 [ 0.26297823 -2.2856889   0.3196364   1.3284935  -0.19323258]
 [-0.5427734  -1.5875663   0.24913833  3.48033    -1.2983516 ]
 [-1.0829904  -0.4992789   1.0056132   1.5489657   0.22153309]
 [-1.7959106  -0.03273933  0.8328702   2.336976   -0.44873044]
 [-0.7728766   0.02592054 -0.06531389  2.2033794   0.6716655 ]
 [-0.5338104   0.4372206  -0.54687166  2.374825    0.05574702]
 [-0.69851404  1.2253596  -0.7222803   1.3455998  -0.08960064]
 [-0.31501937  1.658966   -1.1831833   1.2943054  -0.18317729]
 [ 0.0112107   2.0311317  -2.2743087  -0.29924643 -0.2685427 ]
 [-1.7275635   0.46908617 -1.8699778   0.16464368 -1.5075141 ]
 [-1.0391134   0.85410744 -1.2025309  -1.0926648   0.1172104 ]
 [-1.049586    0.80048573 -0.46096337 -1.2719802   0.55

In [24]:
vectors = sess.run(w1 + b1)
print(vectors)

[[ 2.3827703  -0.4582397  -0.99734366 -0.948996   -0.00789574]
 [ 1.6390344  -1.1083583  -0.29619655 -0.2969731   0.2848421 ]
 [ 1.4342033  -1.2582676   0.5604379  -0.4624621   0.62794316]
 [ 0.90809125 -1.4468727   0.5816889   0.41564673 -0.11695218]
 [ 0.38150156 -2.3266695   0.7991216   0.75080705 -0.65653527]
 [-0.42425013 -1.6285467   0.7286235   2.9026437  -1.7616544 ]
 [-0.9644671  -0.54025936  1.4850984   0.97127926 -0.24176961]
 [-1.6773872  -0.07371981  1.3123554   1.7592896  -0.91203314]
 [-0.6543533  -0.01505995  0.41417128  1.625693    0.20836279]
 [-0.41528708  0.39624012 -0.06738648  1.7971386  -0.40755567]
 [-0.57999074  1.1843791  -0.24279514  0.76791334 -0.55290335]
 [-0.19649605  1.6179855  -0.70369816  0.716619   -0.64647996]
 [ 0.12973401  1.9901513  -1.7948235  -0.87693286 -0.7318454 ]
 [-1.6090401   0.42810568 -1.3904927  -0.41304275 -1.9708169 ]
 [-0.9205901   0.8131269  -0.7230457  -1.6703513  -0.3460923 ]
 [-0.93106276  0.7595053   0.01852182 -1.8496666   0.08

In [26]:
print( vectors[ word2int['बिहीबार'] ] )

[-0.9644671  -0.54025936  1.4850984   0.97127926 -0.24176961]


In [27]:
def euclidean_dist(vec1, vec2):
    return np.sqrt(np.sum( (vec1-vec2)**2 ) )

def find_closest(word_index, vectors):
    min_dist = 10000 # lo act like pos infinity
    min_index  = -1
    
    query_vector = vectors[word_index]
    
    for index, vector in enumerate(vectors):
        if euclidean_dist(vector, query_vector) < min_dist and not np.array_equal(vector, query_vector):
            min_dist = euclidean_dist(vector, query_vector)
            min_index = index
            
    return min_index

In [29]:
print(int2word[find_closest(word2int['बिहीबार'], vectors)])
# print(int2word[find_closest(word2int['queen'], vectors)])
# print(int2word[find_closest(word2int['royal'], vectors)])

जिल्ला
