In [2]:
import numpy as np
import tensorflow as tf

corpus = "बिहीबार ४ जनाले जिल्ला अदालत मोरङसमक्ष थुनछेक बहस गरेका थिए । थप ३ जनाले शुक्रबार बहस गर्दै उपलब्ध प्रमाणका आधारमा प्रतिवादीहरु कसुरदार देखिएको भन्दै थुनामै राखेर थप अनुसन्धान र कारबाही अघि "
# corpus_raw = corpus.lower() # converting to lowercase

In [3]:
print(corpus_raw.split())

['बिहीबार', '४', 'जनाले', 'जिल्ला', 'अदालत', 'मोरङसमक्ष', 'थुनछेक', 'बहस', 'गरेका', 'थिए', '।', 'थप', '३', 'जनाले', 'शुक्रबार', 'बहस', 'गर्दै', 'उपलब्ध', 'प्रमाणका', 'आधारमा', 'प्रतिवादीहरु', 'कसुरदार', 'देखिएको', 'भन्दै', 'थुनामै', 'राखेर', 'थप', 'अनुसन्धान', 'र', 'कारबाही', 'अघि']


In [4]:
# cleaning the raw corpus and removing the duplicates
words = []
for word in corpus_raw.split():
    if word != '।': # we dont want to treat . as a word
        words.append(word)

words = set(words) # remove all the duplicate words
print(words)

{'शुक्रबार', 'राखेर', 'प्रतिवादीहरु', 'आधारमा', 'उपलब्ध', 'थिए', 'बिहीबार', 'जिल्ला', 'अघि', 'थुनछेक', 'अनुसन्धान', 'थुनामै', 'अदालत', 'प्रमाणका', 'गरेका', 'जनाले', 'र', 'कसुरदार', '३', 'भन्दै', 'थप', 'मोरङसमक्ष', 'बहस', 'कारबाही', 'देखिएको', '४', 'गर्दै'}


In [5]:
word2int = {}
int2word = {}

vocab_size = len(words)

for i,word in enumerate(words):
    word2int[word] = i
    int2word[i] = word
    
print(word2int['शुक्रबार'])

# print(int2word[2])

0


In [10]:
# splitting the sentences into arrays
raw_sentences = corpus_raw.split('।')
sentences = []

for sentence in raw_sentences:
    sentences.append(sentence.split())
print(sentences)

[['बिहीबार', '४', 'जनाले', 'जिल्ला', 'अदालत', 'मोरङसमक्ष', 'थुनछेक', 'बहस', 'गरेका', 'थिए', '।', 'थप', '३', 'जनाले', 'शुक्रबार', 'बहस', 'गर्दै', 'उपलब्ध', 'प्रमाणका', 'आधारमा', 'प्रतिवादीहरु', 'कसुरदार', 'देखिएको', 'भन्दै', 'थुनामै', 'राखेर', 'थप', 'अनुसन्धान', 'र', 'कारबाही', 'अघि']]


In [11]:
data = []
WINDOW_SIZE = 2
for sentence in sentences:
    for word_index, word in enumerate(words):
        for nb_word in sentence[max(word_index- WINDOW_SIZE, 0) : min(word_index+WINDOW_SIZE, len(sentence)) + 1]:
            if nb_word != word:
                data.append([word, nb_word])
print('Generating the word pair: ')
print(data)

Generating the word pair: 
[['शुक्रबार', 'बिहीबार'], ['शुक्रबार', '४'], ['शुक्रबार', 'जनाले'], ['राखेर', 'बिहीबार'], ['राखेर', '४'], ['राखेर', 'जनाले'], ['राखेर', 'जिल्ला'], ['प्रतिवादीहरु', 'बिहीबार'], ['प्रतिवादीहरु', '४'], ['प्रतिवादीहरु', 'जनाले'], ['प्रतिवादीहरु', 'जिल्ला'], ['प्रतिवादीहरु', 'अदालत'], ['आधारमा', '४'], ['आधारमा', 'जनाले'], ['आधारमा', 'जिल्ला'], ['आधारमा', 'अदालत'], ['आधारमा', 'मोरङसमक्ष'], ['उपलब्ध', 'जनाले'], ['उपलब्ध', 'जिल्ला'], ['उपलब्ध', 'अदालत'], ['उपलब्ध', 'मोरङसमक्ष'], ['उपलब्ध', 'थुनछेक'], ['थिए', 'जिल्ला'], ['थिए', 'अदालत'], ['थिए', 'मोरङसमक्ष'], ['थिए', 'थुनछेक'], ['थिए', 'बहस'], ['बिहीबार', 'अदालत'], ['बिहीबार', 'मोरङसमक्ष'], ['बिहीबार', 'थुनछेक'], ['बिहीबार', 'बहस'], ['बिहीबार', 'गरेका'], ['जिल्ला', 'मोरङसमक्ष'], ['जिल्ला', 'थुनछेक'], ['जिल्ला', 'बहस'], ['जिल्ला', 'गरेका'], ['जिल्ला', 'थिए'], ['अघि', 'थुनछेक'], ['अघि', 'बहस'], ['अघि', 'गरेका'], ['अघि', 'थिए'], ['अघि', '।'], ['थुनछेक', 'बहस'], ['थुनछेक', 'गरेका'], ['थुनछेक', 'थिए'], ['थुनछेक', '।'], ['थ

In [8]:
# now convert all to one_hot_vectors
def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp

x_train = [] # input_word
y_train = [] # output_word

for data_word in data:
    x_train.append(to_one_hot( word2int[ data_word[0] ], vocab_size ) ) # convert to one_hot using the index returned from word2int
    y_train.append(to_one_hot( word2int[ data_word[1] ], vocab_size ) )
    
print(x_train)

KeyError: '।'

In [11]:
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
print('Converting to numpy array: ')
print(x_train)

Converting to numpy array: 
[[0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]]


In [12]:
print(x_train.shape, y_train.shape)

(34, 7) (34, 7)


In [13]:
# we make tensorflow placeholders
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

In [15]:
# embedding networks weight and biases
EMBEDDING_DIM = 5

w1 = tf.Variable( tf.random_normal([vocab_size, EMBEDDING_DIM]) ) #weight
b1 = tf.Variable( tf.random_normal([EMBEDDING_DIM]) ) #bias

hidden_representation = tf.add( tf.matmul(x, w1), b1)

In [16]:
w2 = tf.Variable( tf.random_normal([EMBEDDING_DIM, vocab_size]) )
b2 = tf.Variable( tf.random_normal( [vocab_size] ))

prediction = tf.nn.softmax( tf.add( tf.matmul(hidden_representation, w2), b2) )

In [17]:
# now we train the neural network
sess = tf.Session()
init = tf.global_variables_initializer()

sess.run(init)

In [19]:
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]) )

train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)

In [20]:
n_iters = 1000

for _ in range(n_iters):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print('loss is: ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train} ) )

loss is:  5.7345777
loss is:  5.0509543
loss is:  4.545597
loss is:  4.170566
loss is:  3.879713
loss is:  3.6407878
loss is:  3.437089
loss is:  3.261154
loss is:  3.1091557
loss is:  2.9782014
loss is:  2.8654358
loss is:  2.7679555
loss is:  2.6830075
loss is:  2.6081781
loss is:  2.5414877
loss is:  2.481382
loss is:  2.4266763
loss is:  2.376478
loss is:  2.330114
loss is:  2.2870743
loss is:  2.2469664
loss is:  2.2094822
loss is:  2.1743777
loss is:  2.1414545
loss is:  2.1105473
loss is:  2.0815191
loss is:  2.0542505
loss is:  2.0286362
loss is:  2.0045817
loss is:  1.981998
loss is:  1.9608012
loss is:  1.9409093
loss is:  1.9222423
loss is:  1.9047214
loss is:  1.88827
loss is:  1.8728119
loss is:  1.8582753
loss is:  1.84459
loss is:  1.8316905
loss is:  1.8195148
loss is:  1.8080053
loss is:  1.7971095
loss is:  1.7867782
loss is:  1.7769674
loss is:  1.7676368
loss is:  1.7587494
loss is:  1.7502724
loss is:  1.7421753
loss is:  1.7344314
loss is:  1.727016
loss is:  1.71

loss is:  1.3539326
loss is:  1.3538166
loss is:  1.3537011
loss is:  1.353586
loss is:  1.3534718
loss is:  1.3533578
loss is:  1.3532444
loss is:  1.3531317
loss is:  1.3530195
loss is:  1.3529079
loss is:  1.3527969
loss is:  1.3526863
loss is:  1.3525763
loss is:  1.3524668
loss is:  1.3523579
loss is:  1.3522495
loss is:  1.3521416
loss is:  1.3520343
loss is:  1.3519275
loss is:  1.3518212
loss is:  1.3517154
loss is:  1.3516101
loss is:  1.3515055
loss is:  1.3514012
loss is:  1.3512976
loss is:  1.3511944
loss is:  1.3510917
loss is:  1.3509893
loss is:  1.3508878
loss is:  1.3507866
loss is:  1.3506858
loss is:  1.3505856
loss is:  1.3504858
loss is:  1.3503866
loss is:  1.3502877
loss is:  1.3501894
loss is:  1.3500917
loss is:  1.3499944
loss is:  1.3498975
loss is:  1.3498011
loss is:  1.3497053
loss is:  1.3496097
loss is:  1.3495147
loss is:  1.3494201
loss is:  1.3493261
loss is:  1.3492323
loss is:  1.3491392
loss is:  1.3490465
loss is:  1.3489541
loss is:  1.3488624
l

In [21]:
print(sess.run(w1))

[[-0.35916016  0.21275197 -1.2531048  -1.1059759  -1.7222703 ]
 [-1.64989    -1.1994785  -0.75585455  1.464288   -0.18662998]
 [ 0.4316211   0.24706587  0.18025362 -0.16986072  0.814561  ]
 [-0.8777746  -0.05945565  1.2594256  -0.51478124 -0.635934  ]
 [ 3.0930665   0.07628312 -0.26005045  1.3107793  -0.16534917]
 [-1.1520251   1.0098528   0.55358887 -0.5931492   0.80558366]
 [-0.34872076 -0.22793616  1.1563816  -0.50491685 -2.156127  ]]


In [22]:
vectors = sess.run(w1 + b1)
print(vectors)

[[ 0.6679405   1.7584124  -1.9933364  -1.0066311  -2.0391376 ]
 [-0.62278926  0.34618187 -1.4960862   1.5636327  -0.50349724]
 [ 1.4587218   1.7927263  -0.55997807 -0.07051596  0.49769372]
 [ 0.14932609  1.4862047   0.51919395 -0.41543648 -0.9528013 ]
 [ 4.1201673   1.6219435  -1.0002822   1.4101241  -0.48221648]
 [-0.12492442  2.5555131  -0.18664283 -0.49380443  0.48871636]
 [ 0.6783799   1.3177242   0.4161499  -0.4055721  -2.4729943 ]]


In [23]:
print( vectors[ word2int['queen'] ] )

[ 1.4587218   1.7927263  -0.55997807 -0.07051596  0.49769372]


In [24]:
def euclidean_dist(vec1, vec2):
    return np.sqrt(np.sum( (vec1-vec2)**2 ) )

def find_closest(word_index, vectors):
    min_dist = 10000 # lo act like pos infinity
    min_index  = -1
    
    query_vector = vectors[word_index]
    
    for index, vector in enumerate(vectors):
        if euclidean_dist(vector, query_vector) < min_dist and not np.array_equal(vector, query_vector):
            min_dist = euclidean_dist(vector, query_vector)
            min_index = index
            
    return min_index

In [25]:
print(int2word[find_closest(word2int['king'], vectors)])
print(int2word[find_closest(word2int['queen'], vectors)])
print(int2word[find_closest(word2int['royal'], vectors)])

queen
king
she
