In [1]:
import os
os.environ["PATH"] += os.pathsep + "/usr/bin/dot"

In [2]:
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

data=open('.\corona.txt','r')

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(data)
word2id = tokenizer.word_index

# build vocabulary of unique words
word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in data]

vocab_size = len(word2id)
embed_size = 100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 103
Vocabulary Sample: [('the', 1), ('of', 2), ('influenza', 3), ('covid', 4), ('19', 5), ('virus', 6), ('for', 7), ('transmission', 8), ('is', 9), ('to', 10)]


In [3]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)
            
            
# Test this out for some samples
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

In [5]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

# build CBOW architecture
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# view model summary
print(cbow.summary())

# visualize model structure
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4, 100)            10300     
                                                                 
 lambda_1 (Lambda)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 103)               10403     
                                                                 
Total params: 20,703
Trainable params: 20,703
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

In [8]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

import pandas as pd
pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(102, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
of,0.002303,-0.015218,-0.028108,0.019263,0.037953,-0.001704,0.034218,-0.035108,-0.000509,0.025386,...,-0.012602,-0.028938,-0.02615,-0.021317,0.016434,-0.045639,-0.006685,-0.016363,-0.02768,0.010288
influenza,-0.018664,0.020822,0.048426,-0.017216,0.019979,-0.016622,-0.007226,0.02921,-0.036367,-0.040359,...,0.029558,0.020198,0.03375,0.044959,0.033063,-0.022133,0.045925,-0.020575,-0.004028,0.004984
covid,0.004793,0.039811,-0.010168,-0.011238,-0.006769,0.045031,0.034527,0.016331,-0.043984,0.010978,...,-0.004756,-0.047351,0.009472,0.005084,0.025615,0.026702,0.046352,0.03578,-0.044567,0.016125
19,-0.001867,0.005753,0.00319,-0.029014,-0.04763,0.045519,-0.047612,-0.036589,-0.023252,0.021857,...,0.015597,0.024676,-0.006542,0.018721,-0.007959,0.042123,-0.012541,0.002624,-0.040535,0.027223
virus,0.034353,0.031133,-0.043045,0.015426,-0.013243,0.01433,-0.012632,-0.006849,-0.049482,-0.042593,...,0.035583,0.012416,-0.034313,0.014821,-0.002218,0.035151,-0.023438,-0.025654,0.025273,0.029137


In [12]:
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['virus', 'influenza', 'covid']}

similar_words

(102, 102)


{'virus': ['learning', 'not', 'is', 'means', 'first'],
 'influenza': ['shorter', 'symptomatic', 'we', 'appear', 'a'],
 'covid': ['2', 'cases', 'reproductive', 'making', 'appearance']}