In [None]:
!pip install keras
!pip install tensorflow
!pip install plot_keras_history
!pip install seaborn



In [1]:

from keras.utils import np_utils
from keras.preprocessing import sequence

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda, Reshape
from keras.layers import Input
from keras.models import Model
from keras.layers import dot
from tensorflow.keras.activations import relu
from nltk import word_tokenize, sent_tokenize
from gensim.corpora.dictionary import Dictionary
import numpy as np

from keras.preprocessing.sequence import skipgrams
import gensim


In [2]:
# using nltk tokenizer.  
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
#Data Preparation 

AlotOftext = """Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in corpora, 
the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish that it is not true. In
corpus studies, we frequently do have enough data, so the fact that a relation 
between two phenomena is demonstrably non-random, does not support the inference 
that it is not arbitrary. We present experimental evidence
of how arbitrary associations between word frequencies and corpora are
systematically non-random. We review literature in which hypothesis testing 
has been used, and show how it has often led to unhelpful or misleading results.""".lower()



#Tokenize text
tokenized_text = [word_tokenize(sent) for sent in sent_tokenize(AlotOftext)]

#Create Vocab as a Dictionary
vocab = Dictionary(tokenized_text)
print(dict(vocab.items()))

print(vocab.token2id['corpora'])
print(vocab[2])
sent0 = tokenized_text[0]
print(vocab.doc2idx(sent0))

vocab.add_documents([['PAD']])
dict(vocab.items())
print(vocab.token2id['PAD'])

corpusByWordID = list()
for sent in  tokenized_text:
    corpusByWordID.append(vocab.doc2idx(sent))

vocab_size = len(vocab)
embed_size = 100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(vocab.items())[:10])

{0: ',', 1: '.', 2: 'and', 3: 'choose', 4: 'essentially', 5: 'is', 6: 'language', 7: 'never', 8: 'non-random', 9: 'randomly', 10: 'users', 11: 'words', 12: 'a', 13: 'hypothesis', 14: 'null', 15: 'posits', 16: 'randomness', 17: 'statistical', 18: 'testing', 19: 'uses', 20: 'which', 21: 'at', 22: 'be', 23: 'corpora', 24: 'hence', 25: 'in', 26: 'linguistic', 27: 'look', 28: 'phenomena', 29: 'the', 30: 'true', 31: 'we', 32: 'when', 33: 'will', 34: '(', 35: ')', 36: 'able', 37: 'almost', 38: 'always', 39: 'data', 40: 'enough', 41: 'establish', 42: 'it', 43: 'moreover', 44: 'not', 45: 'shall', 46: 'that', 47: 'there', 48: 'to', 49: 'where', 50: 'arbitrary', 51: 'between', 52: 'corpus', 53: 'demonstrably', 54: 'do', 55: 'does', 56: 'fact', 57: 'frequently', 58: 'have', 59: 'inference', 60: 'relation', 61: 'so', 62: 'studies', 63: 'support', 64: 'two', 65: 'are', 66: 'associations', 67: 'evidence', 68: 'experimental', 69: 'frequencies', 70: 'how', 71: 'of', 72: 'present', 73: 'systematically',

In [4]:
# Create CBOW Training data
def generate_cbow_context_word_pairs(corpusByID, window_size, vocab_size):
    context_length = window_size*2
    X=[]
    Y=[]
    for sent in corpusByID:
        sentence_length = len(sent)
        for index, word in enumerate(sent):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([sent[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)
            if start<0:
                x = sequence.pad_sequences(context_words, maxlen=context_length,padding='pre',value=vocab.token2id['PAD'])
                y = np_utils.to_categorical(label_word, vocab_size)
                X.append(x)
                Y.append(y)
                continue
            if end>=sentence_length:
                x = sequence.pad_sequences(context_words, maxlen=context_length,padding='post',value=vocab.token2id['PAD'])
                y = np_utils.to_categorical(label_word, vocab_size)
                X.append(x)
                Y.append(y)
                continue
            else:
                X.append(sequence.pad_sequences(context_words, maxlen=context_length))
                Y.append(y)
                continue
           
    return X,Y
            
# Test this out for some samples


X,Y = generate_cbow_context_word_pairs(corpusByWordID, window_size, vocab_size) 
   
for x, y in zip(X,Y):
    print('Context (X):', [vocab[w] for w in x[0]], '-> Target (Y):', vocab[np.argwhere(y[0])[0][0]])


Context (X): ['PAD', 'PAD', 'users', 'never'] -> Target (Y): language
Context (X): ['PAD', 'language', 'never', 'choose'] -> Target (Y): users
Context (X): ['language', 'users', 'choose', 'words'] -> Target (Y): users
Context (X): ['users', 'never', 'words', 'randomly'] -> Target (Y): users
Context (X): ['never', 'choose', 'randomly', ','] -> Target (Y): users
Context (X): ['choose', 'words', ',', 'and'] -> Target (Y): users
Context (X): ['words', 'randomly', 'and', 'language'] -> Target (Y): users
Context (X): ['randomly', ',', 'language', 'is'] -> Target (Y): users
Context (X): [',', 'and', 'is', 'essentially'] -> Target (Y): users
Context (X): ['and', 'language', 'essentially', 'non-random'] -> Target (Y): users
Context (X): ['language', 'is', 'non-random', '.'] -> Target (Y): essentially
Context (X): ['is', 'essentially', '.', 'PAD'] -> Target (Y): non-random
Context (X): ['essentially', 'non-random', 'PAD', 'PAD'] -> Target (Y): .
Context (X): ['PAD', 'PAD', 'hypothesis', 'testing

In [5]:
# len(X)
len(Y)

144

In [6]:
#define the model
cbow = Sequential()
###hint:output_dim = the shape of embedding matrix
###hint:input_length = the length of training sample
cbow.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=144))
cbow.add(Lambda(lambda x: relu(K.mean(x, axis=1)), output_shape=(embed_size,)))
###hint:the total numbser of possible labels/words
###hint:activation='softmax' or 'sigmoid'
cbow.add(Dense(88, activation='sigmoid'))
###hint:loss='categorical_crossentropy' or 'binary_crossentropy'
cbow.compile(loss='binary_crossentropy', optimizer='sgd')
cbow.summary()



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 144, 100)          8800      
_________________________________________________________________
lambda (Lambda)              (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 88)                8888      
Total params: 17,688
Trainable params: 17,688
Non-trainable params: 0
_________________________________________________________________


In [7]:
#Train the model

for epoch in range(100):
    loss = 0.
    for x, y in zip(X,Y):
        loss += cbow.train_on_batch(x, y)
    print(epoch, loss)

0 99.48667031526566
1 98.9187878370285
2 98.35555642843246
3 97.79694092273712
4 97.24290537834167
5 96.69340866804123
6 96.14842212200165
7 95.6078953742981
8 95.0717870593071
9 94.54005664587021
10 94.01267874240875
11 93.48961955308914
12 92.97083234786987
13 92.45628726482391
14 91.94595175981522
15 91.43976056575775
16 90.93768805265427
17 90.43970263004303
18 89.94575506448746
19 89.4558168053627
20 88.96984475851059
21 88.4877986907959
22 88.00964426994324
23 87.53535282611847
24 87.06488889455795
25 86.59820902347565
26 86.13528454303741
27 85.6760784983635
28 85.2205656170845
29 84.76870483160019
30 84.32046282291412
31 83.87580567598343
32 83.43471425771713
33 82.99714267253876
34 82.5630350112915
35 82.13236206769943
36 81.70511472225189
37 81.281259059906
38 80.86076408624649
39 80.44361734390259
40 80.02979415655136
41 79.61925864219666
42 79.21195882558823
43 78.80785584449768
44 78.40693473815918
45 78.00914978981018
46 77.61446928977966
47 77.222869515419
48 76.83433371

In [8]:
## Save the wordvectors
f = open('Cbow_vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = cbow.get_weights()[0]
for key in vocab:
    str_vec = ' '.join(map(str, list(vectors[key, :])))
    f.write('{} {}\n'.format(vocab[key], str_vec))
f.close()

In [9]:
## Load the vectors back and validate
w2v = gensim.models.KeyedVectors.load_word2vec_format('./Cbow_vectors.txt', binary=False)

w2v.most_similar(positive=['that'])
w2v.most_similar(negative=['that'])

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  if np.issubdtype(vec.dtype, np.int):


[(')', 0.21922579407691956),
 ('systematically', 0.1769963800907135),
 ('will', 0.13912953436374664),
 ('often', 0.13879328966140747),
 ('are', 0.13745608925819397),
 ('randomly', 0.12282629311084747),
 ('which', 0.10673969238996506),
 ('enough', 0.10435578972101212),
 ('able', 0.10381244122982025),
 ('show', 0.09622056037187576)]

In [10]:
#Create Skipgram Training data 

# generate skip-grams with both positive and negative examples
skip_grams = [skipgrams(sent, vocabulary_size=vocab_size, window_size=2) for sent in corpusByWordID]

# view sample skip-grams
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        vocab[pairs[i][0]], pairs[i][0],           
        vocab[pairs[i][1]], pairs[i][1], 
        labels[i]))

(language (6), experimental (68)) -> 0
(. (1), results (82)) -> 0
(essentially (4), testing (18)) -> 0
(is (5), essentially (4)) -> 1
(non-random (8), is (5)) -> 1
(randomly (9), choose (3)) -> 1
(language (6), or (81)) -> 0
(essentially (4), enough (40)) -> 0
(non-random (8), . (1)) -> 1
(non-random (8), essentially (4)) -> 1


In [11]:
#define the skip-gram model

#define the model
input_word = Input((1,))
input_context_word = Input((1,))

word_embedding = Embedding(input_dim=vocab_size, output_dim=100,input_length=1,name='word_embedding')
context_embedding = Embedding(input_dim=vocab_size, output_dim=100,input_length=1,name='context_embedding')

word_embedding = word_embedding(input_word)
word_embedding_layer = Reshape((embed_size, 1))(word_embedding)

context_embedding = context_embedding(input_context_word)
context_embedding_layer = Reshape((embed_size, 1))(context_embedding)

# now perform the dot product operation word_embedding_vec * context_embedding_vec
dot_product = dot([word_embedding_layer, context_embedding_layer], axes=1)
dot_product = Reshape((1,))(dot_product)

###hint:the total number of possible labels/words
###hint:activation='softmax' or 'sigmoid'
outputLayer = Dense(1, activation='sigmoid')(dot_product)

model = Model(inputs=[input_word, input_context_word], outputs=outputLayer)

###hint:loss='categorical_crossentropy' or 'binary_crossentropy'
model.compile(loss='binary_crossentropy', optimizer='adam')

# view model summary
print(model.summary())

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 1, 100)       8800        input_1[0][0]                    
__________________________________________________________________________________________________
conotext_embedding (Embedding)  (None, 1, 100)       8800        input_2[0][0]                    
_______________________________________________________________________________________

In [12]:
#train the model

for epoch in range(1, 100):
    loss = 0
    for i, elem in enumerate(skip_grams):
        pair_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        pair_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [pair_first_elem, pair_second_elem]
        Y = labels
        if i % 10000 == 0:
            print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
        loss += model.train_on_batch(X,Y)  

    print('Epoch:', epoch, 'Loss:', loss)

Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 1 Loss: 4.852185547351837
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 2 Loss: 4.8475106954574585
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 3 Loss: 4.843109667301178
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 4 Loss: 4.838240742683411
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 5 Loss: 4.832616865634918
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 6 Loss: 4.825949430465698
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 7 Loss: 4.817927241325378
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 8 Loss: 4.808212757110596
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 9 Loss: 4.796443045139313
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 10 Loss: 4.782229721546173
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 11 Loss: 4.765164613723755
Processed 0 (skip_first, skip

In [13]:
#get the embeding matrix
weights = model.get_weights()
## Save the wordvectors
f = open('skipgram_vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for key in vocab:
    str_vec = ' '.join(map(str, list(vectors[key, :])))
    f.write('{} {}\n'.format(vocab[key], str_vec))
f.close()

In [14]:
## Load the vectors back and validate
w2v = gensim.models.KeyedVectors.load_word2vec_format('./skipgram_vectors.txt', binary=False)
w2v.most_similar(positive=['the'])
w2v.most_similar(negative=['the'])

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  if np.issubdtype(vec.dtype, np.int):


[('almost', 0.30979210138320923),
 ('misleading', 0.2943827509880066),
 ('experimental', 0.274384081363678),
 ('of', 0.2254631221294403),
 ('unhelpful', 0.22091346979141235),
 ('evidence', 0.20839455723762512),
 ('corpora', 0.20792268216609955),
 ('studies', 0.19628621637821198),
 ('relation', 0.18958404660224915),
 ('or', 0.18859079480171204)]

In [None]:
#Excerise: modeify the skipegram_model to share the same embeding layer between word and context
#Discussion: which is better? Why?  