<a href="https://colab.research.google.com/github/geri-m/word2vec/blob/master/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# https://blog.cambridgespark.com/tutorial-build-your-own-embedding-and-use-it-in-a-neural-network-e9cde4a81296
import multiprocessing

import nltk
from gensim.models import Word2Vec
from nltk.corpus import brown

# Fetch the two NLTK corpora this notebook relies on.
nltk.download('brown')
nltk.download('conll2000')

# brown.sents() returns the corpus as lists of tokens, one list per sentence.
sentences = brown.sents()
print(sentences[:3])

EMB_DIM = 300  # number of dimensions of each learned word vector

# Word2Vec hyper-parameters, collected in one place and unpacked into the call.
w2v_params = dict(
  size=EMB_DIM,
  window=5,
  min_count=5,
  negative=15,
  iter=10,
  workers=multiprocessing.cpu_count(),
)
w2v = Word2Vec(sentences, **w2v_params)

# The trained embeddings (a KeyedVectors instance) hang off the model as `wv`.
word_vectors = w2v.wv
result = word_vectors.similar_by_word("Saturday")
print("Most Similar to 'Saturday': %s" % result[:3])

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'poss

  if np.issubdtype(vec.dtype, np.int):


In [3]:
from nltk.corpus import conll2000
from gensim.models import Word2Vec # https://code.google.com/archive/p/word2vec/
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, Activation, Flatten
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical
import numpy as np
import collections

# Load both CoNLL-2000 splits as flat sequences of (word, POS-tag) tuples.
train_words, test_words = (
  conll2000.tagged_words(fileid) for fileid in ('train.txt', 'test.txt')
)

# Peek at the first few pairs, then report the size of each split.
print(train_words[:20])
print("Amount of Trained Word-Tuple: %s" % len(train_words))
print("Amount of Test Word-Tuple: %s" % len(test_words))

[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'), ('if', 'IN'), ('trade', 'NN'), ('figures', 'NNS'), ('for', 'IN'), ('September', 'NNP'), (',', ','), ('due', 'JJ'), ('for', 'IN')]
Amount of Trained Word-Tuple: 211727
Amount of Test Word-Tuple: 47377


In [0]:
def get_tag_vocabulary(tagged_words):
  """
  Build a lookup table from POS tag to integer id.

  Ids are handed out in order of first appearance, starting at 0.

  tagged_words: iterable of items whose element at position 1 is the tag,
                e.g. (word, pos) tuples
  returns:      dict mapping every distinct tag to its id
  """
  tag2id = {}
  for entry in tagged_words:
    pos_tag = entry[1]
    # only the first sighting of a tag allocates a new id
    if pos_tag not in tag2id:
      tag2id[pos_tag] = len(tag2id)
  return tag2id

# word_vectors.vocab holds one Vocab object per word rather than a plain
# integer, so read each entry's `index` to obtain a word -> int mapping.
# The words are those of the corpus the embeddings were trained on.
word2id = {word: entry.index for word, entry in word_vectors.vocab.items()}
# Result:  {'The': 14, 'Fulton': 5615, 'County': 1280, 'Grand': 5377, 'said': 59, 'Friday': 1852, 'an': 34, ...
# Tag ids are assigned from the training split only.
tag2id = get_tag_vocabulary(train_words)
# Result: {'NN': 0, 'IN': 1, 'DT': 2, 'VBZ': 3, 'RB': 4, 'VBN': 5, 'TO': 6, 'VB': 7, 'JJ': 8, 'NNS': 9, 'NNP': 10, ',': 11, 'CC': 12, 'POS': 13, '.': 14, 'VBP': 15, 'VBG': 16, 'PRP$': 17, 'CD': 18, '``': 19, "''": 20, 'VBD': 21, 'EX': 22, 'MD': 23, '#': 24, '(': 25, '$': 26, ')': 27, 'NNPS': 28, 'PRP': 29, 'JJS': 30, 'WP': 31, 'RBR': 32, 'JJR': 33, 'WDT': 34, 'WRB': 35, 'RBS': 36, 'PDT': 37, 'RP': 38, ':': 39, 'FW': 40, 'WP$': 41, 'SYM': 42, 'UH': 43}

In [5]:
UNK_INDEX = 0 # it is generally common to associate UNK with index 0
UNK_TOKEN = "UNK"

def get_int_data(tagged_words, word2id, tag2id):
  """
  Replaces all words and tags with their corresponding ids and
  separates words (features) from the tags (labels).

  tagged_words: iterable of (word, tag) pairs
  word2id:      dict mapping words to integer ids; words missing from it
                are encoded as UNK_INDEX, so the mapping should not use
                that id for a real word
  tag2id:       dict mapping tags to integer ids
  returns:      (X, Y) numpy arrays of word ids and tag ids, same length
  raises:       KeyError if a tag is not present in tag2id
  """

  X, Y = [], [] # X will hold word ids, Y will hold ids of their tags
  unk_count = 0 # number of words we have no representation for

  for word, tag in tagged_words:
    # An unseen tag used to be stored as None, which silently produced an
    # object-dtype label array; fail here with a message that names the tag.
    if tag not in tag2id:
      raise KeyError("tag %r is missing from tag2id" % (tag,))
    Y.append(tag2id[tag])

    if word in word2id:
      X.append(word2id[word])
    else:
      X.append(UNK_INDEX)
      unk_count += 1

  # Guard the ratio so an empty input reports 0 instead of dividing by zero.
  unk_ratio = unk_count / len(X) if X else 0.0
  print("Data Created. percentag of unkown words: %.3f" % unk_ratio)
  return np.array(X), np.array(Y)

# NOTE(review): the word ids produced here are only valid for an embedding
# matrix whose rows follow the same word2id mapping that is passed in.
X_train, Y_train = get_int_data(train_words, word2id, tag2id)
X_test, Y_test = get_int_data(test_words, word2id, tag2id)

print("Result Data: %s, %s" %(len(X_train), len(Y_train)))

print(X_train)
print(Y_train)
# One-hot encode the tag ids. num_classes is pinned to the tag vocabulary so
# both splits get the same width even if one of them never contains the tag
# with the highest id.
Y_train = to_categorical(Y_train, num_classes=len(tag2id))
Y_test = to_categorical(Y_test, num_classes=len(tag2id))

Data Created. percentag of unkown words: 0.143
Data Created. percentag of unkown words: 0.149
Result Data: 211727, 211727
[   0    7    0 ... 2749  801    2]
[ 0  1  2 ... 10  4 14]


In [0]:
def add_new_word(new_word, new_vector, new_index, embedding_matrix, word2id):
  """
  Insert one extra word into an embedding matrix and its word -> id mapping.

  new_vector becomes row new_index of the returned matrix, every id that was
  >= new_index moves up by one, and new_word is mapped to new_index.
  The inputs are left untouched; a new matrix and a new dict are returned.
  """
  # np.insert returns a copy with the row placed before position new_index
  extended_matrix = np.insert(embedding_matrix, [new_index], [new_vector], axis=0)

  shifted_ids = {}
  for word, index in word2id.items():
    # ids at or after the insertion point move one slot further
    if index >= new_index:
      shifted_ids[word] = index + 1
    else:
      shifted_ids[word] = index
  shifted_ids[new_word] = new_index
  return extended_matrix, shifted_ids


# Start from the raw word2vec state so that this cell yields the same result
# however often it is run (add_new_word shifts ids on every call).
embedding_matrix = word_vectors.vectors
word2id = {k: v.index for k, v in word_vectors.vocab.items()}

# Unknown words are represented by the mean of all trained vectors.
unk_vector = embedding_matrix.mean(0)
embedding_matrix, word2id = add_new_word(UNK_TOKEN, unk_vector, UNK_INDEX, embedding_matrix, word2id)

# Inserting UNK at UNK_INDEX moved every word id up by one. X_train / X_test
# were encoded with the ids from before the insertion, so each of them would
# look up the row of the neighbouring word. Encode the words again with the
# updated mapping; the tag labels do not depend on word2id and are kept.
X_train = get_int_data(train_words, word2id, tag2id)[0]
X_test = get_int_data(test_words, word2id, tag2id)[0]



In [7]:
HIDDEN_SIZE = 50
BATCH_SIZE = 128

def define_model(embedding_matrix, class_count):
  """
  Create and returns a simple part-of-speech model, which
  takes only one word as input.

  Layers: Embedding (initialised from embedding_matrix) -> Flatten ->
  Dense(HIDDEN_SIZE) with tanh -> Dense(class_count) with softmax.
  The model is compiled with Adam, categorical cross-entropy and accuracy.

  embedding_matrix: 2-D array, one row per word id
  class_count:      number of output classes (POS tags)
  returns:          the compiled Sequential model
  """
  # Read both sizes from the matrix itself so the layer always matches the
  # weights it is given, independent of any global embedding-size constant.
  vocab_length, embedding_dim = np.shape(embedding_matrix)
  model = Sequential() # a sequential model is a stack of layers - we will add them one by one

  # A layer which turns word indexes into vectors
  model.add(Embedding(input_dim=vocab_length,
                      output_dim=embedding_dim, # output of this layer is the embedding of the input word
                      weights=[embedding_matrix], # the matrix holding the trained embeddings
                      input_length=1)) # specifies how many indexes we are looking up
  model.add(Flatten())
  model.add(Dense(HIDDEN_SIZE))
  model.add(Activation("tanh"))
  model.add(Dense(class_count))
  model.add(Activation("softmax"))

  model.compile(optimizer=tf.keras.optimizers.Adam(),
                loss="categorical_crossentropy",
                metrics=["accuracy"])
  return model

# Build the single-word tagger with one output class per known POS tag.
pos_model = define_model(embedding_matrix, class_count=len(tag2id))
pos_model.summary()

# Fit for a single pass over the training data.
pos_model.fit(x=X_train, y=Y_train, batch_size=BATCH_SIZE, epochs=1, verbose=1)



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 300)            4552200   
_________________________________________________________________
flatten (Flatten)            (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                15050     
_________________________________________________________________
activation (Activation)      (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 44)                2244      
_________________________________________________________________
activation_1 (Activation)    (None, 44)                0         
Total params: 4,569,494
Trainable params: 4,569,494
Non-trainable params: 0
______________________________________________

<tensorflow.python.keras.callbacks.History at 0x7f38acecc668>

In [8]:
def evaluate_model(model, id2word, x_test, y_test):
  """
  Evaluates the given model by computing the accuracy of its predictions
  on the given test data and prints out 10 most mistagged words.

  model:   object providing evaluate(x, y) -> (loss, accuracy) and
           predict(x) -> per-class scores, one row per sample
  id2word: sequence giving the word for each word id
  x_test:  word ids, one per sample
  y_test:  one-hot encoded tag labels, one row per sample
  returns: None (results are printed)
  """
  _, acc = model.evaluate(x_test, y_test) # get accuracy of the model
  print("Accuracy: %.2f" % acc)

  # predict_classes is deprecated; taking the argmax over the predicted class
  # scores is the replacement recommended for softmax outputs.
  y_pred = np.argmax(model.predict(x_test), axis=-1)
  y_true = np.argmax(y_test, axis=-1) # turn one-hot rows back into tag ids
  error_counter = collections.Counter() # counts mistakes per word

  # visit only the samples the model got wrong, in their original order
  for i in np.flatnonzero(y_pred != y_true):
    error_counter[id2word[x_test[i]]] += 1

  print("Most commen errors:\n", error_counter.most_common(10))

# List the vocabulary in ascending id order so a word id can be turned back
# into its word by position.
id2word = [word for word, _ in sorted(word2id.items(), key=lambda pair: pair[1])]
evaluate_model(pos_model, id2word, X_test, Y_test)


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Accuracy: 0.80
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report t

In [0]:
EOS_INDEX = 1
EOS_TOKEN = "EOS"

# Insert the token only once: add_new_word shifts every id >= EOS_INDEX, so
# running this cell a second time would add another row to the matrix and
# leave the previously inserted EOS row without any word pointing at it.
if EOS_TOKEN not in word2id:
  # creating a random end-of-sequence vector
  eos_vector = np.random.standard_normal(EMB_DIM)
  embedding_matrix, word2id = add_new_word(EOS_TOKEN, eos_vector, EOS_INDEX, embedding_matrix, word2id)

In [0]:
CONTEXT_SIZE = 2 # define the size of the context-window

def get_window_int_data(tagged_words, word2id, tag2id):
  """
  Replaces all words and tags with their corresponding ids and
  generates an array of label ids Y and the training data X which
  consists of arrays of word indexes (of tagged word and its context).

  Every word yields exactly one window of 2 * CONTEXT_SIZE + 1 word ids
  centred on it; positions beyond either end of the data are filled with
  EOS_TOKEN. Words missing from word2id are encoded as UNK_INDEX.

  tagged_words: iterable of (word, tag) pairs
  word2id:      dict mapping words to integer ids
  tag2id:       dict mapping tags to integer ids
  returns:      (X, Y) numpy arrays; X holds one window per row, Y the tag
                id of each window's middle word
  raises:       KeyError if the tag of a middle word is not in tag2id
  """
  X, Y = [], []
  unk_count = 0

  span = 2 * CONTEXT_SIZE + 1 # the complete span of the sliding window -> [ window target window ]
  buffer = collections.deque(maxlen=span)
  padding = [(EOS_TOKEN, None)] * CONTEXT_SIZE

  # Stream padding + words + padding through the fixed-size buffer and emit a
  # sample only once the buffer is full. This gives one sample per word for
  # any input length, including inputs shorter than CONTEXT_SIZE, which used
  # to produce ragged and duplicated windows.
  for segment in (padding, tagged_words, padding):
    for item in segment:
      buffer.append(item)
      if len(buffer) < span:
        continue # still filling the very first window

      # the label is the tag of the middle word
      middle_word, middle_tag = buffer[CONTEXT_SIZE]
      # An unseen tag used to be stored as None; fail with a clear message.
      if middle_tag not in tag2id:
        raise KeyError("tag %r is missing from tag2id" % (middle_tag,))
      Y.append(tag2id[middle_tag])

      # the input to the model is the ids of all words in the window
      X.append(np.array([word2id.get(word, UNK_INDEX) for (word, _) in buffer]))

      if middle_word not in word2id:
        unk_count += 1

  # Guard the ratio so an empty input reports 0 instead of dividing by zero.
  unk_ratio = unk_count / len(Y) if Y else 0.0
  print("Data Created, Percentage of unknown words: %.3f" % unk_ratio)
  return np.array(X), np.array(Y)

In [0]:
def define_context_sensitive_model(embedding_matrix, class_count):
  """
  Create and returns a part-of-speech model, which
  takes as input a tagged word and its context.

  The input is a window of 2 * CONTEXT_SIZE + 1 word ids. Layers:
  Embedding (initialised from embedding_matrix) -> Flatten ->
  Dense(HIDDEN_SIZE) with tanh -> Dense(class_count) with softmax.
  The model is compiled with Adam, categorical cross-entropy and accuracy.

  embedding_matrix: 2-D array, one row per word id
  class_count:      number of output classes (POS tags)
  returns:          the compiled Sequential model
  """
  # Read both sizes from the matrix itself so the layer always matches the
  # weights it is given, independent of any global embedding-size constant.
  vocab_length, embedding_dim = np.shape(embedding_matrix)
  total_span = CONTEXT_SIZE * 2 + 1 # number of word ids per input window

  model = Sequential()
  # (a stray trailing comma after this call used to build a throwaway tuple)
  model.add(Embedding(input_dim=vocab_length,
                      output_dim=embedding_dim,
                      weights=[embedding_matrix],
                      input_length=total_span))

  model.add(Flatten())
  model.add(Dense(HIDDEN_SIZE))
  model.add(Activation("tanh"))
  model.add(Dense(class_count))
  model.add(Activation("softmax"))

  model.compile(optimizer=tf.keras.optimizers.Adam(),
                loss="categorical_crossentropy",
                metrics=["accuracy"])
  return model



In [0]:
def evaluate_model_ext(model, id2word, x_test, y_test):
  """
  Evaluates the given model by computing the accuracy of its predictions
  on the given test data and prints out 10 most mistagged words.

  Works for single-word samples (each x_test[i] is one word id) and for
  window samples (each x_test[i] is an array of word ids); for windows the
  word reported is the one in the middle of the window.

  model:   object providing evaluate(x, y) -> (loss, accuracy) and
           predict(x) -> per-class scores, one row per sample
  id2word: sequence giving the word for each word id
  x_test:  word ids or windows of word ids, one per sample
  y_test:  one-hot encoded tag labels, one row per sample
  returns: None (results are printed)
  """
  _, acc = model.evaluate(x_test, y_test) # get accuracy of the model
  print("Accuracy: %.2f" % acc)

  # predict_classes is deprecated; taking the argmax over the predicted class
  # scores is the replacement recommended for softmax outputs.
  y_pred = np.argmax(model.predict(x_test), axis=-1)
  y_true = np.argmax(y_test, axis=-1) # turn one-hot rows back into tag ids
  error_counter = collections.Counter() # counts mistakes per word

  # visit only the samples the model got wrong, in their original order
  for i in np.flatnonzero(y_pred != y_true):
    sample = x_test[i]
    if isinstance(sample, np.ndarray):
      # take the centre from the window itself so it stays correct for
      # whatever window size the data was built with
      word_id = sample[len(sample) // 2]
    else:
      word_id = sample
    error_counter[id2word[word_id]] += 1

  print("Most commen errors:\n", error_counter.most_common(10))

In [17]:
X_train2, Y_train2 = get_window_int_data(train_words, word2id, tag2id)
X_test2, Y_test2 = get_window_int_data(test_words, word2id, tag2id)
# Pin num_classes to the tag vocabulary so both splits get the same label
# width even if one of them never contains the tag with the highest id.
Y_train2 = to_categorical(Y_train2, num_classes=len(tag2id))
Y_test2 = to_categorical(Y_test2, num_classes=len(tag2id))


cs_pos_model = define_context_sensitive_model(embedding_matrix, len(tag2id))
cs_pos_model.fit(X_train2,
                 Y_train2,
                 batch_size=BATCH_SIZE,
                 epochs=1,
                 verbose=1)

# word2id was reassigned when the EOS token was inserted, which moved word
# ids. Build the id -> word list from the current mapping, the same one the
# windows above were encoded with, so the reported words are the right ones.
id2word_cs = sorted(word2id, key=word2id.get)
evaluate_model_ext(cs_pos_model, id2word_cs, X_test2, Y_test2)

Data Created, Percentage of unknown words: 0.143
Data Created, Percentage of unknown words: 0.149
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Accuracy: 0.91
Please report this to the TensorFlow te