<a href="https://colab.research.google.com/github/geri-m/word2vec/blob/master/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# https://blog.cambridgespark.com/tutorial-build-your-own-embedding-and-use-it-in-a-neural-network-e9cde4a81296
import multiprocessing

import nltk
from gensim.models import Word2Vec
from nltk.corpus import brown

nltk.download('brown')
nltk.download('conll2000')

# Data is processed and Tokenised!
sentences = brown.sents()
print(sentences[:3])

EMB_DIM = 300

w2v = Word2Vec(sentences, size=EMB_DIM, window=5, min_count=5, negative=15, iter=10,
               workers=multiprocessing.cpu_count())

word_vectors = w2v.wv  # get trained embeddings - an KeyedVector instaces
result = word_vectors.similar_by_word("Saturday")
print("Most Similar to 'Saturday': %s" % result[:3])

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'poss

  if np.issubdtype(vec.dtype, np.int):


In [2]:
from nltk.corpus import conll2000
from gensim.models import Word2Vec # https://code.google.com/archive/p/word2vec/
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, Activation, Flatten
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical
import numpy as np
import collections


train_words = conll2000.tagged_words('train.txt')
test_words = conll2000.tagged_words('test.txt')
print(train_words[:20])


[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'), ('if', 'IN'), ('trade', 'NN'), ('figures', 'NNS'), ('for', 'IN'), ('September', 'NNP'), (',', ','), ('due', 'JJ'), ('for', 'IN')]


In [0]:
def get_tag_vocabulary(tagged_words):
  """
  Accepts text in the form of (word, pos) tuples and returns
  a dictionary mapping POS-tags to unique ids
  """
  tag2id = {}
  for item in tagged_words:
    tag = item[1]
    tag2id.setdefault(tag, len(tag2id))
  return tag2id

# the word_vectors.vocab dictionary stores Vocab objects, rather than integers
# but we woudl like our dictionary to map words to ints
word2id = {k: v.index for k, v in word_vectors.vocab.items()}
tag2id = get_tag_vocabulary(train_words) 

In [12]:
def get_int_data(tagged_words, word2id, tag2id):
  """
  Replaces all words and tags with their corresponding ids and
  separates words (features) from the tags (labels). 
  """
  X, Y = [], [] # X will hold word ids, Y will hold ids of their tags
  unk_count = 0 # to keep track of the number of unkonwn words
                # - words we don't have a representation for

  for word, tag in tagged_words:
    Y.append(tag2id.get(tag))
    if word in word2id:
      X.append(word2id.get(word))
    else:
      X.append(UNK_INDEX) # <---- NEW ADDED!
      unk_count += 1
    print("Data Created. percentag of unkown words: %.3f" % (unk_count/len(tagged_words)))
    return np.array(X), np.array(Y)

X_train, Y_train = get_int_data(train_words, word2id, tag2id)
X_test, Y_test = get_int_data(test_words, word2id, tag2id)

# we need to one-hot encode the tag indexes
Y_train, Y_test = to_categorical(Y_train), to_categorical(Y_test)

Data Created. percentag of unkown words: 0.000
Data Created. percentag of unkown words: 0.000


In [0]:
def add_new_word(new_word, new_vector, new_index, embedding_matrix, word2id):
  """
  Adds a new word to the existing matrix of word embeddings.
  """
  # inserts the vector before given index, along axis 0
  embedding_matrix = np.insert(embedding_matrix, [new_index], [new_vector], axis=0)

  # updating the indexes of words that follow the new word
  word2id = {word: (index + 1) if index >= new_index else index 
             for word, index in word2id.items()}
  word2id[new_word] = new_index
  return embedding_matrix, word2id

UNK_INDEX = 0 # it is generally common to associate UNK with index 0
UNK_TOKEN = "UNK"

embedding_matrix = word_vectors.vectors
unk_vector = embedding_matrix.mean(0)
embedding_matrix, word2id = add_new_word(UNK_TOKEN, unk_vector, UNK_INDEX, embedding_matrix, word2id)



In [0]:
def 