# word2vec + keras Tutorial

Code taken from https://www.kaggle.com/marijakekic/cnn-in-keras-with-pretrained-word2vec-weights


For a tutorial that covers only word2vec, see https://rare-technologies.com/word2vec-tutorial/

In [2]:
# Set up INFO-level logging so gensim reports its training progress.
import logging

import gensim

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# A toy corpus: two already-tokenized sentences.
sentences = [['first', 'sentence'], ['second', 'sentence']]

# Train word2vec on the toy corpus; min_count=1 so no word is dropped
# for being too rare.
model = gensim.models.Word2Vec(sentences, min_count=1)

2018-04-30 14:37:14,918 : INFO : collecting all words and their counts
2018-04-30 14:37:14,921 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-04-30 14:37:14,923 : INFO : collected 3 word types from a corpus of 4 raw words and 2 sentences
2018-04-30 14:37:14,925 : INFO : Loading a fresh vocabulary
2018-04-30 14:37:14,925 : INFO : min_count=1 retains 3 unique words (100% of original 3, drops 0)
2018-04-30 14:37:14,926 : INFO : min_count=1 leaves 4 word corpus (100% of original 4, drops 0)
2018-04-30 14:37:14,927 : INFO : deleting the raw counts dictionary of 3 items
2018-04-30 14:37:14,928 : INFO : sample=0.001 downsamples 3 most-common words
2018-04-30 14:37:14,929 : INFO : downsampling leaves estimated 0 word corpus (5.7% of prior 4)
2018-04-30 14:37:14,929 : INFO : estimated required memory for 3 words and 100 dimensions: 3900 bytes
2018-04-30 14:37:14,931 : INFO : resetting layer weights
2018-04-30 14:37:14,933 : INFO : training model with 3 workers o

In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [25]:
# Sample text: four lines, each treated as one sentence.
t = """Foo bar baz hola
luego viene el mono que te pega
y ahi se arma el bardo con la abuela
que corre y corre hasta que consigue la chancleta"""

sentences = t.split('\n')

# Word-level tokenizer (char_level=False) that lowercases the text, drops
# the characters listed in `filters`, and splits on single spaces.
tokenizer = Tokenizer(
    num_words=30,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
    lower=True,
    char_level=False,
    split=' ',
)
tokenizer.fit_on_texts(sentences)

# Turn every sentence into its list of integer word indices and show the
# word -> index mapping learned by the tokenizer.
sequences = tokenizer.texts_to_sequences(sentences)
print(tokenizer.word_index.items())

# Pad the sequences to a common length; in the output below the shorter
# sentences are filled with zeros on the left.
padded = pad_sequences(sequences)
padded


dict_items([('arma', 17), ('viene', 11), ('te', 13), ('abuela', 20), ('ahi', 15), ('bardo', 18), ('luego', 10), ('se', 16), ('hola', 9), ('chancleta', 23), ('corre', 5), ('bar', 7), ('baz', 8), ('hasta', 21), ('la', 4), ('consigue', 22), ('pega', 14), ('que', 1), ('foo', 6), ('mono', 12), ('el', 2), ('y', 3), ('con', 19)])


array([[ 0,  0,  0,  0,  0,  6,  7,  8,  9],
       [ 0,  0, 10, 11,  2, 12,  1, 13, 14],
       [ 3, 15, 16, 17,  2, 18, 19,  4, 20],
       [ 1,  5,  3,  5, 21,  1, 22,  4, 23]], dtype=int32)

In [26]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

# Load the pretrained GoogleNews vectors stored in binary word2vec format
# (the log below reports a 3000000 x 300 matrix).
word_vectors = KeyedVectors.load_word2vec_format(
    '../../datasets/wordembed/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)


2018-04-30 16:27:55,426 : INFO : loading projection weights from ../../datasets/wordembed/GoogleNews-vectors-negative300.bin.gz
2018-04-30 16:30:15,943 : INFO : loaded (3000000, 300) matrix from ../../datasets/wordembed/GoogleNews-vectors-negative300.bin.gz


In [28]:
# Build a Keras embedding layer from the loaded word vectors. The config
# printed further down reports input_dim=3000000, output_dim=300 and
# trainable=False for this layer.
# NOTE(review): get_keras_embedding() appears to have been removed in
# gensim >= 4.0 -- confirm the installed gensim version before re-running.
embed = word_vectors.get_keras_embedding()

In [32]:
# Set the layer's input_length attribute to 60, then display the layer
# configuration. The output below reports 'input_length': 60 while
# 'batch_input_shape' is (None, None).
embed.input_length = 60
embed.get_config()

{'activity_regularizer': None,
 'batch_input_shape': (None, None),
 'dtype': 'float32',
 'embeddings_constraint': None,
 'embeddings_initializer': {'class_name': 'RandomUniform',
  'config': {'maxval': 0.05, 'minval': -0.05, 'seed': None}},
 'embeddings_regularizer': None,
 'input_dim': 3000000,
 'input_length': 60,
 'mask_zero': False,
 'name': 'embedding_1',
 'output_dim': 300,
 'trainable': False}