## Notebook on word2vec
The first step in building a Skipgram model is to preprocess the data so that it has the shape the model expects. We create two functions that preprocess the text of *Alice in Wonderland* so that the result can be used to train Skipgram.


In [1]:
import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape, Lambda
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import model_to_dot
from keras.preprocessing import sequence
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from itertools import islice
from matplotlib import pylab
from __future__ import division

Using TensorFlow backend.


In [2]:
# DO NOT Modify the lines in this cell
# Load the raw text; only the first 700 lines of the file are used.
# NOTE(review): the file handle returned by open() is never explicitly closed.
path = 'alice.txt'
corpus = open(path).readlines()[0:700]

# Keep only lines that contain at least two spaces (blank and very short lines are dropped).
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]

# Tokenize with punctuation (including the apostrophe) filtered out, then
# replace every line by its sequence of integer word indices.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+"'")
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)
# Total number of tokens over all kept lines.
nb_samples = sum(len(s) for s in corpus)
# Vocabulary size used for the one-hot targets and the model's input/output width.
# The +1 presumably accounts for index 0, which the Keras Tokenizer does not assign to a word -- TODO confirm.
V = len(tokenizer.word_index) + 1

# Is this something they need to change?
# dim is the embedding width passed to the model below; window_size is the
# context radius passed to generate_data_skipgram. window_size_corpus is not
# used anywhere in the visible part of this notebook.
dim = 100
window_size = 2
window_size_corpus = 4
# NOTE(review): this prints the entire tokenized corpus into the cell output.
print(corpus)

[[242, 6, 26, 1, 63, 243], [11, 9, 584, 3, 67, 27, 244, 8, 585, 71, 14, 380, 21, 1], [381, 2, 8, 245, 112, 3, 49, 98, 57, 586, 4, 17, 587, 72, 1], [205, 14, 380, 9, 588, 19, 7, 17, 50, 246, 57, 589, 10], [7, 2, 44, 33, 1, 152, 8, 5, 205, 53, 11, 174, 246, 57], [30, 4, 9, 382, 10, 14, 247, 248, 15, 127, 15, 4, 58, 18, 1], [249, 250, 206, 14, 383, 27, 384, 2, 385, 386, 1, 590], [8, 387, 5, 591, 592, 59, 20, 388, 1, 389, 8, 138, 43, 2], [593, 1, 594, 46, 175, 5, 128, 63, 23, 595, 176, 207], [295, 71, 14], [55, 9, 112, 30, 27, 390, 10, 13, 596, 99, 11, 60, 7, 30], [27, 91, 31, 8, 1, 45, 3, 296, 1, 63, 83, 3, 297, 51, 64], [51, 64, 6, 92, 20, 298, 46, 4, 53, 7, 100, 597, 7], [598, 3, 14, 13, 4, 299, 3, 84, 599, 34, 28, 19, 34, 1, 66], [7, 22, 113, 78, 391, 19, 46, 1, 63, 600, 139, 5, 392], [31, 8, 114, 393, 251, 2, 129, 34, 7, 2, 79, 252, 21], [11, 394, 3, 14, 101, 18, 7, 601, 395, 14, 248, 13, 4, 17], [115, 130, 208, 5, 63, 23, 140, 5, 393, 251, 57, 5, 392], [3, 253, 31, 8, 7, 2, 602, 23, 

In [3]:
#generate data for Skipgram
def generate_data_skipgram(corpus, window_size, V):
    """Build Skipgram training pairs from a tokenized corpus.

    For every word occurrence, the context is made of the words at most
    ``window_size`` positions to its left or right within the same sentence.
    Occurrences without any context word (single-word sentences) are skipped.

    Args:
        corpus: iterable of sentences, each a sequence of integer word indices.
        window_size: number of positions looked at on each side of the word.
        V: number of classes passed to ``np_utils.to_categorical``.

    Returns:
        A tuple ``(all_in, all_out)`` of two parallel lists with one entry per
        kept word occurrence: an int32 array of shape (n, 1) repeating the
        center word once per context word, and the categorical encoding of
        the n context words as returned by ``np_utils.to_categorical``.
    """
    center_batches = []
    target_batches = []
    for sentence in corpus:
        sentence_len = len(sentence)
        for position, center in enumerate(sentence):
            # Clip the window to the sentence instead of testing every index
            first = max(position - window_size, 0)
            stop = min(position + window_size + 1, sentence_len)
            context = [sentence[j] for j in range(first, stop) if j != position]
            if not context:
                continue
            repeated_center = [[center] for _ in context]
            center_batches.append(np.array(repeated_center, dtype=np.int32))
            target_batches.append(np_utils.to_categorical(context, V))
    return (center_batches, target_batches)

In [4]:
# Build the Skipgram training set from the tokenized corpus:
# x holds the center-word inputs, y the matching categorical context targets.
x, y = generate_data_skipgram(corpus=corpus, window_size=window_size, V=V)

In [5]:
#save the preprocessed data of Skipgram
# One line per entry of x / y, formatted as
# "<space-separated input ids>,<space-separated flattened targets>".
# The `with` block closes the file even if a write fails, and the loop
# variables no longer shadow the builtin `input` for the rest of the session.
with open('data_skipgram.txt', 'w') as f:
    for center_ids, targets in zip(x, y):
        # np.concatenate joins the rows of each 2-D array into one flat sequence
        flat_ids = np.concatenate(center_ids)
        flat_targets = np.concatenate(targets)
        f.write(" ".join(map(str, flat_ids)))
        f.write(",")
        f.write(" ".join(map(str, flat_targets)))
        f.write("\n")

In [6]:
#load the preprocessed Skipgram data
def generate_data_skipgram_from_file():
    """Yield one ``(inputs, outputs)`` pair per line of 'data_skipgram.txt'.

    Each line is expected to look like "<ids>,<values>", both parts being
    space-separated numbers. With n the number of ids on the line:

    * ``inputs`` is an integer array of shape (n, 1);
    * ``outputs`` is a float array of shape (n, number_of_values / n).

    The file is opened in a ``with`` block, so it is closed as soon as the
    generator is exhausted or closed instead of leaking one handle per pass
    over the data. Blank lines are skipped.

    Raises:
        ValueError: if a non-blank line does not contain exactly one comma, or
            if the number of values is not a multiple of the number of ids.
    """
    with open('data_skipgram.txt', 'r') as f:
        for row in f:
            if not row.strip():
                continue
            inputs, outputs = row.split(",")
            # One id per row: same result as splitting into len(inputs) pieces
            inputs = np.fromstring(inputs, dtype=int, sep=' ').reshape(-1, 1)
            outputs = np.fromstring(outputs, dtype=float, sep=' ')
            # One row of target values per input id
            outputs = outputs.reshape(len(inputs), -1)
            yield (inputs, outputs)

In [7]:
# Skipgram network: an embedding lookup for the single input word, reshaped
# to a flat vector of size `dim`, followed by a softmax layer with V units.
skipgram = Sequential([
    Embedding(input_dim=V, output_dim=dim, embeddings_initializer='glorot_uniform', input_length=1),
    Reshape((dim, )),
    Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='softmax'),
])

In [8]:
# Categorical cross-entropy loss on the softmax output, optimised with Adadelta.
skipgram.compile(
    optimizer='adadelta',
    loss='categorical_crossentropy',
)

In [9]:
# Train for 10 passes over the saved data; every line of the data file is one
# batch. The summed batch loss of each pass is printed next to the pass index.
# The loop variables are deliberately not called x / y, so the dataset built
# earlier in the notebook is not overwritten by the last batch.
for ite in range(10):
    loss = 0.
    for batch_inputs, batch_targets in generate_data_skipgram_from_file():
        loss += skipgram.train_on_batch(batch_inputs, batch_targets)

    print(ite, loss)

(0, 42009.742413043976)
(1, 38355.414453744888)
(2, 38929.412898778915)
(3, 39347.192147493362)
(4, 39511.735649347305)
(5, 39678.08170747757)
(6, 39844.510386705399)
(7, 40025.911492109299)
(8, 40216.234758615494)
(9, 40407.229763031006)


In [10]:
# Open the vectors file and write its header line: "<V-1> <dim>".
# The handle is left open on purpose: the next cell appends the vectors and closes it.
# NOTE(review): this looks like the word2vec text format read by gensim's KeyedVectors -- confirm.
f = open('vectors_skipgram.txt', 'w')
f.write("{} {}".format(V - 1, dim))
f.write("\n")

In [11]:
# Write one "<word> <v1> ... <vn>" line per vocabulary word, taking the row of
# the model's first weight array (the Embedding layer's matrix) at the word's index.
try:
    vectors = skipgram.get_weights()[0]
    for word, i in tokenizer.word_index.items():
        f.write(word)
        f.write(" ")
        f.write(" ".join(map(str, vectors[i, :])))
        f.write("\n")
finally:
    # Always release the handle opened in the previous cell, even if a write fails
    f.close()

In [None]:
#TEST Analogy?