## Skipgrams in Keras

-  We will implement Skipgrams in `Keras`.

#### Loading in and preprocessing data
- Load the Alice in Wonderland text into `corpus` from the local file `Alice_In_Wonderland.txt`.
- `Keras` has some nice text preprocessing features too!
- Split the text into sentences.
- Use `Keras`' `Tokenizer` to tokenize sentences into words.

In [19]:
# Imports
# Basics
from __future__ import print_function, division
import pandas as pd 
import numpy as np
import random
from IPython.display import SVG
%matplotlib inline

# nltk
from nltk import sent_tokenize

# keras
np.random.seed(13)
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape, Activation
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import model_to_dot 
from keras.preprocessing.sequence import skipgrams
import keras
import pydot as pyd

# keras.utils.vis_utils.pydot = pyd
# #Visualize Model
# def visualize_model(model):
#   return SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [20]:
# We'll use Alice in Wonderland.
# Read the whole book into one string; the context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open('Alice_In_Wonderland.txt') as corpus_file:
    corpus = corpus_file.read()

In [21]:
# Split document into sentences first.
# Everything up to and including the first blank line is treated as the file
# header and dropped (str.index raises ValueError if there is no blank line).
corpus = corpus[corpus.index('\n\n')+2:]  # remove header.
sentences = sent_tokenize(corpus)

# Tokenize using Keras; the filter strips punctuation, tabs, newlines and
# the apostrophe before the text is split into words.
base_filter='!"#$%&()*+,-./:;`<=>?@[\\]^_{|}~\t\n' + "'"
tokenizer = Tokenizer(filters=base_filter)
tokenizer.fit_on_texts(sentences)

# Convert tokenized sentences to sequence format (one list of word ids per sentence)
sequences = tokenizer.texts_to_sequences(sentences)
# Total number of word tokens across all sentences. This must iterate over
# `sequences`: iterating over `corpus` (a single string) walks its characters
# and would just yield the character count.
nb_samples = sum(len(s) for s in sequences)

# Both numbers should agree: one encoded sequence per fitted sentence.
print(len(sequences), tokenizer.document_count)

1616 1616


In [48]:
# Show one sentence twice: first as raw text, then as the list of integer
# word ids the tokenizer produced for it.
for view in (sentences, sequences):
    print(view[324])
# Optional cross-check: rebuild the ids by hand from tokenizer.word_index.
# print(list(tokenizer.word_index[word.lower().replace('.', '').replace(',', '')]
#            for word in sentences[324].split()))


And Alice was so much frightened
that she ran off at once in the direction it pointed to, without
trying to explain the mistake it had made.
[2, 11, 13, 27, 93, 508, 14, 6, 230, 61, 18, 135, 12, 1, 659, 5, 1753, 3, 170, 264, 3, 375, 1, 948, 5, 23, 150]
[2, 11, 13, 27, 93, 508, 14, 6, 230, 61, 18, 135, 12, 1, 659, 5, 1753, 3, 170, 264, 3, 375, 1, 948, 5, 23, 150]


#### Skipgrams: Generating Input and Output Labels
- Now that we have sentences and word tokenization, we are in a good position to create our training set for skipgrams.
- Now we need to generate our `X_train` and `y_train`

In [5]:
# Try Keras' skipgrams helper on a single encoded sentence.
# With negative_samples=0 only true (word, context) pairs are generated.
couples, labels = skipgrams(
    sequences[324],
    len(tokenizer.word_index) + 1,
    window_size=2,
    negative_samples=0,
    shuffle=True,
    categorical=False,
    sampling_table=None,
)

# Reverse lookup: integer id -> word.
index_2_word = dict((idx, word) for word, idx in tokenizer.word_index.items())

# Print every pair whose input word has id 13.
for w1, w2 in couples:
    if w1 != 13:
        continue
    print(index_2_word[w1], index_2_word[w2])

was so
was alice
was and
was much


In [6]:
# Settings shared by the data generator and the model defined below.

# Context window passed to skipgrams (words considered on each side)
window_size = 2
# Size of the embedding each word is reduced to
dim = 100
# Number of distinct word ids, plus one so that the largest id in
# tokenizer.word_index is still a valid index
vocab_size = len(tokenizer.word_index) + 1


def generate_data(sequences, window_size, vocab_size):
    """Yield one (X, y) training batch per encoded sentence.

    Args:
        sequences: iterable of sentences, each a list of integer word ids.
        window_size: context window size forwarded to ``skipgrams``.
        vocab_size: number of classes used for the one-hot targets.

    Yields:
        Tuples ``(X, y)`` where ``X`` has shape ``(n_pairs, 1)`` and holds the
        input word ids, and ``y`` has shape ``(n_pairs, vocab_size)`` and holds
        the one-hot encoded context words. Sentences for which ``skipgrams``
        returns no pairs are skipped.
    """
    for sentence in sequences:
        # Only positive pairs are requested, so the labels are not needed.
        pairs, _ = skipgrams(
            sentence,
            vocab_size,
            window_size=window_size,
            negative_samples=0,
            shuffle=True,
            categorical=False,
            sampling_table=None,
        )
        if len(pairs) == 0:
            continue

        centers = [pair[0] for pair in pairs]
        targets = [np_utils.to_categorical(pair[1], vocab_size) for pair in pairs]

        n_pairs = len(centers)
        X = np.array(centers).reshape(n_pairs, 1)
        y = np.array(targets).reshape(n_pairs, vocab_size)
        yield X, y
        
# One-off generator over the whole corpus. A generator can only be consumed
# once; the training loop further down calls generate_data itself on every
# iteration rather than reusing this object.
data_generator = generate_data(
    sequences=sequences, window_size=window_size, vocab_size=vocab_size)

### Skipgrams: Creating the Model
- Lastly, we create the (shallow) network!

In [49]:
# Build the shallow skip-gram network and print its summary:
#   word id -> embedding of size `dim` -> softmax over the vocabulary.
skipgram = Sequential([
    # One word id in, one `dim`-sized vector out: shape (batch, 1, dim).
    Embedding(input_dim=vocab_size, output_dim=dim,
              embeddings_initializer='glorot_uniform', input_length=1),
    # Drop the length-1 axis so the Dense layer sees (batch, dim).
    Reshape((dim,)),
    # Probability distribution over every word id.
    Dense(input_dim=dim, units=vocab_size, activation='softmax'),
])
# Optional diagram of the model:
# SVG(model_to_dot(skipgram, show_shapes=True).create(prog='dot', format='svg'))
skipgram.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 1, 100)            257500    
_________________________________________________________________
reshape_7 (Reshape)          (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 2575)              260075    
Total params: 517,575
Trainable params: 517,575
Non-trainable params: 0
_________________________________________________________________


### Skipgrams: Compiling and Training
- Time to compile and train
- We use categorical cross-entropy, a common loss for classification

In [50]:
# Compile the Keras model.
from keras.optimizers import SGD
# NOTE(review): this SGD optimizer is constructed but never handed to
# compile(); training below actually runs with "adadelta".
sgd = SGD(lr=1e-4, decay=1e-6, momentum=0.9)

skipgram.compile(optimizer="adadelta", loss='categorical_crossentropy')

# Fit the skip-gram model: ten passes over the corpus, one batch per sentence.
for iteration in range(10):
    batch_losses = []
    # A fresh generator is needed for every pass.
    for x, y in generate_data(sequences, window_size, vocab_size):
        batch_losses.append(skipgram.train_on_batch(x, y))
    # The reported figure is the sum of the per-batch losses, not their mean.
    loss = sum(batch_losses)
    print('iteration {}, loss is {}'.format(iteration, loss))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


iteration 0, loss is 11890.672101020813
iteration 1, loss is 10656.24239206314
iteration 2, loss is 10055.822371721268
iteration 3, loss is 9767.855532765388
iteration 4, loss is 9602.982296943665
iteration 5, loss is 9491.485312223434
iteration 6, loss is 9406.656256198883
iteration 7, loss is 9338.097015500069
iteration 8, loss is 9280.84478700161
iteration 9, loss is 9231.705855488777


### Skipgrams: Looking at the vectors

To get word_vectors now, we look at the weights of the first layer.

Let's also write functions giving us similarity of two words.

In [9]:
# The first weight array of the model is the embedding matrix: row i is the
# learned vector for word id i.
all_weights = skipgram.get_weights()
word_vectors = all_weights[0]


from scipy.spatial.distance import cosine


def get_dist(w1, w2):
    """Return the cosine distance between the embeddings of two words.

    Both words are looked up in ``tokenizer.word_index``; a word that is not
    in the vocabulary raises ``KeyError``.
    """
    vocab = tokenizer.word_index
    idx_a = vocab[w1]
    idx_b = vocab[w2]
    vec_a = word_vectors[idx_a]
    vec_b = word_vectors[idx_b]
    return cosine(vec_a, vec_b)

def get_similarity(w1, w2):
    """Return the cosine similarity of two words (1 minus their cosine distance)."""
    distance = get_dist(w1, w2)
    return 1 - distance

def get_most_similar(w1, n=10):
    """Return the ``n`` vocabulary words most similar to ``w1``.

    The result is a pandas Series indexed by word, holding cosine
    similarities sorted from highest to lowest; ``w1`` itself is excluded.
    """
    scores = {}
    for candidate in tokenizer.word_index.keys():
        if candidate == w1:
            continue
        scores[candidate] = get_similarity(w1, candidate)
    ranked = pd.Series(scores).sort_values(ascending=False)
    return ranked.iloc[:n]


# Quick sanity check of the learned vectors: one pairwise similarity,
# a blank separator line, then the nearest neighbours of 'queen'.
print(get_similarity(w1='king', w2='queen'))
print('')
print(get_most_similar('queen', n=10))

0.9347442984580994

gryphon        0.947695
king           0.934744
duchess        0.929979
hatter         0.921407
caterpillar    0.910961
dormouse       0.897540
march          0.890778
first          0.882151
mouse          0.877515
cat            0.874623
dtype: float64


## Your turn -- Modify the code above to create a CBOW Model