In [4]:
from pipe import *

Download a basic text corpus using Keras i.e. the IMDB one

In [5]:
from keras.datasets import imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=1000)

In [6]:
review = 200

We can check out what one of the reviews actually says!

In [7]:
word_index = imdb.get_word_index()

reverse_word_index = list( word_index.items() ) \
    | select( lambda i: (i[1],i[0]) ) \
    | as_dict

decoded_review = train_data[review] \
    | select( lambda i: reverse_word_index.get(i - 3, '?') ) \
    | as_list

print( ' '.join(decoded_review) )

? this is a bit long 2 hours 20 minutes but it had a a lot of the famous ? ? novel in it in other words a lot of ? to ? br br it was ? ? at times but had some ? dramatic moments too ? off by a ? ? at the end of the film that was ? to view ? this film is about ? years old the special effects ? on this film did a ? job br br paul ? and ? ? were ? ? actors in their day and they don't ? here both giving powerful performances the only problem is ? as all the ? are played by ? and some of them like ? ? just don't look real i'd like to see a re make of this movie with all ? actors not for ? ? but to simply make the story look and sound more ?


In [8]:
from keras.utils import to_categorical

sentence = to_categorical(train_data[review], num_classes=1000)

sentence.shape

(160, 1000)

In [9]:
sentence

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

In [69]:
train_labels

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

Concatenate all the reviews together for a contrived corpus

In [10]:
all_reviews = range(25000) \
    | select( lambda i: train_data[0] ) \
    | chain \
    | as_list

In [11]:
import numpy as np
from keras.preprocessing import sequence
from keras.preprocessing.sequence import skipgrams

vocab_size = 1000
window_size = 3

sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(all_reviews, vocab_size, window_size=window_size, sampling_table=sampling_table)

word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

In [40]:
word_context.shape

(3648132,)

In [36]:
from keras.models import Sequential

vocab_size = 1000
embed_size = 300

word_model = Sequential()
word_model.add(Embedding(vocab_size, embed_size,
                         embeddings_initializer="glorot_uniform",
                         input_length=1))
word_model.add(Reshape((embed_size,)))

context_model = Sequential()
context_model.add(Embedding(vocab_size, embed_size,
                            embeddings_initializer="glorot_uniform",
                            input_length=1))
context_model.add(Reshape((embed_size,)))

model = Sequential()
model.add(Merge([word_model, context_model], mode="dot", dot_axes=0))
model.add(Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid"))

model.compile(loss="mean_squared_error", optimizer="adam")

merge_layer = model.layers[0]
word_model = merge_layer.layers[0]
word_embed_layer = word_model.layers[0]
weights = word_embed_layer.get_weights()[0]

NameError: name 'Merge' is not defined

In [15]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 300)       300000      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 300, 1)       0           embedding_1[0][0]                
__________

Code to plot the model

In [34]:
from keras.utils import plot_model
from keras.utils.vis_utils import model_to_dot

svg_code = model_to_dot(validation_model).create(prog='dot', format='svg')

with open("validation_model.svg", "wb") as text_file:
    text_file.write(svg_code)


In [98]:
word_target

array([ 88,  50, 100, ..., 626,  43,  14])

In [95]:
arr_2

array([16.])

In [93]:
loss

0.7042707

In [123]:
valid_window = 1000  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
valid_size = 16     # Random set of words to evaluate similarity on.

print( list(valid_examples) | select( lambda i: reverse_word_index[i-3] ) | as_list )

['whole', 'close', 'they', 'killer', 'book', 'call', 'son', 'white', 'please', 'word', 'hardly', '10', 'novel', 'decent', 'talk', 'side']


In [130]:
get_sim(570)

array([0.05514634, 0.05400386, 0.00447231, 0.05850446, 0.0075467 ,
       0.00666854, 0.02504643, 0.02230966, 0.02370823, 0.02623644,
       0.05946091, 0.06144324, 0.01182596, 0.01926584, 0.02713901,
       0.02297904, 0.01097171, 0.0167491 , 0.02052677, 0.03402446,
       0.05943317, 0.02657021, 0.01547801, 0.06284042, 0.05770507,
       0.01490169, 0.01666608, 0.06153949, 0.03193345, 0.06212477,
       0.02936892, 0.06156151, 0.02777913, 0.01954469, 0.06051388,
       0.02978747, 0.01926054, 0.06079759, 0.01526949, 0.03187932,
       0.05914519, 0.06070565, 0.0590456 , 0.01586687, 0.06007691,
       0.06347959, 0.02991313, 0.06612297, 0.03813794, 0.05921679,
       0.01594413, 0.02717398, 0.03928409, 0.06493407, 0.06029411,
       0.05657085, 0.03994871, 0.05702174, 0.05965512, 0.06459205,
       0.05866748, 0.05224859, 0.02990228, 0.05771722, 0.05565057,
       0.02339513, 0.02571374, 0.05855092, 0.05900454, 0.05953146,
       0.05767694, 0.02109058, 0.06303152, 0.0629799 , 0.06352

In [42]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer, one_hot
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import numpy as np
import operator

np.random.seed(42)

BATCH_SIZE = 128
NUM_EPOCHS = 20

Xtrain, Xtest, Ytrain, Ytest = train_test_split(word_target, word_context, test_size=0.3,random_state=42)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)

(2553692,) (1094440,) (2553692,) (1094440,)


In [81]:
maxlen=200
print('Loading data...')
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=vocab_size)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')
print('Pad sequences (samples x time)')
input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
input_train shape: (25000, 200)
input_test shape: (25000, 200)


In [None]:
def data_generator():
    

In [94]:
from keras.layers import Flatten

vocab_size = 1000

model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=200))
model.add(Flatten())

model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

history = model.fit(input_train, train_labels, batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_42 (Embedding)     (None, 200, 32)           32000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 6400)              0         
Total params: 32,000
Trainable params: 32,000
Non-trainable params: 0
_________________________________________________________________


ValueError: Error when checking target: expected flatten_1 to have shape (6400,) but got array with shape (1,)

In [37]:




# plot loss function
plt.subplot(211)
plt.title("accuracy")
plt.plot(history.history["acc"], color="r", label="train")
plt.plot(history.history["val_acc"], color="b", label="validation")
plt.legend(loc="best")

plt.subplot(212)
plt.title("loss")
plt.plot(history.history["loss"], color="r", label="train")
plt.plot(history.history["val_loss"], color="b", label="validation")
plt.legend(loc="best")

plt.tight_layout()
plt.show()

# evaluate model
score = model.evaluate(Xtest, Ytest, verbose=1)
print("Test score: {:.3f}, accuracy: {:.3f}".format(score[0], score[1]))

# using the word2vec model
word2idx = tokenizer.word_index
idx2word = {v: k for k, v in word2idx.items()}

# retrieve the weights from the first dense layer. This will convert
# the input vector from a one-hot sum of two words to a dense 300
# dimensional representation
W, b = model.layers[0].get_weights()

idx2emb = {}
for word in word2idx.keys():
    wid = word2idx[word]
    vec_in = ohe.fit_transform(np.array(wid)).todense()
    vec_emb = np.dot(vec_in, W)
    idx2emb[wid] = vec_emb

for word in ["stupid", "alice", "succeeded"]:
    wid = word2idx[word]
    source_emb = idx2emb[wid]
    distances = []
    for i in range(1, vocab_size):
        if i == wid:
            continue
        target_emb = idx2emb[i]
        distances.append(((wid, i),
                         cosine_distances(source_emb, target_emb)))
    sorted_distances = sorted(distances, key=operator.itemgetter(1))[0:10]
    predictions = [idx2word[x[0][1]] for x in sorted_distances]
    print("{:s} => {:s}".format(word, ", ".join(predictions)))

ModuleNotFoundError: No module named 'nltk'