In [130]:
import numpy as np
import theano
import six.moves.cPickle
import os, re, json
import operator

from keras.preprocessing import sequence, text
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils, generic_utils
from keras.models import Sequential
from keras.layers.embeddings import WordContextProduct, Embedding
from six.moves import range
from six.moves import zip

In [189]:
# Hyper-parameters of the skip-gram experiment.
dim_proj = 35        # dimension of the embedding space
nb_epoch = 10        # number of passes over the corpus in the training loop
max_features = 1000  # vocabulary size given to the tokenizer, the model and the sampling table
skip_top = 0         # word lookups reject indices below this value (0 = reject none)

In [100]:
# Plain-text corpus, expected under the current user's home directory.
# Assembled with os.path.join so the separators are correct on every platform
# instead of concatenating hard-coded "/" fragments.
data_path = os.path.join(os.path.expanduser("~"), "Downloads", "sample.txt")

In [164]:
# text preprocessing utils
html_tags = re.compile(r'<.*?>')
to_replace = [('&#x27;', "'")]
# Matches only genuine character references: &name;  &#123;  &#x1F;
# The former pattern r'&.*?;' also deleted ordinary text lying between a bare
# '&' and a later ';' (e.g. "AT&T rocks; really" lost "&T rocks;").
hex_tags = re.compile(r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);')

def clean_comment(comment):
    """Return `comment` as a string stripped of markup.

    Steps, in order:
      1. convert the input with str();
      2. replace every <...> tag with a single space;
      3. apply the literal substitutions listed in `to_replace`
         (currently only the apostrophe reference);
      4. replace every remaining character reference with a single space;
      5. strip leading and trailing whitespace.

    Runs of spaces left behind by the replacements are not collapsed.
    """
    c = str(comment)
    c = html_tags.sub(' ', c)
    # Known references are decoded before the catch-all below blanks the rest.
    for tag, char in to_replace:
        c = c.replace(tag, char)
    c = hex_tags.sub(' ', c)
    return c.strip()

def text_generator(path=data_path):
    """Yield the cleaned pieces of the text file at `path`, one at a time.

    The whole file is read up front and split on "."; each piece is passed
    through clean_comment() before being yielded (a piece may be an empty
    string).

    The file is read inside a `with` block, so the handle is closed as soon as
    the contents are in memory. The previous version only closed it after the
    final yield, leaking the handle whenever the consumer stopped early or an
    exception interrupted iteration.
    """
    with open(path) as f:
        contents = f.read()
    for line in contents.split("."):
        yield clean_comment(line)

In [190]:
print("Fit tokenizer...")
# The tokenizer is created with nb_words=max_features and fitted on the
# cleaned pieces produced by a fresh text_generator() pass over the corpus.
tokenizer = text.Tokenizer(nb_words=max_features)
tokenizer.fit_on_texts(text_generator())

Fit tokenizer...


In [191]:
# Inspect what the tokenizer learned: number of texts seen and the word -> index map.
# print(...) with a single argument produces the same output on Python 2 and
# Python 3; the bare print statement is a SyntaxError on Python 3.
print(tokenizer.document_count)
print(tokenizer.word_index)

351
{'essay': 350, 'limited': 611, 'all': 80, 'founder': 251, 'impression': 612, 'caused': 613, 'results': 614, 'deliveries': 615, 'existing': 252, 'leads': 443, 'go': 253, 'shot': 905, 'decisions': 616, 'children': 352, 'resourceful': 445, 'seemed': 254, 'increase': 485, 'careful': 998, 'depend': 617, "startup's": 447, 'producing': 716, 'technique': 448, 'young': 619, 'collogic': 620, 'to': 2, 'behave': 750, 'those': 1000, 'determinations': 621, 'under': 622, 'discovering': 451, 'disapproved': 623, 'fatal': 624, 'worth': 290, 'backwant': 625, 'town': 452, 'force': 1017, 'risk': 626, 'advantage': 355, 'permissive': 627, 'very': 130, 'implicitly': 628, 'story': 1096, 'focus': 630, 'every': 376, 'decide': 255, 'advised': 632, 'telling': 453, 'trouble': 633, 'bottleneck': 634, 'ramen': 635, 'method': 930, 'school': 256, 'impressive': 454, 'presented': 455, 'turns': 456, 'list': 233, "they've": 637, 'relentlessly': 638, 'standards': 639, 'large': 131, 'solved': 356, 'phase': 1133, 'small':

In [135]:
# Word frequencies as (word, count) pairs, most frequent first.
x = tokenizer.word_counts
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(sorted(x.items(), key=operator.itemgetter(1), reverse=True))

[('the', 485), ('to', 484), ('a', 289), ('of', 265), ('you', 216), ('be', 180), ('that', 161), ('in', 149), ('is', 145), ('and', 125), ('have', 119), ("you're", 110), ('they', 105), ('it', 104), ('not', 92), ('but', 83), ('if', 81), ('for', 77), ('get', 70), ('do', 70), ('them', 69), ('startup', 68), ('more', 67), ("don't", 67), ('are', 61), ('people', 59), ('your', 56), ('startups', 55), ('like', 55), ('so', 51), ('will', 51), ('when', 51), ('good', 50), ("it's", 50), ('as', 48), ('just', 47), ('about', 47), ("they're", 47), ('because', 46), ('who', 45), ('on', 45), ('company', 44), ('can', 44), ('money', 44), ('what', 43), ('one', 43), ('investors', 42), ('by', 41), ('make', 40), ('with', 40), ('want', 38), ('at', 37), ('from', 33), ('something', 33), ('going', 32), ('founders', 31), ('than', 31), ('were', 31), ('seem', 31), ('we', 31), ('i', 31), ('even', 30), ('most', 30), ('say', 29), ('much', 29), ('lot', 29), ('know', 28), ('their', 26), ('someone', 26), ('could', 25), ('first',

In [192]:
print('Build model...')
# A Sequential model holding a single WordContextProduct layer sized by
# max_features and dim_proj, compiled with an MSE loss and the rmsprop optimizer.
model = Sequential()
model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform"))
model.compile(loss='mse', optimizer='rmsprop')

Build model...


In [193]:
# Sampling table built for max_features entries; it is handed to
# sequence.skipgrams() in the training loop.
sampling_table = sequence.make_sampling_table(max_features)

In [194]:
# Train for nb_epoch passes. Each pass re-reads the corpus through a fresh
# text_generator() and performs one model.train() call per non-empty text.
for e in range(nb_epoch):
    print('-'*40)
    # NOTE(review): without `from __future__ import print_function`, Python 2
    # prints this as a tuple -- the recorded output shows ('Epoch', 0).
    print('Epoch', e)
    print('-'*40)

    # Progress bar sized to the number of texts counted while fitting the tokenizer.
    progbar = generic_utils.Progbar(tokenizer.document_count)
    samples_seen = 0
    # Losses collected since the last progress-bar update.
    losses = []
    
    for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
        # get skipgram couples for one text in the dataset
        couples, labels = sequence.skipgrams(seq, max_features, window_size=4, negative_samples=1., sampling_table=sampling_table)
        # Texts that produce no couples are skipped entirely.
        if couples:
            # one gradient update per sentence (one sentence = a few 1000s of word couples)
            X = np.array(couples, dtype="int32")
            loss = model.train(X, labels)
            losses.append(loss)
            # Every 100 collected losses: report their mean and start a new
            # window. A trailing window of fewer than 100 losses is never reported.
            if len(losses) % 100 == 0:
                progbar.update(i, values=[("loss", np.mean(losses))])
                losses = []
            samples_seen += len(labels)
    # Total number of labels fed to the model during this epoch.
    print('Samples seen:', samples_seen)
print("Training completed!")

----------------------------------------
('Epoch', 0)
----------------------------------------
----------------------------------------
('Epoch', 1)
----------------------------------------
----------------------------------------
('Epoch', 2)
----------------------------------------
----------------------------------------
('Epoch', 3)
----------------------------------------
----------------------------------------
('Epoch', 4)
----------------------------------------
----------------------------------------
('Epoch', 5)
----------------------------------------
----------------------------------------
('Epoch', 6)
----------------------------------------
----------------------------------------
('Epoch', 7)
----------------------------------------
----------------------------------------
('Epoch', 8)
----------------------------------------
----------------------------------------
('Epoch', 9)
----------------------------------------
Training completed!


In [195]:
# recover the embedding weights trained with skipgram:
# the first weight array of the model's first (and only) layer.
weights = model.layers[0].get_weights()[0]

In [196]:
# Sanity check on the embedding matrix: the recorded shape is
# (max_features, dim_proj) = (1000, 35) for the settings used in this run.
weights.shape

(1000, 35)

In [None]:
# we no longer need this: the embedding matrix has already been extracted
# into `weights`, which is all the remaining cells use.
del model

In [197]:
# Lookup tables for the nearest-neighbour helpers.
# (Disabled step: zeroing the first skip_top rows of the embedding matrix.)
# weights[:skip_top] = np.zeros((skip_top, dim_proj))
norm_weights = np_utils.normalize(weights)

# word -> index as built by the tokenizer, plus the inverse index -> word map.
word_index = tokenizer.word_index
reverse_word_index = {index: word for word, index in word_index.items()}

def embed_word(w):
    """Return the row of norm_weights for word `w`, or None.

    None is returned when `w` is absent from word_index (or mapped to 0),
    or when its index lies outside the range [skip_top, max_features).
    """
    idx = word_index.get(w)
    if idx and skip_top <= idx < max_features:
        return norm_weights[idx]
    return None

def closest_to_point(point, nb_closest=10):
    """Return the `nb_closest` entries whose rows score highest against `point`.

    Each row of norm_weights is scored by its dot product with `point`. The
    result is a list of (word, score) pairs ordered from highest to lowest
    score; rows with equal scores keep their row order. The word is None for
    a row index that has no entry in reverse_word_index.
    """
    scores = np.dot(norm_weights, point)
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    return [(reverse_word_index.get(row), score) for row, score in ranked[:nb_closest]]

def closest_to_word(w, nb_closest=10):
    """Return the nearest neighbours of word `w` via closest_to_point().

    An empty list is returned when `w` is absent from word_index (or mapped
    to 0), or when its index lies outside the range [skip_top, max_features).
    Otherwise the word's own row of norm_weights is used as the query point.
    """
    idx = word_index.get(w)
    if idx and skip_top <= idx < max_features:
        return closest_to_point(norm_weights[idx].T, nb_closest)
    return []

In [198]:
# Shape check: the recorded output matches the shape of `weights`, (1000, 35).
norm_weights.shape

(1000, 35)

In [201]:
# Print the nearest neighbours of a few probe words.
words = ["first", "someone", "very"]

for probe in words:
    # Look the neighbours up first, then print a header line followed by one
    # (word, score) pair per line.
    neighbours = closest_to_word(probe)
    print('====', probe)
    for pair in neighbours:
        print(pair)

('====', 'first')
('first', 0.99999999999999978)
('seriously', 0.49790643756829794)
('an', 0.49406799268412138)
('seed', 0.47541329321783088)
('previous', 0.4513666896799422)
('backwant', 0.43745369720771504)
("who's", 0.43460943539188535)
('under', 0.42033294106264929)
('extraordinary', 0.41741061676681934)
('syrelicon', 0.40209655048546522)
('====', 'someone')
('someone', 1.0000000000000002)
("can't", 0.47500323676311418)
('say', 0.46329861592896882)
('new', 0.45458308748423015)
('next', 0.39691060199573591)
('raise', 0.39534702898483132)
('advance', 0.38937610117978871)
('partly', 0.37741839592987986)
('founder', 0.37450721730308095)
('term', 0.37288162180278317)
('====', 'very')
('very', 0.99999999999999978)
('answer', 0.48319412652317617)
('future', 0.43628154602186681)
('goal', 0.43514353022550423)
('roberts', 0.43177315436033947)
('went', 0.42850009557969904)
('existences', 0.41884230963944535)
('offering', 0.41849965587078797)
('slow', 0.41655929049425955)
('convincing', 0.4055

In [188]:
# Size of the tokenizer's full word index. The recorded value (1186) exceeds
# max_features (1000), so some indices have no row in the embedding matrix;
# the lookup helpers reject those (index >= max_features).
len(word_index)

1186