In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation

DICTIONARYSIZE = 100
SKIPWINDOW = 2
DATATEXT = 'ham.txt'

In [75]:
model = Sequential()
model.add(Dense(input_dim=DICTIONARYSIZE, units = DICTIONARYSIZE//40, activation = "relu"))
model.add(Dense(units = DICTIONARYSIZE, activation = "softmax"))

sgd = SGD(lr = 0.1)
model.compile(loss="categorical_crossentropy",
             optimizer = sgd,
             metrics = ['accuracy'])

In [3]:
#Clean data set
def clean(lines):
    ret = []
    count = {}
    for line in lines:
        line = line.strip()
        split = line.split()
        if(len(split) == 0): continue
        #This is specific to the Hamlet data set. Add code for lins to disregard
        if(split[0].isupper()): continue
        for word in split:
            word = word.lower()
            if word[-1] in punctuation:
                word = word[:-1]
            if len(word) == 0: continue
            if word[0] in punctuation:
                word = word[1:]
            if len(word) == 0: continue
            ret.append(word)
            if word in count.keys():
                count[word] += 1
            else:
                count[word] = 1
            
    return ret, count

lines = open(DATATEXT,'r').readlines()
wordList, count = clean(lines)

indexToWord = list(set(wordList))
print(indexToWord[:10])

#Construct word-to-index dictionary
wordToIndex = {indexToWord[i]:i for i in range(len(indexToWord))}

['favour', 'fingers', "utter'd", 'roses', 'return', 'tend', 'us--thou', 'shook', "where's", 'hush']


In [4]:
#(Duplication test)
for i in range(-100,-1):
    if (i%len(indexToWord)) != wordToIndex[indexToWord[i]]:
        print(i)

In [48]:
#Construct word-to-vector (one-hot) and vector-to-word (one-hot) dictionary
#The "hot" one will be indexed by popularity
wordToVector = {}
zeroVector = [0 for _ in range(DICTIONARYSIZE)]
#First sort word-to-index by word popularity
wordsByPopularityDoubles = sorted(count.items(), key = lambda x:x[1], reverse = True)
wordsByPopularity = [wordsByPopularityDoubles[i][0] for i in range(len(wordsByPopularityDoubles))]

for i in range(DICTIONARYSIZE):
    vec = zeroVector.copy()
    vec[i] = 1
    wordToVector[wordsByPopularity[i]] = vec

In [24]:
print("Most popular words:\n" + str(wordsByPopularity[:19]))
print(str(len(wordsByPopularity)) + " words total.")

Most popular words:
['the', 'and', 'to', 'of', 'you', 'my', 'a', 'in', 'it', 'i', 'that', 'is', 'his', 'not', 'this', 'with', 'but', 'for', 'your']
4719 words total.


In [34]:
#Create training arrays (Skip-gram)
#Currently creates the same batch every call
#May be stochastic in the future
def generate_batch():
    x_train = []
    y_train = []
    #Start the moving window. Windex is the index of the center of the window.
    #OPTIMIZE: Use np arrays
    #OPTIMIZE: Implement mini-batch features shown in the word2vec tutorial
    for windex in range(SKIPWINDOW, len(wordList) - SKIPWINDOW):
        centerWord = wordList[windex]
        #OPTIMIZE replace this lookup using a nice dictionary OOH OR A NUMPY ARRAY BOOLEAN THING
        if not (centerWord in wordToVector.keys()): continue
        for contex in range(-SKIPWINDOW, SKIPWINDOW):
            contextWord = wordList[contex]
            #Skip the center word (it is a skip-gram after all...)
            if contex == windex: continue
            #OPTIMIZE replace this lookup using a nice dictionary
            if not (contextWord in wordToVector.keys()): continue
            x_train.append(wordToVector[wordList[windex]])
            y_train.append(wordToVector[wordList[contex]])
    return np.array(x_train), np.array(y_train)

In [35]:
x_train, y_train = generate_batch()
model.fit(x_train, y_train, epochs = 2000, batch_size = len(x_train))

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

Epoch 81/2000
Epoch 82/2000
Epoch 83/2000
Epoch 84/2000
Epoch 85/2000
Epoch 86/2000
Epoch 87/2000
Epoch 88/2000
Epoch 89/2000
Epoch 90/2000
Epoch 91/2000
Epoch 92/2000
Epoch 93/2000
Epoch 94/2000
Epoch 95/2000
Epoch 96/2000
Epoch 97/2000
Epoch 98/2000
Epoch 99/2000
Epoch 100/2000
Epoch 101/2000
Epoch 102/2000
Epoch 103/2000
Epoch 104/2000
Epoch 105/2000
Epoch 106/2000
Epoch 107/2000
Epoch 108/2000
Epoch 109/2000
Epoch 110/2000
Epoch 111/2000
Epoch 112/2000
Epoch 113/2000
Epoch 114/2000
Epoch 115/2000
Epoch 116/2000
Epoch 117/2000
Epoch 118/2000
Epoch 119/2000
Epoch 120/2000
Epoch 121/2000
Epoch 122/2000
Epoch 123/2000
Epoch 124/2000
Epoch 125/2000
Epoch 126/2000
Epoch 127/2000
Epoch 128/2000
Epoch 129/2000
Epoch 130/2000
Epoch 131/2000
Epoch 132/2000
Epoch 133/2000
Epoch 134/2000
Epoch 135/2000
Epoch 136/2000
Epoch 137/2000
Epoch 138/2000
Epoch 139/2000
Epoch 140/2000
Epoch 141/2000
Epoch 142/2000
Epoch 143/2000
Epoch 144/2000
Epoch 145/2000
Epoch 146/2000
Epoch 147/2000
Epoch 148/2000

Epoch 160/2000
Epoch 161/2000
Epoch 162/2000
Epoch 163/2000
Epoch 164/2000
Epoch 165/2000
Epoch 166/2000
Epoch 167/2000
Epoch 168/2000
Epoch 169/2000
Epoch 170/2000
Epoch 171/2000
Epoch 172/2000
Epoch 173/2000
Epoch 174/2000
Epoch 175/2000
Epoch 176/2000
Epoch 177/2000
Epoch 178/2000
Epoch 179/2000
Epoch 180/2000
Epoch 181/2000
Epoch 182/2000
Epoch 183/2000
Epoch 184/2000
Epoch 185/2000
Epoch 186/2000
Epoch 187/2000
Epoch 188/2000
Epoch 189/2000
Epoch 190/2000
Epoch 191/2000
Epoch 192/2000
Epoch 193/2000
Epoch 194/2000
Epoch 195/2000
Epoch 196/2000
Epoch 197/2000
Epoch 198/2000
Epoch 199/2000
Epoch 200/2000
Epoch 201/2000
Epoch 202/2000
Epoch 203/2000
Epoch 204/2000
Epoch 205/2000
Epoch 206/2000
Epoch 207/2000
Epoch 208/2000
Epoch 209/2000
Epoch 210/2000
Epoch 211/2000
Epoch 212/2000
Epoch 213/2000
Epoch 214/2000
Epoch 215/2000
Epoch 216/2000
Epoch 217/2000
Epoch 218/2000
Epoch 219/2000
Epoch 220/2000
Epoch 221/2000
Epoch 222/2000
Epoch 223/2000
Epoch 224/2000
Epoch 225/2000
Epoch 226/

Epoch 239/2000
Epoch 240/2000
Epoch 241/2000
Epoch 242/2000
Epoch 243/2000
Epoch 244/2000
Epoch 245/2000
Epoch 246/2000
Epoch 247/2000
Epoch 248/2000
Epoch 249/2000
Epoch 250/2000
Epoch 251/2000
Epoch 252/2000
Epoch 253/2000
Epoch 254/2000
Epoch 255/2000
Epoch 256/2000
Epoch 257/2000
Epoch 258/2000
Epoch 259/2000
Epoch 260/2000
Epoch 261/2000
Epoch 262/2000
Epoch 263/2000
Epoch 264/2000
Epoch 265/2000
Epoch 266/2000
Epoch 267/2000
Epoch 268/2000
Epoch 269/2000
Epoch 270/2000
Epoch 271/2000
Epoch 272/2000
Epoch 273/2000
Epoch 274/2000
Epoch 275/2000
Epoch 276/2000
Epoch 277/2000
Epoch 278/2000
Epoch 279/2000
Epoch 280/2000
Epoch 281/2000
Epoch 282/2000
Epoch 283/2000
Epoch 284/2000
Epoch 285/2000
Epoch 286/2000
Epoch 287/2000
Epoch 288/2000
Epoch 289/2000
Epoch 290/2000
Epoch 291/2000
Epoch 292/2000


KeyboardInterrupt: 

In [74]:
for i in range(DICTIONARYSIZE):
    a = model.predict(np.array([wordToVector[wordsByPopularity[i]]]))[0]
    printConfidence(a)

98.5%: there
98.4%: there
98.30000000000001%: there
98.30000000000001%: there
97.4%: there
98.0%: there
98.0%: there
97.7%: there
97.4%: there
97.7%: there
97.60000000000001%: there
97.4%: there
97.60000000000001%: there
97.7%: there
98.10000000000001%: there
98.30000000000001%: there
98.4%: there
98.10000000000001%: there
98.10000000000001%: there
97.9%: there
98.2%: there
97.9%: there
97.60000000000001%: there
97.4%: there
97.60000000000001%: there
98.30000000000001%: there
98.10000000000001%: there
97.80000000000001%: there
97.80000000000001%: there
98.30000000000001%: there
97.5%: there
97.80000000000001%: there
98.0%: there
97.4%: there
98.30000000000001%: there
97.4%: there
97.30000000000001%: there
97.80000000000001%: there
97.80000000000001%: there
98.2%: there
97.4%: there
97.60000000000001%: there
98.2%: there
97.60000000000001%: there
98.30000000000001%: there
97.4%: there
97.60000000000001%: there
97.9%: there
98.30000000000001%: there
97.5%: there
97.9%: there
98.0%: there

In [63]:
def printConfidence(prediction):
    for i,confidence in enumerate(prediction):
        if confidence > 0.01:
            print(str((confidence * 100) - (confidence * 100)%0.1) + "%: " + wordsByPopularity[i])