In [1]:
import os, numpy as np
import gensim
from pprint import pprint

POS_TRAIN='train.txt'
NUM_TRAIN=1500000

EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.95
MAX_SEQUENCE_LENGTH=4
MAX_NUM_WORDS=1000000
RANDOMIZE_DATA=True

# "unigram", "bigram", "trigram"
TRAINABLE_EMBEDDINGS=True


In [2]:
def create_dl_data(dataset):

    words = []
    labels = []
    
    for item in dataset:

        if (len(words))>NUM_TRAIN:
            break
  
        words.append(item[0]) ## the 4 terms
    
        # we want to use numerical label_id
        labels.append(item[3])
    
    print('Found %d, %d words with POS.' % (
        len(words), len(labels) ))
    return words, labels


In [3]:
def load_dataset():
    task_results = []

    SEP_I=4
    dataset = '../datasets/questions_soiaf_doesnt_match.txt'
    doesnt_match_data = open(dataset).readlines()
    
    for line in doesnt_match_data:

        # those are the section (or :end) markers
        if line.startswith(":"):
            task_type = line.strip()
            continue

        ### get information from the input line
        ### input line format is: task-terms :: outlier
        line_list = line.strip().split()

        assert(len(line_list)) == 6

        ## just split up and a assign the input data
        task_terms, difficulty, correct_outlier = line_list[:SEP_I], line_list[SEP_I][2], line_list[SEP_I+1:][0]
        outlier_pos = task_terms.index(correct_outlier)
#         print(task_terms)
#         print(difficulty)
#         print(correct_outlier)
#         print(outlier_pos)
#         print("\n\n")
        task_results.append([" ".join(task_terms), difficulty, correct_outlier, outlier_pos])

    return task_results

In [4]:
def load_embeddings():
    """ create a dict of word -> vector (np float32) """
    embeddings_index = {}
    with open(os.path.join('../models/asoif_fastText.model')) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    print('Training data:: Found %s word vectors.' % len(embeddings_index))
    return embeddings_index


In [5]:

dataset = load_dataset()
embeddings_index = load_embeddings()
words, labels = create_dl_data(dataset)


Training data:: Found 11035 word vectors.
Found 11180, 11180 words with POS.


In [6]:
import os, sys, numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model


Using TensorFlow backend.


In [7]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
tokenizer.fit_on_texts(words)
token_sequences = tokenizer.texts_to_sequences(words)

## wohlg: ok, we want to add the word before the target word to the sequence
# this should increase accuracy, as POS tagging is then based on the target word, AND the previous!

sequences = token_sequences
          
print("Sequences start like this:", sequences[:20], "....\n")
print(words[:20])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# https://faroit.github.io/keras-docs/1.2.0/utils/np_utils/
cat_labels = to_categorical(np.asarray(labels))


print ("\nPadded sequences:\n", data)
print("\nLabels (plain) start like this:", labels[:20])

print('Shape of data tensor:', data.shape)
print('Shape of cat_labels tensor:', cat_labels.shape)


Sequences start like this: [[35, 17, 42, 16], [42, 87, 16, 35], [16, 42, 88, 35], [72, 35, 42, 16], [16, 35, 45, 42], [16, 15, 42, 35], [35, 97, 42, 16], [16, 42, 35, 44], [35, 73, 42, 16], [35, 42, 99, 16], [59, 16, 42, 35], [35, 16, 4, 42], [16, 35, 42, 96], [16, 100, 35, 42], [16, 30, 42, 35], [35, 42, 21, 16], [16, 35, 61, 42], [98, 42, 16, 35], [16, 35, 42, 58], [42, 35, 22, 16]] ....

['Tyrion Tywin Cersei Jaime', 'Cersei Joffrey Jaime Tyrion', 'Jaime Cersei Tommen Tyrion', 'Myrcella Tyrion Cersei Jaime', 'Jaime Tyrion Kevan Cersei', 'Jaime Ned Cersei Tyrion', 'Tyrion Oberyn Cersei Jaime', 'Jaime Cersei Tyrion Balon', 'Tyrion Ramsay Cersei Jaime', 'Tyrion Cersei Doran Jaime', 'Harrenhal Jaime Cersei Tyrion', 'Tyrion Jaime Targaryen Cersei', 'Jaime Tyrion Cersei Craster', 'Jaime Dorne Tyrion Cersei', 'Jaime Trident Cersei Tyrion', 'Tyrion Cersei raven Jaime', 'Jaime Tyrion food Cersei', 'dragon Cersei Jaime Tyrion', 'Jaime Tyrion Cersei sword', 'Cersei Tyrion sun Jaime']
Found 112

In [8]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
print(data.shape)
if RANDOMIZE_DATA:
    np.random.shuffle(indices)
data = data[indices]
cat_labels = cat_labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
print(num_validation_samples)


x_train = data[:-num_validation_samples]
y_train = cat_labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = cat_labels[-num_validation_samples:]

# x_val = data[-num_validation_samples:-int(num_validation_samples/2)]
# y_val = cat_labels[-num_validation_samples:-int(num_validation_samples/2)]
# x_test = data[-int(num_validation_samples/2):]
# y_test = cat_labels[-int(num_validation_samples/2):]

print("x_train.shape", x_train.shape)
print("x_val.shape", x_val.shape)
# print("x_test.shape", x_test.shape)

(11180, 4)
10621
x_train.shape (559, 4)
x_val.shape (10621, 4)


In [9]:
print('Preparing embedding matrix.')
print('First we create a zero matrix of (vocab_size, emb_dim)')
print('Then we fill it up with the embedding vectors\n\n')

# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
print("Number of words in voc:", num_words)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
print("Empty shape of embedding_matrix", embedding_matrix.shape)

for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
print('Done, shape of embedding_matrix', embedding_matrix.shape)

Preparing embedding matrix.
First we create a zero matrix of (vocab_size, emb_dim)
Then we fill it up with the embedding vectors


Number of words in voc: 113
Empty shape of embedding_matrix (113, 300)
Done, shape of embedding_matrix (113, 300)


In [10]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH+1,
                            trainable=TRAINABLE_EMBEDDINGS)

print('Embedding layer created for %d words, %d dimensions.' % (
        num_words, EMBEDDING_DIM))



Embedding layer created for 113 words, 300 dimensions.


In [11]:
print('Training model.')

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
print("shape of embedded_sequences", embedded_sequences.shape)

x = Conv1D(128, MAX_SEQUENCE_LENGTH, activation='relu')(embedded_sequences)
x = MaxPooling1D(1)(x)
x = Conv1D(128, 1, activation='relu')(x)
x = MaxPooling1D(1)(x)
x = Conv1D(128, 1, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dropout(0.5)(x) ## using dropout leads to same acc with training data, but better on val -- yay
x = Dense(128, activation='relu')(x)

preds = Dense(MAX_SEQUENCE_LENGTH, activation='softmax')(x)
#preds = Dense(len(labels_index), activation='softmax')(embedded_sequences)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

from keras.callbacks import TensorBoard
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)

model.fit(x_train, y_train,
          batch_size=64,
          epochs=40,
          validation_data=(x_val, y_val),
          callbacks=[tensorBoardCallback])
model.summary()


Training model.
shape of embedded_sequences (?, 4, 300)
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 559 samples, validate on 10621 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 4)                 0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 5, 300)            33900     
____________

In [12]:
model.summary()

score = model.evaluate(x_val, y_val, batch_size=128)
#score = model.evaluate(x_test, y_test, batch_size=128)

print('\n',score)

from keras.utils import plot_model
plot_model(model, to_file='model.png')

predictions = model.predict(x_val, batch_size=1, verbose=1, steps=None)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 4)                 0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 5, 300)            33900     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2, 128)            153728    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2, 128)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 2, 128)            16512     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2, 128)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 2, 128)            16512     
__________

In [13]:
cl_predictions = np.argmax(predictions, axis=1)
index_to_word = {val:key for (key, val) in word_index.items()}
#print(word_index)

print("total number of predictions:", len(cl_predictions))

for i in range(len(cl_predictions)):
    gold_pos = np.argmax(y_val[i])
    #gold_pos = np.argmax(y_test[i])
    
    #print("\nIndex position:", x_val[i][0])
    if gold_pos != cl_predictions[i]:
        print(x_val[i], cl_predictions[i], gold_pos, "\t\t", 
          index_to_word[x_val[i][0]],
          index_to_word[x_val[i][1]],
          index_to_word[x_val[i][2]],
          index_to_word[x_val[i][3]],)

#       #print("\nIndex position:", x_val[i][0])
#     if gold_pos != cl_predictions[i]:
#         print(x_test[i], cl_predictions[i], gold_pos, "\t\t", 
#           index_to_word[x_test[i][0]],
#           index_to_word[x_test[i][1]],
#           index_to_word[x_test[i][2]],
#           index_to_word[x_test[i][3]],) 



total number of predictions: 10621
[42  4 50 51] 0 1 		 cersei targaryen joanna lancel
[46 55 18 81] 2 3 		 myr tyros lys direwolve
[57 66 74 62] 0 3 		 rhaegal viserion drogon daenerys
[ 5 29 49 54] 1 0 		 lannister theon euron victarion
[18 56 60 55] 3 2 		 lys norvos oldtown tyros
[ 4 27 26 15] 3 0 		 targaryen robb sansa ned
[27  3 26 31] 0 1 		 robb baratheon sansa catelyn
[ 4 18 47 56] 1 0 		 targaryen lys pentos norvos
[18  2 46 19] 0 1 		 lys bolton myr braavos
[60 78 20 30] 2 0 		 oldtown rhoyne mander trident
[17  6 16 45] 0 1 		 tywin stark jaime kevan
[31 26 27 25] 1 0 		 catelyn sansa robb bran
[34 14 15 35] 1 3 		 rickon arya ned tyrion
[49  3 52 44] 2 1 		 euron baratheon quellon balon
[ 8  7  1 18] 2 3 		 umber mormont greyjoy lys
[51 50 35 15] 2 3 		 lancel joanna tyrion ned
[50 16  3 17] 1 2 		 joanna jaime baratheon tywin
[24 73 58 76] 0 2 		 jon ramsay sword gendry
[31 27 25  3] 2 3 		 catelyn robb bran baratheon
[ 6 12 14  4] 0 2 		 stark karstark arya targaryen
[1

[14 16 35 45] 1 0 		 arya jaime tyrion kevan
[ 2  3  9 19] 1 3 		 bolton baratheon florent braavos
[35 45  2 16] 3 2 		 tyrion kevan bolton jaime
[52 49  6 43] 3 2 		 quellon euron stark asha
[11  9  1 18] 2 3 		 frey florent greyjoy lys
[34 15 31 66] 2 3 		 rickon ned catelyn viserion
[25 15 26  4] 1 3 		 bran ned sansa targaryen
[ 8  4  3 14] 2 3 		 umber targaryen baratheon arya
[70 60 74 77] 1 2 		 gulltown oldtown drogon highgarden
[18 19 60 48] 0 2 		 lys braavos oldtown volantis
[16 35 42 96] 0 3 		 jaime tyrion cersei craster
[20 78 30  2] 2 3 		 mander rhoyne trident bolton
[42 16  2 35] 1 2 		 cersei jaime bolton tyrion
[70 20 30 78] 2 0 		 gulltown mander trident rhoyne
[11  4  2 18] 2 3 		 frey targaryen bolton lys
[24 76 32 15] 0 3 		 jon gendry mya ned
[ 4 13 16 10] 0 2 		 targaryen martell jaime tyrell
[57 74 66 35] 0 3 		 rhaegal drogon viserion tyrion
[52 49 29  5] 2 3 		 quellon euron theon lannister
[15  3 33 14] 3 1 		 ned baratheon lyanna arya
[17 14 50 35] 0 1 		 

[15  5 26 34] 0 1 		 ned lannister sansa rickon
[52 29  2 49] 1 2 		 quellon theon bolton euron
[29 53  4 43] 0 2 		 theon aeron targaryen asha
[15 60 25 27] 0 1 		 ned oldtown bran robb
[26 27 15 72] 2 3 		 sansa robb ned myrcella
[31 34 15 29] 2 3 		 catelyn rickon ned theon
[14 17 42 35] 1 0 		 arya tywin cersei tyrion
[31 15  2 25] 1 2 		 catelyn ned bolton bran
[15  2 27 14] 0 1 		 ned bolton robb arya
[ 3  4 11 19] 1 3 		 baratheon targaryen frey braavos
[10  1  3 14] 2 3 		 tyrell greyjoy baratheon arya
[51 42 16  6] 2 3 		 lancel cersei jaime stark
[13  1 12 19] 1 3 		 martell greyjoy karstark braavos
[75 66 67 74] 0 2 		 balerion viserion arys drogon
[45  6 35 50] 2 1 		 kevan stark tyrion joanna
[ 9  6  1 18] 2 3 		 florent stark greyjoy lys
[ 4 34 33 14] 3 0 		 targaryen rickon lyanna arya
[16  6 35 45] 0 1 		 jaime stark tyrion kevan
[15  4 34 14] 0 1 		 ned targaryen rickon arya
[ 4 42 50 17] 3 0 		 targaryen cersei joanna tywin
[ 1  5  3 19] 2 3 		 greyjoy lannister barat

[74 57 62 75] 1 2 		 drogon rhaegal daenerys balerion
[31 25  5 27] 3 2 		 catelyn bran lannister robb
[52 53  6 49] 3 2 		 quellon aeron stark euron
[15 34  2 27] 0 2 		 ned rickon bolton robb
[44  4 29 49] 2 1 		 balon targaryen theon euron
[47 56 78 19] 3 2 		 pentos norvos rhoyne braavos
[14  3 25 34] 0 1 		 arya baratheon bran rickon
[34 15 14  5] 1 3 		 rickon ned arya lannister
[19  3 11  6] 1 0 		 braavos baratheon frey stark
[32 73 24 17] 0 3 		 mya ramsay jon tywin
[ 4  7 14 10] 0 2 		 targaryen mormont arya tyrell
[ 3 42 35 17] 3 0 		 baratheon cersei tyrion tywin
[42 29 17 50] 2 1 		 cersei theon tywin joanna
[26 34 14  4] 2 3 		 sansa rickon arya targaryen
[84 55 19 18] 3 0 		 westeros tyros braavos lys
[55  4 46 18] 3 1 		 tyros targaryen myr lys
[27 25  1 15] 3 2 		 robb bran greyjoy ned
[29 51 35 42] 3 0 		 theon lancel tyrion cersei
[45 16 50  2] 1 3 		 kevan jaime joanna bolton
[ 5 18 48 46] 1 0 		 lannister lys volantis myr
[13  4  1 19] 2 3 		 martell targaryen grey

[87 27 72 88] 0 1 		 joffrey robb myrcella tommen
[ 3 46 19 56] 2 0 		 baratheon myr braavos norvos
[ 2 43 54 52] 2 0 		 bolton asha victarion quellon
[14 29 52 53] 1 0 		 arya theon quellon aeron
[47 48 19  2] 2 3 		 pentos volantis braavos bolton
[31  2 34 14] 3 1 		 catelyn bolton rickon arya
[ 2 34 25 15] 3 0 		 bolton rickon bran ned
[77  3 71 70] 2 1 		 highgarden baratheon lannisport gulltown
[66 75 74 63] 0 3 		 viserion balerion drogon rhaegar
[26  1 31 14] 3 1 		 sansa greyjoy catelyn arya
[81 72 88 87] 3 0 		 direwolve myrcella tommen joffrey
[53 44 43  3] 1 3 		 aeron balon asha baratheon
[ 2 18 47 56] 1 0 		 bolton lys pentos norvos
[25  4 26 14] 3 1 		 bran targaryen sansa arya
[57 76 32 24] 3 0 		 rhaegal gendry mya jon
[49  5 44 54] 2 1 		 euron lannister balon victarion
[56 48 46  4] 0 3 		 norvos volantis myr targaryen
[35 51 16  4] 0 3 		 tyrion lancel jaime targaryen
[19 55 47  4] 0 3 		 braavos tyros pentos targaryen
[ 2  3  4 18] 2 3 		 bolton baratheon targaryen 

[16 17 50 15] 0 3 		 jaime tywin joanna ned
[ 3 13 14 10] 0 2 		 baratheon martell arya tyrell
[26 33 15 57] 2 3 		 sansa lyanna ned rhaegal
[ 3 15 31 25] 1 0 		 baratheon ned catelyn bran
[45 35  1 16] 3 2 		 kevan tyrion greyjoy jaime
[27 26 35 33] 3 2 		 robb sansa tyrion lyanna
[47 71 46 48] 0 1 		 pentos lannisport myr volantis
[ 2 33 25 15] 3 0 		 bolton lyanna bran ned
[ 1 66 75 74] 1 0 		 greyjoy viserion balerion drogon
[25 26  1 33] 0 2 		 bran sansa greyjoy lyanna
[77 71 70 57] 2 3 		 highgarden lannisport gulltown rhaegal
[11  3  1 15] 2 3 		 frey baratheon greyjoy ned
[ 4 53 29 52] 2 0 		 targaryen aeron theon quellon
[54 52 29  2] 2 3 		 victarion quellon theon bolton
[12  3  4 15] 2 3 		 karstark baratheon targaryen ned
[15  3 14 34] 0 1 		 ned baratheon arya rickon
[19 56 46  2] 0 3 		 braavos norvos myr bolton
[15 31 29 26] 0 2 		 ned catelyn theon sansa
[17 42 51 14] 0 3 		 tywin cersei lancel arya
[17 45  6 35] 0 2 		 tywin kevan stark tyrion
[16 45 68 42] 3 2 		 jai

In [14]:
FN='test.h5'
model.save(FN)
del(model)

from keras.models import load_model
model = load_model(FN)


model.summary()

score = model.evaluate(x_val, y_val, batch_size=128)
# score = model.evaluate(x_test, y_test, batch_size=128)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 4)                 0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 5, 300)            33900     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2, 128)            153728    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2, 128)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 2, 128)            16512     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2, 128)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 2, 128)            16512     
__________