In [1]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import keras.backend as K
def precision(y_true, y_pred):
    """Batch-wise precision metric for a binary classifier.

    True positives are the entries where the clipped product
    ``y_true * y_pred`` rounds to 1; predicted positives are the entries
    where the clipped prediction rounds to 1.

    Args:
        y_true: Keras tensor of ground-truth labels.
        y_pred: Keras tensor of predicted scores.

    Returns:
        Scalar tensor: true positives / (predicted positives + epsilon).
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    # K.epsilon() keeps the ratio finite when nothing is predicted positive.
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())
def recall(y_true, y_pred):
    """Batch-wise recall metric for a binary classifier.

    True positives are the entries where the clipped product
    ``y_true * y_pred`` rounds to 1; possible positives are the entries
    where the clipped ground truth rounds to 1.

    Args:
        y_true: Keras tensor of ground-truth labels.
        y_pred: Keras tensor of predicted scores.

    Returns:
        Scalar tensor: true positives / (possible positives + epsilon).
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    # K.epsilon() keeps the ratio finite when the batch has no positives.
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

In [3]:
#!/usr/bin/env python
# coding: utf-8

#phi - simple LSTM


import csv
def readCSV(fileName, r, encoding=None):
    """Read a single column from a comma-separated file.

    Args:
        fileName: Path of the CSV file to read.
        r: Zero-based index of the column to extract from every row.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list with the ``r``-th field of each non-blank row, in file order.

    Raises:
        IndexError: If a non-blank row has fewer than ``r + 1`` fields.
    """
    # newline='' hands newline handling to the csv module, so quoted fields
    # that contain line breaks are parsed as the csv docs require.
    with open(fileName, newline='', encoding=encoding) as file:
        reader = csv.reader(file, delimiter=',')
        # Blank lines are yielded as empty rows; skip them rather than crash.
        return [row[r] for row in reader if row]

# In[5]:

# Column index 1 of each corpus file is read and used as the sentence text.
neg = readCSV('../../corpora/verified_non_analogies.csv', 1)
pos = readCSV('../../corpora/verified_analogies.csv', 1)

# Analogies come first (label 1), non-analogies follow (label 0); `labels`
# is built in that same order so it lines up with `texts`.
texts = [*pos, *neg]
labels = len(pos) * [1] + len(neg) * [0]

# In[66]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

maxlen = 28        # sequences are padded/truncated to this many tokens
max_words = 10000  # vocabulary cap: only the most frequent words are kept
test = 40          # number of samples held out for testing
seed = 42          # fixes the shuffle so the train/test split is repeatable

# Build the vocabulary and turn every sentence into a list of word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print("Found {} unique tokens".format(len(word_index)))

data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)

# The split below needs a non-empty training part and a non-empty test part.
assert 0 < test < len(data), "`test` must be positive and smaller than the dataset"

# `texts` is ordered positives-then-negatives, so shuffle before splitting.
# A dedicated seeded generator makes the split reproducible across runs
# without touching NumPy's global random state.
# After this step, row i of `data`/`labels` corresponds to texts[indices[i]].
indices = np.arange(data.shape[0])
np.random.RandomState(seed).shuffle(indices)

data = data[indices]
labels = labels[indices]

# The last `test` rows form the held-out test set; the rest is for training.
split = len(data) - test
train_data, test_data = data[:split], data[split:]
train_targets, test_targets = labels[:split], labels[split:]

print("Shape of data ", data.shape)
print("Shape of label", labels.shape)
print("Shape of train_data", train_data.shape)
print("Shape of test_data", test_data.shape)
print("Shape of train_targets", train_targets.shape)
print("Shape of test_targets", test_targets.shape)

embedding_index = {}

# The pre-trained GloVe vectors (glove.6B.100d.txt) must be downloaded to this
# path before running. Each line is "<word> <v1> <v2> ...".
# The encoding is given explicitly so that non-ASCII tokens load regardless of
# the platform default, and the context manager closes the file even if
# parsing a line raises.
with open("../../glove/glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

print("Found {} words".format(len(embedding_index)))

import numpy as np
# Cross-validation settings.
k = 4  # number of folds
num_epochs = 5
num_val_samples = len(train_data) // k  # rows held out for validation per fold


# Embed sentences
# Row i of the matrix receives the pre-trained vector of the word whose
# tokenizer index is i; rows stay all-zero when the index is outside the
# vocabulary cap or the word has no pre-trained vector.
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in word_index.items():
    if i >= max_words:
        continue
    vector = embedding_index.get(word)
    if vector is None:
        continue
    embedding_matrix[i] = vector


from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, LSTM

def build_model():
    """Create and compile a fresh Embedding -> LSTM(32) -> sigmoid classifier.

    The embedding layer is loaded with ``embedding_matrix`` and frozen before
    compilation, so only the LSTM and the output layer are trained.

    Side effect: the freshly initialised weights are written to
    'pre_trained_glove_model.h5' on every call.

    Returns:
        The compiled Keras model (binary cross-entropy loss, rmsprop, with
        accuracy, precision and recall as metrics).
    """
    embedding = Embedding(max_words, embedding_dim, input_length=maxlen)
    model = Sequential([
        embedding,
        LSTM(32),
        Dense(1, activation='sigmoid'),
    ])

    # Load the pre-trained vectors and freeze them before compiling.
    embedding.set_weights([embedding_matrix])
    embedding.trainable = False

    model.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['acc', precision, recall],
    )
    model.save_weights('pre_trained_glove_model.h5')
    return model

#k-fold training
# Each fold holds out one contiguous slice of the training set for validation
# and trains a fresh model on the remaining rows. Per-epoch accuracies are
# collected per fold so they can be averaged afterwards.
val_acc_history = []
acc_history = []
for i in range(k):
    print('processing fold: #',i)
    val_start = i * num_val_samples
    val_end = (i + 1) * num_val_samples

    val_data = train_data[val_start:val_end]
    val_targets = train_targets[val_start:val_end]

    # Everything outside the validation slice is this fold's training set.
    partial_train_data = np.concatenate(
        [train_data[:val_start], train_data[val_end:]], axis=0)
    partial_train_targets = np.concatenate(
        [train_targets[:val_start], train_targets[val_end:]], axis=0)

    model = build_model()
    history = model.fit(partial_train_data, partial_train_targets,
                        epochs=num_epochs, batch_size=1, verbose=0,
                        validation_data=(val_data, val_targets))
    val_acc_history.append(history.history['val_acc'])
    acc_history.append(history.history['acc'])

# NOTE: `model` here is the model trained in the last fold (it never saw that
# fold's validation slice); it is the one evaluated on the held-out test set.
results = model.evaluate(test_data, test_targets)




Found 2796 unique tokens
Shape of data  (316, 28)
Shape of label (316,)
Shape of train_data (276, 28)
Shape of test_data (40, 28)
Shape of train_targets (276,)
Shape of test_targets (40,)
Found 400000 words
processing fold: # 0
processing fold: # 1
processing fold: # 2
processing fold: # 3


In [4]:
def _mean_per_epoch(fold_histories):
    """Average per-epoch values across folds, one mean per epoch."""
    return [np.mean([fold[epoch] for fold in fold_histories])
            for epoch in range(num_epochs)]

avg_acc_history = _mean_per_epoch(acc_history)
avg_val_acc_history = _mean_per_epoch(val_acc_history)
import matplotlib.pyplot as plt

# Dots: mean training accuracy; solid line: mean validation accuracy.
# Epochs are numbered from 1 on the x axis.
train_epochs = range(1, len(avg_acc_history) + 1)
val_epochs = range(1, len(avg_val_acc_history) + 1)
plt.plot(train_epochs, avg_acc_history, 'bo', label='training acc')
plt.plot(val_epochs, avg_val_acc_history, 'b', label='Validation acc')
plt.xlabel('Epochs')
plt.ylabel('Acc')
plt.legend()
plt.show()

<Figure size 640x480 with 1 Axes>

In [5]:
# Test-set evaluation of the last-fold model, in compile order:
# [loss, acc, precision, recall].
results

[0.5830968677997589, 0.7, 0.6372549176216126, 0.7714285850524902]

In [6]:
# `data` was reordered with `indices` before the train/test split, but `texts`
# was not, so row i of `data` corresponds to texts[indices[i]]. Apply the same
# permutation to the texts before pairing them with the padded rows.
shuffled_texts = [texts[j] for j in indices]

# Lookup from a padded row's repr to its sentence. Rows with identical padded
# sequences share a key, so only the last such sentence is kept here.
table = dict(zip([repr(d) for d in data], shuffled_texts))

# The test set is the last `test` rows of `data`; slicing by position keeps
# the sentences aligned with `test_data` even when padded rows collide.
test_sentences = shuffled_texts[len(data) - test:]

In [7]:
# Model outputs for the held-out test rows. Downstream code reads y[0] per
# row, so presumably the shape is (n_test, 1) with one sigmoid score each
# -- TODO confirm.
y_pred = model.predict(test_data)

In [27]:
# Sort the test sentences into three buckets by prediction confidence:
#   rights     - confident (score < 0.1 or > 0.9) and matching the label
#   wrongs     - confident but not matching the label
#   uncertains - everything else, stored together with the true label
rights, wrongs, uncertains = [], [], []
import math
for idx, (label, scores) in enumerate(zip(test_targets, y_pred)):
    score = scores[0]
    confident = score < 0.1 or score > 0.9
    if not confident:
        uncertains.append((test_sentences[idx], label))
    elif label == round(score):
        rights.append(test_sentences[idx])
    else:
        wrongs.append(test_sentences[idx])

In [32]:
def appendToFile(lst, filename, mode="w"):
    """Write every item of ``lst`` to ``filename``, one ``str(item)`` per line.

    Despite the name, the default mode is "w", so an existing file is
    overwritten; pass ``mode="a"`` to actually append.

    Args:
        lst: Iterable of items to write.
        filename: Path of the output file. Missing parent directories are
            created.
        mode: File mode passed to ``open`` ("w" to overwrite, "a" to append).
    """
    import os

    # Create the target directory on demand so a fresh checkout does not fail.
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # The context manager closes the file; no explicit close() is needed.
    with open(filename, mode) as file:
        for item in lst:
            file.write(str(item) + "\n")

In [33]:
# Share of test samples predicted confidently (score < 0.1 or > 0.9) and
# correctly.
len(rights) / len(test_targets)

0.15

In [34]:
# Number of test samples predicted confidently (score < 0.1 or > 0.9) but
# incorrectly.
len(wrongs)

2

In [35]:
# Write each bucket of the error analysis to its own text file.
for bucket, target in (
    (rights, "./error/rights.txt"),
    (wrongs, "./error/wrongs.txt"),
    (uncertains, "./error/uncertains.txt"),
):
    appendToFile(bucket, target)