In [None]:
import manager_dataset as MD
import question_processing as QP

In [None]:
# Load the question dataset through the project helper.
# NOTE(review): `treated=True` presumably selects the pre-processed variant - confirm in manager_dataset.
questions = MD.questions(treated=True)
# Class labels that are excluded from this experiment.
incosiderate_classes = ['X', 'MANNER', 'OBJECT', 'OTHER', 'DEFINITION']
# Remove every question whose class is listed above. This rebinds `questions`,
# so the unfiltered list is no longer reachable after this cell.
questions = QP.remove_incosiderate_classes(questions, incosiderate_classes)

In [None]:
# Partition the filtered questions into a training part and a test part.
# NOTE(review): ratio / shuffling / seeding are decided inside MD.split_questions - confirm it is deterministic.
train_questions, test_questions = MD.split_questions(questions)

In [None]:
# Display the first training record to inspect its structure
# (later cells read its 'question' and 'class' keys).
train_questions[0]

In [None]:
# Split every record into two parallel lists per partition:
# the question text (model input) and its class label (target).
train_text = [record['question'] for record in train_questions]
train_class = [record['class'] for record in train_questions]

test_text = [record['question'] for record in test_questions]
test_class = [record['class'] for record in test_questions]

In [None]:
from keras.preprocessing import text


# Hyper-parameters reused by the model cells further down.
vocab_size = 1000
batch_size = 32

# The tokenizer is fitted on the training questions only, so the test
# questions are encoded with a vocabulary they did not contribute to.
tokenize = text.Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(train_text)

# Encode both partitions with the same fitted tokenizer.
x_train, x_test = [tokenize.texts_to_matrix(corpus)
                   for corpus in (train_text, test_text)]

In [None]:
from sklearn.preprocessing import LabelBinarizer


# Fit the label encoder on the training labels only, then encode both partitions with it.
encoder = LabelBinarizer().fit(train_class)
y_train, y_test = [encoder.transform(labels)
                   for labels in (train_class, test_class)]

# Class names known to the encoder, and how many there are
# (used as the size of the output layer in the model cells).
text_labels = encoder.classes_
num_labels = len(text_labels)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM, Dropout

In [None]:
import numpy as np

# Reshape input to be [samples, time steps, features] with a single time step,
# which is what the LSTM cell below declares via input_shape=(look_back, vocab_size).
#
# The feature count is taken from the LAST axis rather than axis 1 so that this
# cell is idempotent: on a second run the arrays are already 3-D, axis 1 is the
# time-step axis (size 1), and reshaping with it would raise a ValueError.
x_train = np.reshape(x_train, (x_train.shape[0], 1, x_train.shape[-1]))
x_test = np.reshape(x_test, (x_test.shape[0], 1, x_test.shape[-1]))

In [None]:
# Baseline feed-forward classifier: one hidden ReLU layer followed by a softmax
# over the question classes.
#
# The first Dense layer declares input_shape=(vocab_size,), i.e. 2-D input
# [samples, vocab_size]. The preceding cell rebinds `x_train` to a 3-D array
# [samples, 1, vocab_size], so fitting on it directly fails with a shape
# mismatch when the notebook is run top to bottom. A local 2-D view is built
# here instead; it is a no-op if `x_train` is still 2-D, so the cell works
# whether or not the reshape cell has been executed.
x_train_2d = np.reshape(x_train, (x_train.shape[0], x_train.shape[-1]))

model = Sequential()

model.add(Dense(2048, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dense(num_labels))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

# 10% of the training data is held out for validation during fitting.
history = model.fit(x_train_2d, y_train, 
                    batch_size=batch_size, 
                    epochs=2, 
                    verbose=1, 
                    validation_split=0.1)

In [None]:
# Number of time steps per sample fed to the recurrent layer.
look_back = 1

# Recurrent classifier: a single LSTM layer followed by a softmax over the classes.
# This rebinds `model` and `history`, replacing the ones from the previous cell.
model = Sequential([
    LSTM(1024, input_shape=(look_back, vocab_size)),
    Dense(num_labels),
    Activation('softmax'),
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=2,
                    verbose=1)

In [None]:
# Evaluate the most recently fitted `model` on the held-out test partition.
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)

# The first entry is reported as the score and the second as the accuracy.
test_score, test_accuracy = score[0], score[1]
print('Test score:', test_score)
print('Test accuracy:', test_accuracy)

In [None]:
import numpy as np


# Per-class scores predicted for every test sample.
y_softmax = model.predict(x_test)

# True class index per sample: position of the first non-zero entry
# in each encoded label row.
y_test_1d = [np.nonzero(label_row)[0].item(0) for label_row in y_test]

# Predicted class index per sample: position of the highest score.
y_pred_1d = [np.argmax(score_row) for score_row in y_softmax]

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools
import os


def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    title : str
        Title drawn above the heat map.
    cmap : matplotlib colormap
        Colormap used for the cells.
    normalize : bool, default True
        When True (the behaviour this function always had) every row is
        divided by its sum, so each cell shows the fraction of that true
        class. Rows whose sum is zero are shown as 0.00 instead of NaN.
        When False the values are plotted as given.

    Returns
    -------
    None. The function draws on the current pyplot figure and does not call
    `plt.show()`; the caller decides when to render.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A zero row would otherwise produce NaN; NaN also makes cm.max() NaN,
        # which breaks the text-colour threshold for every cell of the plot.
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals
        fmt = '.2f'
    else:
        # Raw counts are integers; fall back to 2 decimals for float input.
        fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=15)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, fontsize=11)
    plt.yticks(tick_marks, classes, fontsize=11)

    # Cells darker than half the maximum get white text so it stays readable.
    thresh = cm.max() / 2. if cm.size else 0.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=12)
    plt.xlabel('Predicted label', fontsize=12)
    
# Build the confusion matrix from the true / predicted class indices and plot it.
# NOTE(review): the tick labels come from `text_labels`; this assumes the matrix
# has one row/column per entry of `text_labels` (i.e. every class index occurs in
# y_test_1d or y_pred_1d) - confirm, or pass an explicit `labels=` list.
cnf_matrix = confusion_matrix(y_test_1d, y_pred_1d)
plt.figure(figsize=(9,7))
plot_confusion_matrix(cnf_matrix, classes=text_labels, title="Confusion matrix")
plt.show()

In [2]:
%%time
from gensim.models import Word2Vec
from gensim.models import KeyedVectors


# Path to the pre-trained embeddings in word2vec text format.
w2v_patch = 'data/word_embedding/cbow_s50.txt'  # Fast test
# Undecodable bytes in the file are skipped rather than raising.
w2v = KeyedVectors.load_word2vec_format(w2v_patch, unicode_errors="ignore")

# `index2word` and `syn0` are deprecated gensim attribute names; newer releases
# expose the same data as `index_to_key` and `vectors`. Prefer the new names and
# fall back to the old ones so the cell runs on either generation of gensim.
_w2v_words = w2v.index_to_key if hasattr(w2v, 'index_to_key') else w2v.index2word
_w2v_vectors = w2v.vectors if hasattr(w2v, 'vectors') else w2v.syn0

# Plain dict mapping each vocabulary word to its embedding vector.
model_w2v = dict(zip(_w2v_words, _w2v_vectors))

  import sys


Wall time: 35.4 s


In [None]:
# NOTE(review): looks up the empty string in the word -> vector dict; a plain
# dict raises KeyError when the key is absent. This looks like a leftover probe -
# confirm intent or replace '' with a real vocabulary word.
model_w2v['']