In [1]:
import os
import random
import numpy as np

from collections import namedtuple

from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


# TREC - Question Answering (multi-class)

In [2]:
# TREC - Question Answering
# http://cogcomp.cs.illinois.edu/Data/QA/QC/

TREC_Question = namedtuple("TREC_Question", "label question")

trec_train = set()
trec_test = set()

for filename in os.listdir("TREC/"):
    with open("TREC/"+filename,'r', encoding='latin_1') as f_input:
        for line in f_input:
            label, question = line.split(' ', 1)
            label = label.split(':')[0]
            question = TREC_Question(label, question.strip())
            if filename=='TREC_10.label':
                trec_test.add(question)
            else:
                trec_train.add(question)

In [3]:
print("Train Samples: {}".format(len(trec_train)))
print("Test Samples : {}".format(len(trec_test)))
print("Labels       : {}".format({x.label for x in trec_train}))

Train Samples: 5381
Test Samples : 500
Labels       : {'NUM', 'LOC', 'ABBR', 'HUM', 'DESC', 'ENTY'}


In [4]:
# built two lists with sentences and labels
questions_train = [x.question for x in trec_train]
labels_train = [x.label for x in trec_train]

# convert list of tokens/words to indexes
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions_train)
sequences_train = tokenizer.texts_to_sequences(questions_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# get the max sentence lenght, needed for padding
max_input_lenght = max([len(x) for x in sequences_train])
print("Max. sequence lenght: ", max_input_lenght)

# pad all the sequences of indexes to the 'max_input_lenght'
data_train = pad_sequences(sequences_train, maxlen=max_input_lenght, padding='post', truncating='post')

# Encode the labels, each must be a vector with dim = num. of possible labels
le = LabelEncoder()
le.fit(labels_train)
labels_encoded_train = le.transform(labels_train)
categorical_labels_train = to_categorical(labels_encoded_train, num_classes=None)
print('Shape of train data tensor:', data_train.shape)
print('Shape of train label tensor:', categorical_labels_train.shape)

Found 8461 unique tokens.
Max. sequence lenght:  33
Shape of train data tensor: (5381, 33)
Shape of train label tensor: (5381, 6)


## TREC: test data

In [32]:
# pre-process test data
questions_test = [x.question for x in trec_test]
y_test = [x.label for x in trec_test]
sequences_test = tokenizer.texts_to_sequences(questions_test)
x_test = pad_sequences(sequences_test, maxlen=max_input_lenght)
print('Shape of test data tensor:', x_test.shape)

Shape of test data tensor: (500, 33)


In [6]:
from convnets_utils import *

# CNN with random word embeddings

In [24]:
model_1 = get_cnn_rand(200, len(word_index)+1, max_input_lenght, 6)

In [38]:
history = model_1.fit(x=data_train, y=categorical_labels_train, batch_size=32, epochs=2)

Epoch 1/2
Epoch 2/2


In [49]:
raw_predictions = model_1.predict(x_test)
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(y_test, le.inverse_transform(class_predictions)))

             precision    recall  f1-score   support

       ABBR       0.00      0.00      0.00         9
       DESC       0.34      0.95      0.50       138
       ENTY       0.00      0.00      0.00        94
        HUM       0.00      0.00      0.00        65
        LOC       0.00      0.00      0.00        81
        NUM       0.00      0.00      0.00       113

avg / total       0.09      0.26      0.14       500



  'precision', 'predicted', average, warn_for)


# CNN with pre-trained static word embeddings

In [42]:
embeddings_index = load_embeddings()
embeddings_matrix = create_embeddings_matrix(embeddings_index, len(word_index)+1, 100)
embedding_layer_static = get_embeddings_layer(embeddings_matrix, 'embedding_layer_static', max_input_lenght, trainable=False)
model_2 = get_cnn_pre_trained_embeddings(embedding_layer_static, max_input_lenght, 6)

Found 400000 word vectors.
Matrix shape: (8463, 100)


In [43]:
history = model_2.fit(x=data_train, y=categorical_labels_train, batch_size=32, epochs=2)

Epoch 1/2
Epoch 2/2


In [44]:
raw_predictions = model_2.predict(x_test)
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(y_test, le.inverse_transform(class_predictions)))

             precision    recall  f1-score   support

       ABBR       0.00      0.00      0.00         9
       DESC       0.28      1.00      0.43       138
       ENTY       0.00      0.00      0.00        94
        HUM       0.00      0.00      0.00        65
        LOC       0.00      0.00      0.00        81
        NUM       0.00      0.00      0.00       113

avg / total       0.08      0.28      0.12       500



  'precision', 'predicted', average, warn_for)


# CNN with pre-trained dynamic word embeddings

In [52]:
embedding_layer_dynamic = get_embeddings_layer(embeddings_matrix, 'embedding_layer_dynamic', max_input_lenght, trainable=True)
model_3 = get_cnn_pre_trained_embeddings(embedding_layer_dynamic, max_input_lenght, 6)

In [53]:
history = model_3.fit(x=data_train, y=categorical_labels_train, batch_size=32, epochs=2)

Epoch 1/2
Epoch 2/2


In [55]:
raw_predictions = model_3.predict(x_test)
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(y_test, le.inverse_transform(class_predictions)))

             precision    recall  f1-score   support

       ABBR       0.00      0.00      0.00         9
       DESC       0.29      0.98      0.44       138
       ENTY       0.00      0.00      0.00        94
        HUM       0.00      0.00      0.00        65
        LOC       0.00      0.00      0.00        81
        NUM       0.00      0.00      0.00       113

avg / total       0.08      0.27      0.12       500



  'precision', 'predicted', average, warn_for)


# CNN multichanell with pre-trained dynamic and static word embeddings

In [65]:
model_4 = get_cnn_multichannel(embedding_layer_static, embedding_layer_dynamic, max_input_lenght, 6)

In [66]:
history = model_4.fit(x=[data_train, data_train], y=categorical_labels_train, batch_size=32, epochs=2)

Epoch 1/2
Epoch 2/2


In [68]:
raw_predictions = model_4.predict([x_test, x_test])
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(y_test, le.inverse_transform(class_predictions)))

             precision    recall  f1-score   support

       ABBR       0.00      0.00      0.00         9
       DESC       0.28      1.00      0.43       138
       ENTY       0.00      0.00      0.00        94
        HUM       0.00      0.00      0.00        65
        LOC       0.00      0.00      0.00        81
        NUM       0.00      0.00      0.00       113

avg / total       0.08      0.28      0.12       500



  'precision', 'predicted', average, warn_for)
