# Supervised Baselines

In [1]:
import ast
import numpy as np
import pandas as pd
import keras.backend as K

from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Dropout, Flatten, GlobalMaxPooling1D, Input, concatenate
from keras.layers import Conv1D, MaxPooling1D, BatchNormalization, Lambda, Embedding
from keras import regularizers, optimizers
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from corpus_WiNER.corpus_utils import *
from gensim.models import KeyedVectors

Using TensorFlow backend.


# MLP 

Utilizando la estrategia de decaimiento exponencial.

### Cargamos los datos pre-procesados y los filtramos para obtener:

- 100000 instancias de train
- &nbsp; 20000 instancias de dev
- &nbsp; 20000 instancias de test

In [2]:
def _load_first_array(npz_path, limit):
    """Return the first `limit` rows of the first array stored in an .npz archive.

    Args:
        npz_path: path of the .npz file to read.
        limit: maximum number of leading rows to keep.
    Returns:
        The sliced array, fully loaded in memory.
    """
    # `archive.files` lists the stored array names in order. Indexing by name
    # replaces `archive.items()[0][1]`, which fails where items() returns a
    # non-subscriptable view. The `with` block closes the underlying file.
    with np.load(npz_path) as archive:
        return archive[archive.files[0]][:limit]


# Word vectors are the model inputs, entity vectors hold the matching tags.
X_train = _load_first_array('./corpus_WiNER/word_vectors/wv_train_exp_decay_W_5.npz', 100000)
y_train = _load_first_array('./corpus_WiNER/entity_vectors/ev_train_exp_decay_W_5.npz', 100000)
X_dev = _load_first_array('./corpus_WiNER/word_vectors/wv_dev_exp_decay_W_5.npz', 20000)
y_dev = _load_first_array('./corpus_WiNER/entity_vectors/ev_dev_exp_decay_W_5.npz', 20000)
X_test = _load_first_array('./corpus_WiNER/word_vectors/wv_test_exp_decay_W_5.npz', 20000)
y_test = _load_first_array('./corpus_WiNER/entity_vectors/ev_test_exp_decay_W_5.npz', 20000)

In [3]:
# For every split (train, dev, test) report its size and how many of its
# labels are the non-entity tag 'O'.
for vectors, tags in ((X_train, y_train), (X_dev, y_dev), (X_test, y_test)):
    print('# word vectors:', len(vectors))
    print('# non entities', list(tags).count('O'))

# word vectors: 100000
# non entities 55825
# word vectors: 20000
# non entities 9634
# word vectors: 20000
# non entities 10410


In [4]:
# to_categorical() works on integer class ids, so every tag is first mapped
# through tagToInt.
y_train = list(map(tagToInt, y_train))
y_dev = list(map(tagToInt, y_dev))
y_test = list(map(tagToInt, y_test))
y_train[:10]  # show the first encoded training labels

[0, 0, 0, 0, 2, 2, 2, 1, 1, 1]

In [5]:
num_classes = 5  # PER - LOC - ORG - MISC - O

# Turn the integer labels of every split into binary class matrices.
y_train, y_dev, y_test = [
    to_categorical(labels, num_classes)
    for labels in (y_train, y_dev, y_test)
]

In [16]:
# Hyper-parameters of the MLP baseline.
nodes1 = 300  # units of the first hidden layer
nodes2 = 512  # units of the second and third hidden layers
lr = 0.001    # learning rate, consumed when the model is compiled
l2 = 0.01     # L2 penalty applied to every hidden kernel
drop = 0.1    # rate of both dropout layers


def _regularized_relu(units, **kwargs):
    """Build a Dense ReLU layer whose kernel carries the shared L2 penalty."""
    return Dense(units,
                 activation='relu',
                 kernel_regularizer=regularizers.l2(l2),
                 **kwargs)


# Layers are created in list order, i.e. in the order they are stacked.
model = Sequential([
    _regularized_relu(nodes1, input_shape=(300,)),  # 300-dimensional inputs
    Dropout(drop),
    _regularized_relu(nodes2),
    _regularized_relu(nodes2),
    _regularized_relu(256),
    _regularized_relu(256),
    Dropout(drop),
    _regularized_relu(128),
    Dense(5, activation='softmax'),  # PER - LOC - ORG - MISC - O
])

In [17]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the MLP.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_25 (Dense)             (None, 300)               90300     
_________________________________________________________________
dropout_5 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 512)               154112    
_________________________________________________________________
dense_27 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_28 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_29 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
__________

In [18]:
# Configure training: categorical cross-entropy loss, accuracy as the reported
# metric, and Adadelta with the learning rate `lr` set in the model cell.
# NOTE(review): the CNN section compiles with Adadelta() defaults, whereas this
# model overrides lr with 0.001 — confirm the difference is intentional.
model.compile(optimizer = optimizers.Adadelta(lr = lr),
              loss = categorical_crossentropy,
              metrics = ['accuracy'])

In [19]:
batch_size = 512  # passed to model.fit as batch_size
epochs = 10  # passed to model.fit as epochs

In [20]:
# Train the MLP on the train split, using the dev split as validation data.
# The value returned by fit() is kept in `history`.
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_dev, y_dev))

Train on 100000 samples, validate on 20000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# CNN 

Utilizando una ventana simétrica de palabras que rodea a la palabra objetivo.

### Cargamos los datos pre-procesados y los filtramos para obtener:

- 100000 instancias de train
- &nbsp; 20000 instancias de dev
- &nbsp; 20000 instancias de test

In [2]:
# Read the train, dev and test CSV files holding the CNN instances.
data_train, data_dev, data_test = [
    pd.read_csv(csv_path)
    for csv_path in (
        './corpus_WiNER/cnn_instances/words_entity_W_2_cnn_train.csv',
        './corpus_WiNER/cnn_instances/words_entity_W_2_cnn_dev.csv',
        './corpus_WiNER/cnn_instances/words_entity_W_2_cnn_test.csv',
    )
]
data_train.head()

Unnamed: 0,words,entityType
0,"['', '', 'Watching', 'Ellie', 'is']",MISC
1,"['', 'Watching', 'Ellie', 'is', 'an']",MISC
2,"['Watching', 'Ellie', 'is', 'an', 'American']",O
3,"['is', 'an', 'American', 'sitcom', 'that']",LOC
4,"['that', 'starred', 'Julia', 'Louis-Dreyfus', ...",PER


In [3]:
def _head_columns(frame, limit):
    """Return the first `limit` values of the 'words' and 'entityType' columns."""
    return frame['words'].values[:limit], frame['entityType'].values[:limit]


# 100000 training instances, 20000 each for dev and test.
X_train, y_train = _head_columns(data_train, 100000)
X_dev, y_dev = _head_columns(data_dev, 20000)
X_test, y_test = _head_columns(data_test, 20000)

In [4]:
# For every split (train, dev, test) report its size and how many of its
# labels are the non-entity tag 'O'.
for windows, tags in ((X_train, y_train), (X_dev, y_dev), (X_test, y_test)):
    print('# instances:', len(windows))
    print('# non entities', list(tags).count('O'))

# instances: 100000
# non entities 54508
# instances: 20000
# non entities 9590
# instances: 20000
# non entities 10856


In [5]:
# Map every tag through tagToInt; to_categorical() later expects integer
# class ids.
y_train, y_dev, y_test = [
    [tagToInt(tag) for tag in tags]
    for tags in (y_train, y_dev, y_test)
]
y_train[:10]  # show the first encoded training labels

[3, 3, 4, 1, 0, 0, 0, 0, 4, 2]

In [9]:
def transform_input(instances, mapping, unknown_index=0):
    """Replace the words of every instance with their index in `mapping`.

    Args:
        instances: an iterable of strings, each one the literal representation
            of a list of words (e.g. "['is', 'an', 'American']"); it is parsed
            with ast.literal_eval.
        mapping: a dictionary keyed by word. Each value is either an object
            exposing an integer `.index` attribute (as the entries of
            `w2v_model.vocab` used by the callers in this notebook) or the
            integer index itself.
        unknown_index: index used for words missing from `mapping`. The
            default 0 keeps the original behaviour, where missing words share
            the row the original code labelled as the '</s>' word vector.
    Returns:
        A list with one list of integer indices per instance, in input order.
    """
    word_indices = []
    for instance in instances:
        row = []
        for word in ast.literal_eval(instance):
            # Keep the try body minimal so only a failed lookup is treated as
            # "unknown word".
            try:
                entry = mapping[word]
            except KeyError:
                row.append(unknown_index)
            else:
                # Vocabulary objects carry the index as an attribute; plain
                # integer values are used as they are.
                row.append(getattr(entry, 'index', entry))
        word_indices.append(row)

    return word_indices

In [7]:
# Load the saved KeyedVectors word-embedding model from disk.
w2v_model = KeyedVectors.load('./models/google/word2vecGoogle.model')

In [8]:
# Replace the words of every split with their indices in the embedding
# vocabulary.
X_train, X_dev, X_test = [
    transform_input(split, w2v_model.vocab)
    for split in (X_train, X_dev, X_test)
]

In [9]:
# Training configuration of the CNN.
num_classes = 5   # PER - LOC - ORG - MISC - O
batch_size = 512  # for mini-batch gradient descent
epochs = 10
len_words = 5
input_size = len_words  # amount of words by row
train_examples, test_examples = len(X_train), len(X_test)

# Turn the integer labels of every split into binary class matrices.
y_train, y_dev, y_test = [
    to_categorical(labels, num_classes)
    for labels in (y_train, y_dev, y_test)
]

In [10]:
# Conv1D shape conventions, see
# https://stackoverflow.com/questions/43396572/dimension-of-shape-in-conv1d
#   input:  3D tensor with shape `(batch, steps, channels)`
#   output: 3D tensor with shape `(batch, new_steps, filters)`
channels = 1
steps = 5  # number of words in each instance
input_shape = (steps, channels)

# Convert the lists of index rows into numpy arrays.
X_train, X_dev, X_test = [
    np.asarray(rows) for rows in (X_train, X_dev, X_test)
]

In [11]:
# Sanity check: one row per instance, one column per word of the window.
X_train.shape

(100000, 5)

In [12]:
# Peek at the first five rows of word indices.
X_train[:5]

array([[    0,     0, 14985, 28491,     4],
       [    0, 14985, 28491,     4,    27],
       [14985, 28491,     4,    27,   259],
       [    4,    27,   259, 17470,     3],
       [    3,  9483, 10751,     0,     0]])

### CNN Model

In [13]:
conv_filters = 10  # number of filters of every Conv1D layer below
pool_size = 2  # pool_size of every MaxPooling1D layer below
# Model input: one row of word indices per instance; the row length is taken
# from the second dimension of X_train.
inp = Input(shape=(X_train.shape[1],))
print(inp.shape)

(?, 5)


In [14]:
# Embedding layer initialised with the loaded word vectors and kept frozen; it
# turns every word index of the input into its embedding vector.
emb = Embedding(len(w2v_model.vocab),  # Vocabulary size
                w2v_model.vector_size, # Embedding size
                weights=[w2v_model.vectors], # Word vectors
                trainable=False  # This indicates the word vectors must not be changed
                                 # during training.
      )(inp)
print(emb.shape)
# The output here has shape (batch_size (?), words_per_instance, embedding_size)

(?, 5, 300)


In [15]:
# Conv1D consumes a 3D tensor `(batch, steps, channels)` and produces a 3D
# tensor `(batch, new_steps, filters)`; `new_steps` might differ from `steps`
# due to padding or strides.


def _ngram_branch(kernel_size):
    """Build one convolutional branch over `emb` for the given kernel size.

    Returns the outputs of the Conv1D, BatchNormalization, MaxPooling1D and
    Flatten layers, in that order.
    """
    conv = Conv1D(filters=conv_filters, kernel_size=kernel_size, activation='relu')(emb)
    norm = BatchNormalization()(conv)
    pooled = MaxPooling1D(pool_size=pool_size)(norm)
    return conv, norm, pooled, Flatten()(pooled)


# One branch per kernel size, i.e. per n-gram length (2, 3 and 4 words).
conv1_1, btch1_1, maxp1_1, flat1_1 = _ngram_branch(2)
conv1_2, btch1_2, maxp1_2, flat1_2 = _ngram_branch(3)
conv1_3, btch1_3, maxp1_3, flat1_3 = _ngram_branch(4)

# Gather all branches, then classify with a dense head.
cnct = concatenate([flat1_1, flat1_2, flat1_3], axis=1)
drp1 = Dropout(0)(cnct)  # dropout rate is 0 here

dns1 = Dense(128, activation='relu')(drp1)
out = Dense(num_classes, activation='softmax')(dns1)

In [16]:
# Build the CNN model that maps the word-index input to the softmax output.
model = Model(inputs=inp, outputs=out)

In [17]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections) of the CNN.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 5)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 5, 300)       900000000   input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 4, 10)        6010        embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 3, 10)        9010        embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [18]:
# Configure training: categorical cross-entropy loss, Adadelta with its default
# arguments, and accuracy as the reported metric.
model.compile(loss=categorical_crossentropy,
              optimizer=optimizers.Adadelta(),
              metrics=['accuracy'])

In [19]:
# Train the CNN on the train split, using the dev split as validation data.
# The value returned by fit() is kept in `history`.
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_dev, y_dev))

Train on 100000 samples, validate on 20000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [2]:
# Load the CSV with the metrics recorded for the trained CNN models.
metrics = pd.read_csv('./models/cnn_metrics.csv')

In [5]:
# Rank the models by dev accuracy, best first, and show the top ten.
sorted_metrics = metrics.sort_values('dev_acc', ascending=False)
sorted_metrics.head(10)

Unnamed: 0,model_name,train_loss,dev_loss,train_acc,dev_acc
32,cnn_num_filters_100_pool_size_1_drop_0.5_l2_1....,0.173732,0.973376,0.96393,0.80485
23,cnn_num_filters_10_pool_size_1_drop_0.5_l2_0.5...,0.418569,0.637613,0.86466,0.8048
31,cnn_num_filters_100_pool_size_1_drop_0.5_l2_0....,0.150977,0.993586,0.96606,0.8047
26,cnn_num_filters_50_pool_size_1_drop_0.5_l2_0.5...,0.201993,0.876053,0.94491,0.8046
24,cnn_num_filters_10_pool_size_1_drop_0.5_l2_1.0...,0.436353,0.634725,0.86098,0.80265
27,cnn_num_filters_50_pool_size_1_drop_0.5_l2_1.0...,0.227528,0.8519,0.9409,0.8025
30,cnn_num_filters_100_pool_size_1_drop_0.3_l2_0....,0.084707,1.127969,0.98377,0.801
25,cnn_num_filters_50_pool_size_1_drop_0.3_l2_0.3...,0.122047,1.092002,0.97107,0.8008
18,cnn_num_filters_100_pool_size_2_drop_0.5_l2_0....,0.14158,1.062206,0.96539,0.79795
8,cnn_num_filters_20_pool_size_2_drop_0.5_l2_0.5...,0.328026,0.738538,0.89535,0.7976


In [6]:
# The first row of the ranking holds the model with the highest dev accuracy;
# its name is also the stem of the saved .h5 file.
model_name = sorted_metrics['model_name'].iloc[0]
print(model_name)
best_model = load_model('./models/saved/' + model_name + '.h5')

cnn_num_filters_100_pool_size_1_drop_0.5_l2_1.0_batch_size_512_epochs_100


In [10]:
# Rebuild the CNN test set: first 20000 rows, words replaced by vocabulary
# indices and tags turned into one-hot vectors over the 5 classes.
test_data = pd.read_csv('./corpus_WiNER/cnn_instances/words_entity_W_2_cnn_test.csv')
w2v_model = KeyedVectors.load('./models/google/word2vecGoogle.model')

X_test = test_data['words'].values[:20000]
X_test = np.asarray(transform_input(X_test, w2v_model.vocab))

y_test = test_data['entityType'].values[:20000]
y_test = to_categorical(list(map(tagToInt, y_test)), 5)

In [13]:
# Evaluate the loaded model on the test split; the result is read below as
# [0] = loss and [1] = accuracy.
performance = best_model.evaluate(X_test, y_test)



In [17]:
# Report the test loss and the test accuracy, one per line.
print('Test loss: {}'.format(performance[0]),
      'Test accuracy: {}'.format(performance[1]),
      sep='\n')

Test loss: 0.8748557825446129
Test accuracy: 0.827
