# Supervised Baselines

In [1]:
import numpy as np
import pandas as pd
import keras.backend as K

from keras.models import Sequential
from keras import regularizers
from keras.layers import Dense, Dropout, Embedding, Lambda
from keras import optimizers
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

Using TensorFlow backend.


# MLP 

Utilizando la estrategia de decaimiento exponencial.

### Cargamos los datos pre-procesados y los filtramos para obtener:

- 100000 instancias de train
- &nbsp; 20000 instancias de dev
- &nbsp; 20000 instancias de test

In [2]:
def load_split(split, limit, strategy='exp_decay_W_5', corpus_dir='./corpus_WiNER'):
    """Load the first `limit` word vectors and entity tags of one data split.

    Parameters
    ----------
    split : str
        Split name embedded in the file names ('train', 'dev' or 'test').
    limit : int
        Maximum number of leading instances to keep.
    strategy : str
        Suffix of the pre-processing strategy embedded in the file names.
    corpus_dir : str
        Directory that contains the `word_vectors` and `entity_vectors` folders.

    Returns
    -------
    tuple
        `(features, labels)`: the first array stored in each archive,
        truncated to `limit` entries.
    """
    features_path = '{}/word_vectors/wv_{}_{}.npz'.format(corpus_dir, split, strategy)
    labels_path = '{}/entity_vectors/ev_{}_{}.npz'.format(corpus_dir, split, strategy)

    # `NpzFile.items()` is a non-subscriptable view on current NumPy, so the
    # first stored array is looked up by name through `.files` instead of
    # `items()[0][1]`. The context manager closes the archive once it is read.
    with np.load(features_path) as archive:
        features = archive[archive.files[0]][:limit]
    with np.load(labels_path) as archive:
        labels = archive[archive.files[0]][:limit]
    return features, labels


X_train, y_train = load_split('train', 100000)
X_dev, y_dev = load_split('dev', 20000)
X_test, y_test = load_split('test', 20000)

In [3]:
# Report, for each split in turn (train, dev, test), how many instances it
# holds and how many of them carry the 'O' tag.
for vectors, tags in ((X_train, y_train), (X_dev, y_dev), (X_test, y_test)):
    non_entity_total = list(tags).count('O')
    print('# word vectors:', len(vectors))
    print('# non entities', non_entity_total)

# word vectors: 100000
# non entities 55825
# word vectors: 20000
# non entities 9634
# word vectors: 20000
# non entities 10410


In [4]:
def tagToInt(tag):
    """Return the integer class index of an entity tag.

    The mapping is PER -> 0, LOC -> 1, ORG -> 2, MISC -> 3, O -> 4.
    A tag outside this set raises KeyError.
    """
    ordered_tags = ('PER', 'LOC', 'ORG', 'MISC', 'O')
    index_by_tag = {name: position for position, name in enumerate(ordered_tags)}
    return index_by_tag[tag]
# Replace the string tags of every split with their integer class index;
# the Keras to_categorical() call needs integer labels as input.
y_train = list(map(tagToInt, y_train))
y_dev = list(map(tagToInt, y_dev))
y_test = list(map(tagToInt, y_test))

# Preview the first ten encoded training labels.
y_train[:10]

[0, 0, 0, 0, 2, 2, 2, 1, 1, 1]

In [5]:
# Number of target classes: PER, LOC, ORG, MISC and O.
num_classes = 5

# One-hot encode the integer labels of each split into binary class matrices.
y_train, y_dev, y_test = (
    to_categorical(labels, num_classes)
    for labels in (y_train, y_dev, y_test)
)

In [16]:
# Hyperparameters of the MLP. They are plain module-level values: the
# build_model2(...) wrapper meant to generate models for the cross-validation
# grid is currently disabled.
nodes1 = 300
nodes2 = 512
lr = 0.001
l2 = 0.01
drop = 0.1

model = Sequential()

# Input block: 300-dimensional input -> nodes1 ReLU units, followed by dropout.
model.add(Dense(nodes1,
                input_shape=(300,),
                activation='relu',
                kernel_regularizer=regularizers.l2(l2)))
model.add(Dropout(drop))

# Hidden stack: two layers of nodes2 units, then two layers of 256 units.
# Every layer gets its own L2 kernel regularizer instance.
for hidden_units in (nodes2, nodes2, 256, 256):
    model.add(Dense(hidden_units,
                    activation='relu',
                    kernel_regularizer=regularizers.l2(l2)))

# Second dropout before the last hidden layer.
model.add(Dropout(drop))
model.add(Dense(128,
                activation='relu',
                kernel_regularizer=regularizers.l2(l2)))

# Output layer: softmax over the 5 classes (PER - LOC - ORG - MISC - O).
model.add(Dense(5, activation='softmax'))

In [17]:
# Print the layer-by-layer output shapes and parameter counts of the network.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_25 (Dense)             (None, 300)               90300     
_________________________________________________________________
dropout_5 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 512)               154112    
_________________________________________________________________
dense_27 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_28 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_29 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
__________

In [18]:
# Configure training: Adadelta optimizer with the module-level `lr`,
# categorical cross-entropy loss, accuracy as the reported metric.
# NOTE(review): Keras documents lr=1.0 as the Adadelta default; confirm that
# the much smaller lr=0.001 set in the model cell is intentional.
model.compile(
    loss=categorical_crossentropy,
    metrics=['accuracy'],
    optimizer=optimizers.Adadelta(lr=lr),
)

In [19]:
# Training-loop settings forwarded to model.fit().
batch_size = 512  # number of samples per gradient update
epochs = 10  # number of passes over the training split

In [20]:
# Train on the train split and evaluate on the dev split after every epoch.
# The object returned by fit() is stored in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_dev, y_dev),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

Train on 100000 samples, validate on 20000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# CNN 

Utilizando una ventana simétrica de palabras que rodea a la palabra objetivo.

### Cargamos los datos pre-procesados y los filtramos para obtener:

- 100000 instancias de train
- &nbsp; 20000 instancias de dev
- &nbsp; 20000 instancias de test

In [2]:
# Load the pre-processed CNN instances from CSV. `pd` is the pandas alias
# provided by the notebook's import cell; without that import this cell
# raises NameError on a fresh kernel.
input_data = pd.read_csv('./corpus_WiNER/words_entity_cnn_test.csv')

# Show the (rows, columns) shape, then the number of instances per entity tag.
print(input_data.shape)
input_data['entityType'].value_counts()

(518696, 2)


O       273639
MISC     91899
LOC      58713
PER      57497
ORG      36948
Name: entityType, dtype: int64

In [3]:
# Preview the first rows of the loaded frame (`words` and `entityType` columns).
input_data.head()

Unnamed: 0,words,entityType
0,"['', '', 'Pier', 'Francesco', ""d'Jacopo""]",PER
1,"['', 'Pier', 'Francesco', ""d'Jacopo"", 'di']",PER
2,"['Pier', 'Francesco', ""d'Jacopo"", 'di', 'Domen...",PER
3,"['Francesco', ""d'Jacopo"", 'di', 'Domenico', 'T...",PER
4,"[""d'Jacopo"", 'di', 'Domenico', 'Toschi', '-LRB-']",PER
