# Elmo + Embeddings

Probemos si usando también los embeddings de fastText obtenemos algo razonable...



In [1]:
import pandas as pd
import csv
import numpy as np
import tensorflow as tf
import random
import torch

torch.manual_seed(2019)
np.random.seed(2019)
tf.random.set_random_seed(2019)
random.seed(2019)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

df_dev = pd.read_table("../../../data/es/dev_es.tsv", index_col="id", quoting=csv.QUOTE_NONE)
df_train = pd.read_table("../../../data/es/train_es.tsv", index_col="id", quoting=csv.QUOTE_NONE)
df_test = pd.read_table("../../../data/es/reference_es.tsv", header=None, 
                        names=["text", "HS", "TR", "AG"], quoting=csv.QUOTE_NONE)


text_train, y_train = df_train["text"], df_train["HS"]
text_dev, y_dev = df_dev["text"], df_dev["HS"]
text_test, y_test = df_test["text"], df_test["HS"]

print("Instancias de entrenamiento: {}".format(len(df_train)))
print("Instancias de desarrollo: {}".format(len(df_dev)))
print("Instancias de test: {}".format(len(df_test)))


Instancias de entrenamiento: 4500
Instancias de desarrollo: 500
Instancias de test: 1600


In [2]:
import fastText
import os


model = fastText.load_model(os.path.expanduser("../../../WordVectors/wiki.es.bin"))

In [3]:
embedding_shape = model.get_word_vector("pepe").shape


Tengo que hacer dos cosas:

- Primero, convertir los tweets a secuencias de texto
- Luego, paddear las secuencias a cierta longitud (Keras necesita esto para poder paralelizar cálculo)

In [6]:
from nltk.tokenize import TweetTokenizer
from keras.preprocessing.sequence import pad_sequences


max_length = 40

tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

def preprocess_tweet(tweet):
    tokens = tokenizer.tokenize(tweet)
    
    if len(tokens) >= max_length:
        tokens = tokens[:max_length]
    else:
        tokens = tokens + [''] * (max_length - len(tokens))
    return tokens


tokens_train = [preprocess_tweet(tweet) for tweet in df_train["text"].values]
tokens_dev = [preprocess_tweet(tweet) for tweet in df_dev["text"].values]
tokens_test = [preprocess_tweet(tweet) for tweet in df_test["text"].values]

Using TensorFlow backend.


In [7]:
%%capture
from elmoformanylangs import Embedder

e = Embedder("../../../models/elmo/es/")

Carguemos embeddings

In [8]:
import numpy as np


X_train = np.array(e.sents2elmo(tokens_train))
X_dev = np.array(e.sents2elmo(tokens_dev))
X_test = np.array(e.sents2elmo(tokens_test))

In [9]:
def get_embeddings(toks):
    ret = []
    
    for tok in toks:
        vec = model.get_word_vector(tok)
        ret.append(vec)
    return ret

X_emb_train = np.array([get_embeddings(toks) for toks in tokens_train])
X_emb_dev = np.array([get_embeddings(toks) for toks in tokens_dev])
X_emb_test = np.array([get_embeddings(toks) for toks in tokens_test])

X_emb_train.shape, X_emb_dev.shape, X_emb_test.shape

((4500, 40, 300), (500, 40, 300), (1600, 40, 300))

In [30]:
from keras.models import Model
from keras.layers import Dense, Embedding, Dropout, CuDNNLSTM, CuDNNGRU, Input, Concatenate, Bidirectional
from keras.optimizers import Adam
from keras.preprocessing import sequence

embedding_dim = 1024

optimizer_args = {
    "lr": 0.00025,
    "decay": 0.01,
}

elmo_input = Input(shape=X_train[0].shape)
emb_input = Input(shape=X_emb_train[0].shape)

x = Concatenate()([elmo_input, emb_input])
x = Bidirectional(CuDNNLSTM(256))(x)
x = Dropout(0.50)(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[elmo_input, emb_input], outputs=[output])

model.compile(loss='binary_crossentropy', 
              optimizer=Adam(**optimizer_args), 
              metrics=['accuracy'])
print(model.summary())



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           (None, 40, 1024)     0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           (None, 40, 300)      0                                            
__________________________________________________________________________________________________
concatenate_9 (Concatenate)     (None, 40, 1324)     0           input_17[0][0]                   
                                                                 input_18[0][0]                   
__________________________________________________________________________________________________
bidirectional_9 (Bidirectional) (None, 512)          3239936     concatenate_9[0][0]              
__________

In [31]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

checkpointer = ModelCheckpoint('/tmp/lstm_model.h5', save_best_only=True, monitor='val_acc', verbose=1)
early_stopper = EarlyStopping(monitor='val_loss', patience=5)


model.fit([X_train, X_emb_train], y_train, 
          callbacks=[checkpointer, early_stopper],
          validation_data=([X_dev, X_emb_dev], y_dev), epochs=100, batch_size=32)


Train on 4500 samples, validate on 500 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.68200, saving model to /tmp/lstm_model.h5
Epoch 2/100

Epoch 00002: val_acc improved from 0.68200 to 0.73000, saving model to /tmp/lstm_model.h5
Epoch 3/100

Epoch 00003: val_acc improved from 0.73000 to 0.75600, saving model to /tmp/lstm_model.h5
Epoch 4/100

Epoch 00004: val_acc did not improve from 0.75600
Epoch 5/100

Epoch 00005: val_acc improved from 0.75600 to 0.78200, saving model to /tmp/lstm_model.h5
Epoch 6/100

Epoch 00006: val_acc improved from 0.78200 to 0.80800, saving model to /tmp/lstm_model.h5
Epoch 7/100

Epoch 00007: val_acc did not improve from 0.80800
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.80800
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.80800
Epoch 10/100

Epoch 00010: val_acc improved from 0.80800 to 0.81000, saving model to /tmp/lstm_model.h5
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.81000


<keras.callbacks.History at 0x7f6640a35dd8>

In [33]:
from hate.utils import print_evaluation
print("biLSTM - Elmo+Embeddings -- \n\n")
print("Evaluación sobre dev")

model.load_weights(checkpointer.filepath)

print_evaluation(model, [X_dev, X_emb_dev], y_dev)




biLSTM - Elmo+Embeddings -- 


Evaluación sobre dev
Loss           : 0.4614
Accuracy       : 0.8100
Precision(1)   : 0.7679
Precision(1)   : 0.8479
Precision(avg) : 0.8079

Recall(1)      : 0.8198
Recall(0)      : 0.8022
Recall(avg)    : 0.8110

F1(1)          : 0.7930
F1(0)          : 0.8244
F1(avg)        : 0.8087


In [36]:
embedding_dim = 1024

optimizer_args = {
    "lr": 0.0005,
    "decay": 0.01,
}

elmo_input = Input(shape=X_train[0].shape)
emb_input = Input(shape=X_emb_train[0].shape)

x = Concatenate()([elmo_input, emb_input])
x = Bidirectional(CuDNNGRU(256))(x)
x = Dropout(0.65)(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[elmo_input, emb_input], outputs=[output])

model.compile(loss='binary_crossentropy', 
              optimizer=Adam(**optimizer_args), 
              metrics=['accuracy'])
print(model.summary())


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           (None, 40, 1024)     0                                            
__________________________________________________________________________________________________
input_22 (InputLayer)           (None, 40, 300)      0                                            
__________________________________________________________________________________________________
concatenate_11 (Concatenate)    (None, 40, 1324)     0           input_21[0][0]                   
                                                                 input_22[0][0]                   
__________________________________________________________________________________________________
bidirectional_11 (Bidirectional (None, 512)          2429952     concatenate_11[0][0]             
__________

In [37]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

checkpointer = ModelCheckpoint('/tmp/lstm_model.h5', save_best_only=True, monitor='val_acc', verbose=1)
early_stopper = EarlyStopping(monitor='val_loss', patience=5)


model.fit([X_train, X_emb_train], y_train, 
          callbacks=[checkpointer, early_stopper],
          validation_data=([X_dev, X_emb_dev], y_dev), epochs=100, batch_size=32)


Train on 4500 samples, validate on 500 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.66200, saving model to /tmp/lstm_model.h5
Epoch 2/100

Epoch 00002: val_acc improved from 0.66200 to 0.74000, saving model to /tmp/lstm_model.h5
Epoch 3/100

Epoch 00003: val_acc improved from 0.74000 to 0.77800, saving model to /tmp/lstm_model.h5
Epoch 4/100

Epoch 00004: val_acc improved from 0.77800 to 0.79800, saving model to /tmp/lstm_model.h5
Epoch 5/100

Epoch 00005: val_acc improved from 0.79800 to 0.82200, saving model to /tmp/lstm_model.h5
Epoch 6/100

Epoch 00006: val_acc did not improve from 0.82200
Epoch 7/100

Epoch 00007: val_acc did not improve from 0.82200
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.82200
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.82200
Epoch 10/100

Epoch 00010: val_acc did not improve from 0.82200
Epoch 11/100

Epoch 00011: val_acc improved from 0.82200 to 0.82400, saving model to /tmp/lstm_model.h5
Epoch 12/100

Epoch 0

<keras.callbacks.History at 0x7f6640800518>

In [38]:
from hate.utils import print_evaluation
print("biGRU - Elmo+Embeddings -- \n\n")
print("Evaluación sobre dev")
print_evaluation(model, [X_dev, X_emb_dev], y_dev)
print("\n\nEvaluación sobre test")
print_evaluation(model, [X_test, X_emb_test], y_test)



biLSTM - Elmo+Embeddings -- 


Evaluación sobre dev
Loss           : 0.4569
Accuracy       : 0.8000
Precision(1)   : 0.7723
Precision(1)   : 0.8225
Precision(avg) : 0.7974

Recall(1)      : 0.7793
Recall(0)      : 0.8165
Recall(avg)    : 0.7979

F1(1)          : 0.7758
F1(0)          : 0.8195
F1(avg)        : 0.7976


Evaluación sobre test
Loss           : 0.6611
Accuracy       : 0.7381
Precision(1)   : 0.6600
Precision(1)   : 0.8076
Precision(avg) : 0.7338

Recall(1)      : 0.7530
Recall(0)      : 0.7277
Recall(avg)    : 0.7403

F1(1)          : 0.7035
F1(0)          : 0.7655
F1(avg)        : 0.7345
