# Elmo + Embeddings

Probemos si usando también los embeddings de fastText obtenemos algo razonable...



In [1]:
import pandas as pd
import csv
import numpy as np
import tensorflow as tf
import random
import torch

# One seed shared by every random number generator the notebook touches,
# so it only has to be changed in a single place.
SEED = 2019

torch.manual_seed(SEED)
np.random.seed(SEED)
tf.random.set_random_seed(SEED)
random.seed(SEED)

# Make cuDNN (as used by torch) deterministic and turn off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Directory holding the Spanish train / dev / reference TSV files.
DATA_DIR = "../../../data/es"

# QUOTE_NONE: quote characters inside the tweets are read as literal text.
df_dev = pd.read_table(DATA_DIR + "/dev_es.tsv", index_col="id", quoting=csv.QUOTE_NONE)
df_train = pd.read_table(DATA_DIR + "/train_es.tsv", index_col="id", quoting=csv.QUOTE_NONE)
# The reference file is read without a header row, so the column names are
# supplied explicitly.
df_test = pd.read_table(DATA_DIR + "/reference_es.tsv", header=None,
                        names=["text", "HS", "TR", "AG"], quoting=csv.QUOTE_NONE)


# Raw tweet text and the "HS" column, which is used as the label further down.
text_train, y_train = df_train["text"], df_train["HS"]
text_dev, y_dev = df_dev["text"], df_dev["HS"]
text_test, y_test = df_test["text"], df_test["HS"]

print("Instancias de entrenamiento: {}".format(len(df_train)))
print("Instancias de desarrollo: {}".format(len(df_dev)))
print("Instancias de test: {}".format(len(df_test)))


Instancias de entrenamiento: 4500
Instancias de desarrollo: 500
Instancias de test: 1600


In [2]:
import fastText
import os


# Load the fastText binary model stored under the user's home directory.
# NOTE(review): the name `model` is rebound to a Keras model in later cells,
# so everything that needs the fastText vectors has to run before those cells.
model = fastText.load_model(os.path.expanduser("~/WordVectors/UBA_w5_300.bin"))

In [3]:
# Shape of a single fastText word vector, obtained by looking up an arbitrary word.
embedding_shape = model.get_word_vector("pepe").shape


In [4]:
from scipy.spatial.distance import cosine as cosine_distance

def cos_similarity(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    Computed as one minus the cosine distance given by
    ``scipy.spatial.distance.cosine``.
    """
    distance = cosine_distance(v1, v2)
    return 1 - distance

# Quick check of the loaded vectors: similarity between two similarly spelled words.
cos_similarity(model.get_word_vector("angau"), model.get_word_vector("anga"))

0.6086544990539551

Tengo que hacer dos cosas:

- Primero, convertir los tweets a secuencias de tokens
- Luego, rellenar (padding) las secuencias hasta una longitud fija (Keras necesita esto para poder paralelizar el cálculo)

In [5]:
from nltk.tokenize import TweetTokenizer
from keras.preprocessing.sequence import pad_sequences


# Every tweet is truncated or padded to exactly this many tokens.
# NOTE(review): the saved outputs further down report a sequence length of 30
# (e.g. shapes like (4500, 30, 300)), so they were apparently produced with a
# different value than the 40 set here — confirm and re-run.
max_length = 40

# Tokenizer configured (per the flags below) to lowercase, shorten elongated
# character runs and strip @handles.
tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

def preprocess_tweet(tweet):
    """Tokenize ``tweet`` and force the result to exactly ``max_length`` tokens.

    Longer token lists are cut off at ``max_length``; shorter ones are
    right-padded with empty strings.
    """
    clipped = tokenizer.tokenize(tweet)[:max_length]
    missing = max_length - len(clipped)
    return clipped + [''] * missing


# Tokenize and pad every split; each result is a list of fixed-length token lists.
tokens_train, tokens_dev, tokens_test = (
    [preprocess_tweet(tweet) for tweet in frame["text"].values]
    for frame in (df_train, df_dev, df_test)
)

Using TensorFlow backend.


In [6]:
%%capture
from elmoformanylangs import Embedder

# Build the ELMo embedder from the model files in the given directory.
e = Embedder("../../../models/elmo/es/")

Carguemos embeddings

In [9]:
# Interactive help lookup for the signature of sents2elmo (IPython `?` syntax).
# NOTE(review): leftover from exploration; consider removing it from the final notebook.
e.sents2elmo?

[0;31mSignature:[0m [0me[0m[0;34m.[0m[0msents2elmo[0m[0;34m([0m[0msents[0m[0;34m,[0m [0moutput_layer[0m[0;34m=[0m[0;34m-[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      ~/.pyenv/versions/3.6.5/envs/hateval/lib/python3.6/site-packages/elmoformanylangs/elmo.py
[0;31mType:[0m      method


In [10]:
import numpy as np


# ELMo vectors for each split, computed in train / dev / test order.
# The per-tweet results stack into a single array because every token list
# was padded to the same length.
X_train, X_dev, X_test = (
    np.array(e.sents2elmo(token_lists))
    for token_lists in (tokens_train, tokens_dev, tokens_test)
)

In [11]:
def get_embeddings(toks, embedder=None):
    """Look up one word vector per token.

    Args:
        toks: iterable of token strings.
        embedder: object exposing ``get_word_vector(token)``. When omitted,
            the notebook-level ``model`` is used, which keeps existing
            one-argument calls working.

    Returns:
        A list with one vector per token, in the same order as ``toks``.
    """
    if embedder is None:
        # Resolved at call time. Later cells rebind ``model`` to a Keras
        # network, so pass the fastText model explicitly when calling this
        # after those cells have run.
        embedder = model
    return [embedder.get_word_vector(tok) for tok in toks]

# fastText vectors for every split: one vector per (padded) token of each tweet.
X_emb_train = np.array(list(map(get_embeddings, tokens_train)))
X_emb_dev = np.array(list(map(get_embeddings, tokens_dev)))
X_emb_test = np.array(list(map(get_embeddings, tokens_test)))

# Last expression of the cell, shown as its output.
X_emb_train.shape, X_emb_dev.shape, X_emb_test.shape

((4500, 30, 300), (500, 30, 300), (1600, 30, 300))

In [22]:
from keras.models import Model
from keras.layers import Dense, Embedding, Dropout, CuDNNLSTM, CuDNNGRU, Input, Concatenate, Bidirectional
from keras.optimizers import Adam
from keras.preprocessing import sequence

# NOTE(review): `embedding_dim` is not referenced anywhere else in this cell.
embedding_dim = 1024

# Keyword arguments forwarded to the Adam optimizer at compile time.
optimizer_args = {
    "lr": 0.0005,
    "decay": 0.01,
}

# Two inputs per tweet: the ELMo sequence and the fastText sequence.
# Each Input takes the shape of a single training example.
elmo_input = Input(shape=X_train[0].shape)
emb_input = Input(shape=X_emb_train[0].shape)

# Join both representations per token (1024 + 300 = 1324 features in the
# recorded summary), then reduce the sequence with a bidirectional LSTM.
x = Concatenate()([elmo_input, emb_input])
x = Bidirectional(CuDNNLSTM(256))(x)
x = Dropout(0.80)(x)
# NOTE(review): this Dense layer is given no activation argument — confirm
# that a purely linear layer is intended here.
x = Dense(128)(x)
x = Dropout(0.55)(x)
# Single sigmoid unit, paired with the binary cross-entropy loss below.
output = Dense(1, activation='sigmoid')(x)

# NOTE(review): this rebinds `model`, which earlier held the fastText model;
# cells that call model.get_word_vector will fail if re-run after this one.
model = Model(inputs=[elmo_input, emb_input], outputs=[output])

model.compile(loss='binary_crossentropy', 
              optimizer=Adam(**optimizer_args), 
              metrics=['accuracy'])
print(model.summary())

# Train on the training split; the dev split is only used for validation metrics.
model.fit([X_train, X_emb_train], y_train, 
          validation_data=([X_dev, X_emb_dev], y_dev), epochs=30, batch_size=32)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, 30, 1024)     0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           (None, 30, 300)      0                                            
__________________________________________________________________________________________________
concatenate_7 (Concatenate)     (None, 30, 1324)     0           input_13[0][0]                   
                                                                 input_14[0][0]                   
__________________________________________________________________________________________________
bidirectional_7 (Bidirectional) (None, 512)          3239936     concatenate_7[0][0]              
__________

<keras.callbacks.History at 0x7fc72f80c240>

In [23]:
from hate.utils import print_evaluation
# Report metrics for the model trained above, first on dev and then on test.
print("biLSTM - Elmo+Embeddings -- \n\n")
for header, features, labels in (
    ("Evaluación sobre dev", [X_dev, X_emb_dev], y_dev),
    ("\n\nEvaluación sobre test", [X_test, X_emb_test], y_test),
):
    print(header)
    print_evaluation(model, features, labels)



biLSTM - Elmo+Embeddings -- 


Evaluación sobre dev
Loss           : 0.4560
Accuracy       : 0.8220
Precision(1)   : 0.8122
Precision(1)   : 0.8293
Precision(avg) : 0.8207

Recall(1)      : 0.7793
Recall(0)      : 0.8561
Recall(avg)    : 0.8177

F1(1)          : 0.7954
F1(0)          : 0.8425
F1(avg)        : 0.8189


Evaluación sobre test
Loss           : 0.6082
Accuracy       : 0.7400
Precision(1)   : 0.6699
Precision(1)   : 0.7971
Precision(avg) : 0.7335

Recall(1)      : 0.7288
Recall(0)      : 0.7479
Recall(avg)    : 0.7383

F1(1)          : 0.6981
F1(0)          : 0.7717
F1(avg)        : 0.7349


# Bidirectional GRU sin densa

In [40]:
# NOTE(review): `embedding_dim` is not referenced anywhere else in this cell.
embedding_dim = 1024

# Keyword arguments forwarded to the Adam optimizer at compile time.
optimizer_args = {
    "lr": 0.0005,
    "decay": 0.01,
}

# Two inputs per tweet: the ELMo sequence and the fastText sequence.
# Each Input takes the shape of a single training example.
elmo_input = Input(shape=X_train[0].shape)
emb_input = Input(shape=X_emb_train[0].shape)

# Join both representations per token, then reduce the sequence with a
# bidirectional GRU. The GRU output goes through dropout straight into the
# sigmoid output unit, with no intermediate Dense layer.
x = Concatenate()([elmo_input, emb_input])
x = Bidirectional(CuDNNGRU(256))(x)
x = Dropout(0.65)(x)
output = Dense(1, activation='sigmoid')(x)

# NOTE(review): this rebinds `model` again, replacing whatever it held before
# (the fastText model originally, then the previous Keras model).
model = Model(inputs=[elmo_input, emb_input], outputs=[output])

model.compile(loss='binary_crossentropy', 
              optimizer=Adam(**optimizer_args), 
              metrics=['accuracy'])
print(model.summary())

# Train on the training split; the dev split is only used for validation metrics.
model.fit([X_train, X_emb_train], y_train, 
          validation_data=([X_dev, X_emb_dev], y_dev), epochs=12, batch_size=32)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_39 (InputLayer)           (None, 30, 1024)     0                                            
__________________________________________________________________________________________________
input_40 (InputLayer)           (None, 30, 300)      0                                            
__________________________________________________________________________________________________
concatenate_20 (Concatenate)    (None, 30, 1324)     0           input_39[0][0]                   
                                                                 input_40[0][0]                   
__________________________________________________________________________________________________
bidirectional_23 (Bidirectional (None, 512)          2429952     concatenate_20[0][0]             
__________

<keras.callbacks.History at 0x7fc3d38f9eb8>

In [41]:
from hate.utils import print_evaluation
# The header names the model actually being evaluated: this section builds a
# bidirectional GRU, so the label must not say "biLSTM".
print("biGRU - Elmo+Embeddings -- \n\n")
print("Evaluación sobre dev")
print_evaluation(model, [X_dev, X_emb_dev], y_dev)
print("\n\nEvaluación sobre test")
print_evaluation(model, [X_test, X_emb_test], y_test)



biLSTM - Elmo+Embeddings -- 


Evaluación sobre dev
Loss           : 0.4459
Accuracy       : 0.8000
Precision(1)   : 0.7933
Precision(1)   : 0.8048
Precision(avg) : 0.7990

Recall(1)      : 0.7432
Recall(0)      : 0.8453
Recall(avg)    : 0.7943

F1(1)          : 0.7674
F1(0)          : 0.8246
F1(avg)        : 0.7960


Evaluación sobre test
Loss           : 0.5869
Accuracy       : 0.7481
Precision(1)   : 0.6962
Precision(1)   : 0.7841
Precision(avg) : 0.7402

Recall(1)      : 0.6909
Recall(0)      : 0.7883
Recall(avg)    : 0.7396

F1(1)          : 0.6935
F1(0)          : 0.7862
F1(avg)        : 0.7399
