# GRU + ELMo



In [1]:
import pandas as pd
# Read the train and dev splits, indexing both frames by their "id" column.
df_train, df_dev = (
    pd.read_table(path, index_col="id")
    for path in (
        "../../../data/es/train_es.tsv",
        "../../../data/es/dev_es.tsv",
    )
)

# Raw tweet text and the "HS" column (used as the training target below),
# extracted as arrays for each split.
text_train = df_train["text"].values
y_train = df_train["HS"].values
text_dev = df_dev["text"].values
y_dev = df_dev["HS"].values

Hay que hacer dos cosas:

- Primero, convertir los tweets a secuencias de tokens
- Luego, truncar o rellenar (padding) las secuencias a una longitud fija (Keras necesita esto para poder paralelizar el cálculo)

In [4]:
from embeddings.tokenizer import TweetTokenizer

# Every tweet is truncated or padded to exactly this many tokens.
max_length = 30

# Shared tokenizer instance used by preprocess_tweet.
# NOTE(review): judging by the flag names this lower-cases text, shortens
# elongated character runs and keeps @handles -- confirm in embeddings.tokenizer.
tokenizer = TweetTokenizer(
    preserve_case=False,
    reduce_len=True,
    strip_handles=False,
)

def preprocess_tweet(tweet):
    """Tokenize a tweet and force the result to exactly ``max_length`` tokens.

    The module-level ``tokenizer`` splits the tweet into tokens. Sequences
    shorter than ``max_length`` are right-padded with empty strings; longer
    ones are cut down to their first ``max_length`` tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list
        Token sequence of length ``max_length``.
    """
    tokens = tokenizer(tweet)

    # How many padding tokens are needed; zero or negative means "long enough".
    missing = max_length - len(tokens)
    if missing > 0:
        return tokens + [''] * missing
    return tokens[:max_length]


# Tokenize and pad/truncate every tweet of both splits; each entry becomes a
# list of max_length tokens.
text_train = list(map(preprocess_tweet, df_train["text"].values))
text_dev = list(map(preprocess_tweet, df_dev["text"].values))

In [5]:
from elmoformanylangs import Embedder

# Load the Spanish ELMo model (ELMoForManyLangs) from the local models directory.
e = Embedder("../../../models/elmo/es/")

2019-01-10 17:35:35,922 INFO: char embedding size: 2637
2019-01-10 17:35:36,919 INFO: word embedding size: 185214
2019-01-10 17:35:46,340 INFO: Model(
  (token_embedder): ConvTokenEmbedder(
    (word_emb_layer): EmbeddingLayer(
      (embedding): Embedding(185214, 100, padding_idx=3)
    )
    (char_emb_layer): EmbeddingLayer(
      (embedding): Embedding(2637, 50, padding_idx=2634)
    )
    (convolutions): ModuleList(
      (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
      (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
      (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
      (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
      (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
      (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
      (6): Conv1d(50, 1024, kernel_size=(7,), stride=(1,))
    )
    (highways): Highway(
      (_layers): ModuleList(
        (0): Linear(in_features=2048, out_features=4096, bias=True)
        (1): Linear(in_features=2048, out_fe

Calculemos los embeddings de ELMo para cada tweet

In [6]:
import numpy as np

# Sanity check: show the first tokenized (and padded) training tweet.
print(text_train[0])


# Embed every token sequence with ELMo, train split first and then dev, and
# stack the per-sentence results into one array per split.
# NOTE(review): assumes each sentence embedding has shape (max_length, 1024),
# matching the input shape declared for the GRU -- confirm.
X_train, X_dev = (
    np.array(e.sents2elmo(sentences))
    for sentences in (text_train, text_dev)
)

['easyjet', 'quiere', 'duplicar', 'el', 'número', 'de', 'mujeres', 'piloto', "'", 'verás', 'tú', 'para', 'aparcar', 'el', 'avión', '..', 'url', '', '', '', '', '', '', '', '', '', '', '', '', '']


2019-01-10 17:35:53,423 INFO: 70 batches, avg len: 32.0
2019-01-10 17:35:55,803 INFO: Finished 1000 sentences.
2019-01-10 17:35:57,817 INFO: Finished 2000 sentences.
2019-01-10 17:35:59,785 INFO: Finished 3000 sentences.
2019-01-10 17:36:01,656 INFO: Finished 4000 sentences.
2019-01-10 17:36:03,551 INFO: 8 batches, avg len: 32.0


In [8]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, Dropout, LSTM, Bidirectional
from keras.optimizers import Adam
from keras.preprocessing import sequence

# Size of each token vector fed to the network (the ELMo embedding width).
embedding_dim = 1024

# Architecture: bidirectional GRU encoder -> dropout -> dense ReLU layer ->
# dropout -> single sigmoid unit for the binary label.
model = Sequential()
# input_shape is given to the Bidirectional wrapper rather than the inner GRU.
# The wrapper is the first layer the Sequential model sees, so that is where
# Keras reads the input shape from (this is the form used in the Keras docs).
# With the shape on the inner layer only, the model stays unbuilt until fit(),
# so e.g. model.summary() cannot be called before training.
model.add(Bidirectional(GRU(256), input_shape=(max_length, embedding_dim)))
model.add(Dropout(0.75))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.50))
model.add(Dense(1, activation='sigmoid'))

# Adam hyper-parameters: initial learning rate and per-update learning-rate decay.
optimizer_args = {
    "lr": 0.0005,
    "decay": 0.01
}
model.compile(loss='binary_crossentropy',
              optimizer=Adam(**optimizer_args),
              metrics=['accuracy'])



In [10]:
# Train for 5 epochs in mini-batches of 32, reporting metrics on the dev split
# after every epoch. Left as the cell's last expression so the History object
# is displayed.
model.fit(
    X_train,
    y_train,
    validation_data=(X_dev, y_dev),
    epochs=5,
    batch_size=32,
)


Train on 4469 samples, validate on 500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f1b9853a898>

In [11]:
from hate.utils import print_evaluation

# Report loss, accuracy, precision, recall and F1 of the trained model on the dev split.
print_evaluation(model, X_dev, y_dev)

Loss        : 0.4475
Accuracy    : 0.7980
Precision   : 0.7713
Recall      : 0.7748
F1          : 0.7730
