In [1]:
import pandas as pd
import csv
import numpy as np
import tensorflow as tf
import random
import torch

# Single seed shared by every random number generator used in this notebook.
SEED = 2019

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
tf.random.set_random_seed(SEED)  # TF 1.x spelling of the graph-level seed setter

# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Train and dev files carry a header; their "id" column becomes the index.
# QUOTE_NONE is passed so quote characters inside tweets are not treated as quoting.
df_train = pd.read_table(
    "../../../data/es/train_es.tsv",
    index_col="id",
    quoting=csv.QUOTE_NONE,
)
df_dev = pd.read_table(
    "../../../data/es/dev_es.tsv",
    index_col="id",
    quoting=csv.QUOTE_NONE,
)
# The reference (test) file is read with header=None, so column names are given here.
df_test = pd.read_table(
    "../../../data/es/reference_es.tsv",
    header=None,
    names=["text", "HS", "TR", "AG"],
    quoting=csv.QUOTE_NONE,
)

# Tweet text and the HS label for each split.
text_train, text_dev, text_test = df_train["text"], df_dev["text"], df_test["text"]
y_train, y_dev, y_test = df_train["HS"], df_dev["HS"], df_test["HS"]

print("Instancias de entrenamiento: {}".format(df_train.shape[0]))
print("Instancias de desarrollo: {}".format(df_dev.shape[0]))
print("Instancias de test: {}".format(df_test.shape[0]))


Instancias de entrenamiento: 4500
Instancias de desarrollo: 500
Instancias de test: 1600


# Elmo + Embeddings

Probemos si usando también los embeddings de fastText obtenemos algo razonable...



In [2]:
import fastText
import os


# Load the fastText binary from a path relative to this notebook.
# NOTE: the name `model` is rebound to a Keras network in later cells, so anything
# calling model.get_word_vector has to run before those cells.
vectors_path = os.path.expanduser("../../../WordVectors/UBA_w3_300.bin")
model = fastText.load_model(vectors_path)

Tengo que hacer dos cosas:

- Primero, convertir los tweets a secuencias de tokens
- Luego, rellenar (padding) o truncar las secuencias a una longitud fija (Keras necesita esto para poder paralelizar el cálculo)

In [3]:
from nltk.tokenize import TweetTokenizer
from keras.preprocessing.sequence import pad_sequences


# Every tweet is clipped or padded to exactly this many tokens.
max_length = 40

# Tweet-aware tokenizer configured with case folding (preserve_case=False),
# elongation reduction (reduce_len) and @handle removal (strip_handles).
tokenizer = TweetTokenizer(
    strip_handles=True,
    reduce_len=True,
    preserve_case=False,
)

def preprocess_tweet(tweet):
    """Tokenize a tweet and force the result to exactly ``max_length`` tokens.

    Tweets with more tokens are truncated; shorter ones are right-padded with
    empty-string tokens.
    """
    clipped = tokenizer.tokenize(tweet)[:max_length]
    filler = [''] * (max_length - len(clipped))
    return clipped + filler


# Fixed-length token lists for every split, in the same row order as the frames.
tokens_train = list(map(preprocess_tweet, df_train["text"].values))
tokens_dev = list(map(preprocess_tweet, df_dev["text"].values))
tokens_test = list(map(preprocess_tweet, df_test["text"].values))

Using TensorFlow backend.


In [4]:
%%capture
from elmoformanylangs import Embedder

# ELMo embedder built from the `es` model directory; used below through e.sents2elmo.
elmo_model_dir = "../../../models/elmo/es/"
e = Embedder(elmo_model_dir)

Calculemos los embeddings de ELMo para cada partición (train, dev y test)

In [5]:
import numpy as np


# ELMo vectors for each split, computed in train -> dev -> test order.
X_train, X_dev, X_test = (
    np.array(e.sents2elmo(split_tokens))
    for split_tokens in (tokens_train, tokens_dev, tokens_test)
)

In [6]:
def get_embeddings(toks, ft_model=None):
    """Look up one word vector per token.

    Args:
        toks: iterable of token strings (padding tokens included).
        ft_model: object exposing ``get_word_vector(token)``. When omitted, the
            notebook-level ``model`` is used; it must still be the fastText
            model at call time, because later cells rebind ``model`` to a
            Keras network.

    Returns:
        A list with one vector per token, in input order.
    """
    if ft_model is None:
        # Fall back to the global for backward compatibility with existing calls.
        ft_model = model
    return [ft_model.get_word_vector(tok) for tok in toks]

# fastText vectors for each split, stacked into one array per split.
X_emb_train, X_emb_dev, X_emb_test = (
    np.array([get_embeddings(toks) for toks in split_tokens])
    for split_tokens in (tokens_train, tokens_dev, tokens_test)
)

X_emb_train.shape, X_emb_dev.shape, X_emb_test.shape

((4500, 40, 300), (500, 40, 300), (1600, 40, 300))

## Con Global Max Pooling

In [7]:
from keras.models import Model
from keras.layers import Dense, Embedding, Dropout, Conv1D, CuDNNGRU, Input, Concatenate, Bidirectional, GlobalMaxPooling1D
from keras.optimizers import Adam
from keras.preprocessing import sequence

# Not referenced anywhere else in this cell; kept because other code may read it.
embedding_dim = 1024

optimizer_args = {
    "lr": 0.00075,
    "decay": 0.005,
}

# Two per-token inputs, shaped after one training example of each feature set.
elmo_input = Input(shape=X_train[0].shape)
emb_input = Input(shape=X_emb_train[0].shape)

# Join both feature sets on the last axis, then Conv1D -> Dropout -> global max pool
# -> single sigmoid unit.
x = Concatenate()([elmo_input, emb_input])
x = Conv1D(filters=40, kernel_size=3, activation='relu')(x)
x = Dropout(0.60)(x)
x = GlobalMaxPooling1D()(x)
output = Dense(1, activation='sigmoid')(x)

# NOTE: this rebinds `model`, which until now held the fastText model loaded earlier;
# cells that call model.get_word_vector cannot be re-run after this point.
model = Model(inputs=[elmo_input, emb_input], outputs=[output])

model.compile(loss='binary_crossentropy',
              optimizer=Adam(**optimizer_args),
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 40, 1024)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40, 300)      0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 40, 1324)     0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 38, 40)       158920      concatenate_1[0][0]              
__________

In [8]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Keep only the weights with the best validation accuracy seen so far.
best_model_path = '/tmp/cnn_elmo+fasttext_model.h5'
checkpointer = ModelCheckpoint(
    best_model_path,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
)
# Stop once validation loss has gone 20 epochs without improving.
early_stopper = EarlyStopping(patience=20, monitor='val_loss')

train_inputs = [X_train, X_emb_train]
dev_data = ([X_dev, X_emb_dev], y_dev)

model.fit(
    train_inputs,
    y_train,
    batch_size=32,
    epochs=250,
    validation_data=dev_data,
    callbacks=[checkpointer, early_stopper],
)


Train on 4500 samples, validate on 500 samples
Epoch 1/250

Epoch 00001: val_acc improved from -inf to 0.72400, saving model to /tmp/cnn_model.h5
Epoch 2/250

Epoch 00002: val_acc improved from 0.72400 to 0.77600, saving model to /tmp/cnn_model.h5
Epoch 3/250

Epoch 00003: val_acc improved from 0.77600 to 0.80000, saving model to /tmp/cnn_model.h5
Epoch 4/250

Epoch 00004: val_acc improved from 0.80000 to 0.82200, saving model to /tmp/cnn_model.h5
Epoch 5/250

Epoch 00005: val_acc improved from 0.82200 to 0.82600, saving model to /tmp/cnn_model.h5
Epoch 6/250

Epoch 00006: val_acc did not improve from 0.82600
Epoch 7/250

Epoch 00007: val_acc did not improve from 0.82600
Epoch 8/250

Epoch 00008: val_acc did not improve from 0.82600
Epoch 9/250

Epoch 00009: val_acc did not improve from 0.82600
Epoch 10/250

Epoch 00010: val_acc did not improve from 0.82600
Epoch 11/250

Epoch 00011: val_acc did not improve from 0.82600
Epoch 12/250

Epoch 00012: val_acc did not improve from 0.82600
Ep

<keras.callbacks.History at 0x7efc47ea09b0>

In [1]:
from hate.utils import print_evaluation
# Needs `model`, `checkpointer` and the dev arrays from the cells above, so it
# cannot run on a fresh kernel by itself.
print("CNN + Elmo+Embeddings -- \n\n")
print("Evaluación sobre dev")

# Restore the checkpointed weights before scoring.
best_weights_path = checkpointer.filepath
model.load_weights(best_weights_path)

dev_inputs = [X_dev, X_emb_dev]
print_evaluation(model, dev_inputs, y_dev)


Using TensorFlow backend.


CNN + Elmo+Embeddings -- 


Evaluación sobre dev


NameError: name 'model' is not defined

## Sin ELMo (solo fastText)

In [10]:
from keras.models import Model
from keras.layers import Dense, Embedding, Dropout, Conv1D, CuDNNGRU, Input, Concatenate, Bidirectional, GlobalMaxPooling1D
from keras.optimizers import Adam
from keras.preprocessing import sequence

# Not referenced anywhere else in this cell; kept because other code may read it.
embedding_dim = 1024

optimizer_args = {
    "lr": 0.0015,
    "decay": 0.005,
}


# Single per-token input, shaped after one training example of the fastText features.
emb_input = Input(shape=X_emb_train[0].shape)

# Conv1D -> Dropout -> global max pool -> single sigmoid unit.
x = emb_input
x = Conv1D(filters=40, kernel_size=3, activation='relu')(x)
x = Dropout(0.60)(x)
x = GlobalMaxPooling1D()(x)
output = Dense(1, activation='sigmoid')(x)

# NOTE: this rebinds `model` again; whatever it referred to before is no longer
# reachable through that name.
model = Model(inputs=emb_input, outputs=[output])

model.compile(loss='binary_crossentropy',
              optimizer=Adam(**optimizer_args),
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 40, 300)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 38, 40)            36040     
_________________________________________________________________
dropout_2 (Dropout)          (None, 38, 40)            0         
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 40)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 41        
Total params: 36,081
Trainable params: 36,081
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Keep only the weights with the best validation accuracy seen so far.
best_model_path = '/tmp/cnn_only_fasttext_model.h5'
checkpointer = ModelCheckpoint(
    best_model_path,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
)
# Stop once validation loss has gone 20 epochs without improving.
early_stopper = EarlyStopping(patience=20, monitor='val_loss')

dev_data = (X_emb_dev, y_dev)

model.fit(
    X_emb_train,
    y_train,
    batch_size=32,
    epochs=250,
    validation_data=dev_data,
    callbacks=[checkpointer, early_stopper],
)


Train on 4500 samples, validate on 500 samples
Epoch 1/250

Epoch 00001: val_acc improved from -inf to 0.75400, saving model to /tmp/cnn_model.h5
Epoch 2/250

Epoch 00002: val_acc did not improve from 0.75400
Epoch 3/250

Epoch 00003: val_acc improved from 0.75400 to 0.76400, saving model to /tmp/cnn_model.h5
Epoch 4/250

Epoch 00004: val_acc improved from 0.76400 to 0.76600, saving model to /tmp/cnn_model.h5
Epoch 5/250

Epoch 00005: val_acc improved from 0.76600 to 0.78200, saving model to /tmp/cnn_model.h5
Epoch 6/250

Epoch 00006: val_acc did not improve from 0.78200
Epoch 7/250

Epoch 00007: val_acc improved from 0.78200 to 0.78800, saving model to /tmp/cnn_model.h5
Epoch 8/250

Epoch 00008: val_acc did not improve from 0.78800
Epoch 9/250

Epoch 00009: val_acc did not improve from 0.78800
Epoch 10/250

Epoch 00010: val_acc did not improve from 0.78800
Epoch 11/250

Epoch 00011: val_acc did not improve from 0.78800
Epoch 12/250

Epoch 00012: val_acc did not improve from 0.78800
Ep

<keras.callbacks.History at 0x7efc6c566550>

In [2]:
from hate.utils import print_evaluation
# Needs `model`, `checkpointer` and the dev arrays from the cells above, so it
# cannot run on a fresh kernel by itself.
print("CNN + Embeddings -- \n\n")
print("Evaluación sobre dev")

# Restore the checkpointed weights before scoring.
best_weights_path = checkpointer.filepath
model.load_weights(best_weights_path)

dev_inputs = X_emb_dev
print_evaluation(model, dev_inputs, y_dev)


CNN + Embeddings -- 


Evaluación sobre dev


NameError: name 'model' is not defined