# GRU 

Usamos los embeddings UBA (`UBA_w5_300.vec`) como entrada de los modelos GRU.

In [1]:
import pandas as pd
import csv
import numpy as np
import tensorflow as tf
import random

# Seed every random number generator in play (NumPy, TensorFlow, stdlib)
# with the same value, in that order.
for seed_rng in (np.random.seed, tf.random.set_random_seed, random.seed):
    seed_rng(2019)

# csv.QUOTE_NONE makes the parser treat quote characters as ordinary text.
tsv_options = {"quoting": csv.QUOTE_NONE}

df_train = pd.read_table("../../../data/es/train_es.tsv", index_col="id", **tsv_options)
df_dev = pd.read_table("../../../data/es/dev_es.tsv", index_col="id", **tsv_options)
# The reference (test) file is read without a header row, so the column
# names are supplied explicitly.
df_test = pd.read_table(
    "../../../data/es/reference_es.tsv",
    header=None,
    names=["text", "HS", "TR", "AG"],
    **tsv_options
)

# Inputs are the "text" column, targets are the "HS" column of each split.
text_train, text_dev, text_test = df_train["text"], df_dev["text"], df_test["text"]
y_train, y_dev, y_test = df_train["HS"], df_dev["HS"], df_test["HS"]

print("Instancias de entrenamiento: {}".format(len(df_train)))
print("Instancias de desarrollo: {}".format(len(df_dev)))


Instancias de entrenamiento: 4500
Instancias de desarrollo: 500


Tengo que hacer dos cosas:

- Primero, convertir los tweets a secuencias de índices de palabras (enteros)
- Luego, rellenar (padding) o truncar las secuencias a una longitud fija (Keras necesita entradas del mismo largo para poder procesarlas en lotes)

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Tokenizer vocabulary cap and fixed sequence length used for every split.
num_words = 200000
max_length = 30

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(text_train)

# Turn each split into integer sequences and pad them to max_length
# in a single pass.
X_train, X_dev, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_length)
    for texts in (text_train, text_dev, text_test)
)

Using TensorFlow backend.


Carguemos embeddings

In [3]:
import os
from glob import glob

# Directory that holds the pretrained word-vector files.
path_to_embeddings = os.path.expanduser("/home/jmperez/WordVectors/")

# Show which .vec files are present in that directory.
vec_pattern = os.path.join(path_to_embeddings, "*.vec")
print("Available embeddings: ", glob(vec_pattern))

Available embeddings:  ['/home/jmperez/WordVectors/UBA_w5_200.vec', '/home/jmperez/WordVectors/wiki.es.vec', '/home/jmperez/WordVectors/UBA_w5_300.vec']


# Twitter Embeddings

In [4]:
import numpy as np

# Maps each token to its pretrained embedding vector (float32 NumPy array),
# read from the text-format UBA_w5_300.vec file: one token per line followed
# by its vector components, separated by whitespace.
word_to_vec = {}

# Read as UTF-8 explicitly so that parsing does not depend on the machine's
# locale default encoding.
with open(os.path.join(path_to_embeddings, "UBA_w5_300.vec"), encoding="utf-8") as f:
    for line_number, line in enumerate(f):
        values = line.split()
        if not values:
            # Blank line: nothing to parse (values[0] would raise IndexError).
            continue
        if line_number == 0 and len(values) == 2 and all(v.isdigit() for v in values):
            # fastText-style header ("<vocab size> <dimension>"): not a word
            # vector, so do not store it as a bogus 1-dimensional embedding.
            continue
        word = values[0]
        try:
            vec = np.asarray(values[1:], dtype="float32")
        except ValueError:
            # Only a failed float conversion is expected here; it happens when
            # the token itself contains whitespace, so the second field is not
            # a number. Anything else (e.g. KeyboardInterrupt) must propagate.
            print(("*" * 80  + "\n")*3)
            print("Problema con la sig línea:")
            print(values[:10])
            # Fallback kept from the original: use the second field as the
            # token and the remaining fields as the vector.
            word = values[1]
            vec = np.asarray(values[2:], dtype="float32")
        word_to_vec[word] = vec

# Embedding dimensionality, taken from a word that is expected to be present.
embedding_size = len(word_to_vec["hola"])

********************************************************************************
********************************************************************************
********************************************************************************

Problema con la sig línea:
['.', '.', '-0.22232', '0.0052569', '0.47066', '0.13836', '0.15991', '0.19504', '0.00067885', '0.020299']
********************************************************************************
********************************************************************************
********************************************************************************

Problema con la sig línea:
['.', '...', '-0.11666', '-0.083768', '0.028919', '0.29973', '0.21017', '0.27808', '0.063251', '0.090223']
********************************************************************************
********************************************************************************
********************************************************************************

P

In [5]:
# Build the weight matrix for the Embedding layer: row i holds the pretrained
# vector of the word whose tokenizer index is i. Words without a pretrained
# vector keep an all-zero row.
embedding_matrix = np.zeros((num_words, embedding_size))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        # word_index lists every word seen while fitting, regardless of
        # num_words, so indices past the matrix size must be skipped instead
        # of raising IndexError.
        continue
    embedding_vector = word_to_vec.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [6]:
# Sanity check: the matrix was allocated as (num_words, embedding_size).
embedding_matrix.shape

(200000, 300)

In [7]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import CuDNNGRU, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.optimizers import Adam

# Unidirectional GRU binary classifier on top of the pretrained embeddings.
model = Sequential()
# The embedding layer is initialised from embedding_matrix and frozen
# (trainable=False), so only the layers after it are trained.
model.add(Embedding(num_words, embedding_size, input_length=max_length,
                    weights=[embedding_matrix], trainable=False))
model.add(CuDNNGRU(100))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
# Single sigmoid unit: the model outputs one probability per input.
model.add(Dense(1, activation='sigmoid'))

optimizer_args = {
    "lr": 0.001,
    "decay": 0.01
}

model.compile(loss='binary_crossentropy',
              optimizer=Adam(**optimizer_args),
              metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# additionally emitted a stray "None" line in the cell output.
model.summary()

model.fit(X_train, y_train, validation_data=(X_dev, y_dev), epochs=20, batch_size=32)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 300)           60000000  
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       (None, 100)               120600    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               12928     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 60,133,657
Trainable params: 133,657
Non-trainable params: 60,000,000
_________________________________________________________________
None
Train on 4500 samples, validate on 500 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 

<keras.callbacks.History at 0x7f473b39cd30>

In [8]:
from hate.utils import print_evaluation

# Report the metrics of the current model on the dev split, then on the
# test split, each preceded by its heading.
for heading, features, labels in (
    ("Evaluación sobre dev", X_dev, y_dev),
    ("\n\nEvaluación sobre test", X_test, y_test),
):
    print(heading)
    print_evaluation(model, features, labels)



Evaluación sobre dev
Loss           : 0.5410
Accuracy       : 0.7440
Precision(1)   : 0.6754
Precision(1)   : 0.8233
Precision(avg) : 0.7493

Recall(1)      : 0.8153
Recall(0)      : 0.6871
Recall(avg)    : 0.7512

F1(1)          : 0.7388
F1(0)          : 0.7490
F1(avg)        : 0.7439


Evaluación sobre test
Loss           : 0.6712
Accuracy       : 0.6687
Precision(1)   : 0.5721
Precision(1)   : 0.7937
Precision(avg) : 0.6829

Recall(1)      : 0.7818
Recall(0)      : 0.5894
Recall(avg)    : 0.6856

F1(1)          : 0.6607
F1(0)          : 0.6764
F1(avg)        : 0.6686


## Bidirectional GRU

In [9]:
from keras.layers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# NOTE(review): embedding_vector_length is not used by the model below, which
# takes its embedding dimension from embedding_size. It is kept only in case
# other cells read it; confirm and remove.
embedding_vector_length = 32

# Bidirectional GRU binary classifier on top of the pretrained embeddings.
model = Sequential()
# The embedding layer is initialised from embedding_matrix and frozen
# (trainable=False), so only the layers after it are trained.
model.add(Embedding(num_words, embedding_size, input_length=max_length,
                    weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(CuDNNGRU(100)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# Single sigmoid unit: the model outputs one probability per input.
model.add(Dense(1, activation='sigmoid'))

optimizer_args = {
    "lr": 0.001,
    "decay": 0.01
}

model.compile(loss='binary_crossentropy',
              optimizer=Adam(**optimizer_args),
              metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# would additionally emit a stray "None" line in the cell output.
model.summary()

model.fit(X_train, y_train, validation_data=(X_dev, y_dev), epochs=20, batch_size=32)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 300)           60000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               241200    
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               25728     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 60,267,057
Trainable params: 267,057
Non-trainable params: 60,000,000
__________________________________________________________

<keras.callbacks.History at 0x7f45d80faf60>

In [10]:
from hate.utils import print_evaluation

# Report the metrics of the current model on the dev split, then on the
# test split, each preceded by its heading.
evaluation_splits = [
    ("Evaluación sobre dev", X_dev, y_dev),
    ("\n\nEvaluación sobre test", X_test, y_test),
]
for heading, features, labels in evaluation_splits:
    print(heading)
    print_evaluation(model, features, labels)



Evaluación sobre dev
Loss           : 0.5180
Accuracy       : 0.7560
Precision(1)   : 0.7083
Precision(1)   : 0.8000
Precision(avg) : 0.7542

Recall(1)      : 0.7658
Recall(0)      : 0.7482
Recall(avg)    : 0.7570

F1(1)          : 0.7359
F1(0)          : 0.7732
F1(avg)        : 0.7546


Evaluación sobre test
Loss           : 0.6177
Accuracy       : 0.6906
Precision(1)   : 0.6002
Precision(1)   : 0.7864
Precision(avg) : 0.6933

Recall(1)      : 0.7485
Recall(0)      : 0.6500
Recall(avg)    : 0.6992

F1(1)          : 0.6662
F1(0)          : 0.7117
F1(avg)        : 0.6890
