# LSTM sin embeddings preentrenados

Modelo básico cuyos embeddings se entrenan desde cero en este mismo notebook (sin vectores preentrenados). Sirve como baseline :-)

In [1]:
import pandas as pd
import csv

# Read options shared by every split: QUOTE_NONE turns off quote handling,
# so quote characters inside a field are read verbatim.
tsv_options = dict(quoting=csv.QUOTE_NONE)

df_dev = pd.read_table("../../../data/es/dev_es.tsv", index_col="id", **tsv_options)
df_train = pd.read_table("../../../data/es/train_es.tsv", index_col="id", **tsv_options)
# The reference file is read with header=None, so its column names are
# supplied explicitly here.
df_test = pd.read_table(
    "../../../data/es/reference_es.tsv",
    header=None,
    names=["text", "HS", "TR", "AG"],
    **tsv_options
)

# Model inputs come from the "text" column; the target is the "HS" column.
text_train = df_train["text"]
y_train = df_train["HS"]

text_dev = df_dev["text"]
y_dev = df_dev["HS"]

text_test = df_test["text"]
y_test = df_test["HS"]


In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap handed to the tokenizer, and the fixed length every
# sequence is padded/truncated to.
num_words = 100000
max_length = 30

# The vocabulary is fitted on the training texts only; dev and test are
# merely converted with it.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(text_train)

# Convert each split to integer sequences and pad them to max_length.
X_train, X_dev, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split_texts), max_length)
    for split_texts in (text_train, text_dev, text_test)
)

Using TensorFlow backend.


In [4]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# Dimensionality of the embedding vectors learned by the Embedding layer.
embedding_vector_length = 32

# Baseline classifier: embeddings learned from scratch -> LSTM -> dense head
# ending in a single sigmoid unit, trained with binary cross-entropy.
model = Sequential()
model.add(Embedding(num_words, embedding_vector_length, input_length=max_length))
model.add(LSTM(100))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

# The dev split is passed as validation data, so its loss/accuracy are
# reported after each epoch.
model.fit(X_train, y_train, validation_data=(X_dev, y_dev), epochs=3, batch_size=32)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 32)            3200000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               12928     
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 3,266,257
Trainable params: 3,266,257
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x7fd64c123898>

In [6]:
from hate.utils import print_evaluation

# Report metrics for the current `model` on the dev split first, then on
# the test split, each under its own heading.
for heading, features, labels in (
    ("Evaluación sobre dev", X_dev, y_dev),
    ("\n\nEvaluación sobre test", X_test, y_test),
):
    print(heading)
    print_evaluation(model, features, labels)



Evaluación sobre dev
Loss           : 0.8102
Accuracy       : 0.7140
Precision(1)   : 0.6586
Precision(1)   : 0.7689
Precision(avg) : 0.7138

Recall(1)      : 0.7387
Recall(0)      : 0.6942
Recall(avg)    : 0.7165

F1(1)          : 0.6964
F1(0)          : 0.7297
F1(avg)        : 0.7130


Evaluación sobre test
Loss           : 0.9839
Accuracy       : 0.6650
Precision(1)   : 0.5769
Precision(1)   : 0.7544
Precision(avg) : 0.6657

Recall(1)      : 0.7045
Recall(0)      : 0.6372
Recall(avg)    : 0.6709

F1(1)          : 0.6344
F1(0)          : 0.6909
F1(avg)        : 0.6626


## Bidirectional LSTM

In [9]:
from keras.layers import Bidirectional, CuDNNLSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# Dimensionality of the embedding vectors learned by the Embedding layer.
embedding_vector_length = 32

# Same architecture as the plain LSTM baseline, but the recurrent layer is
# wrapped in Bidirectional. NOTE: per the Keras documentation, CuDNNLSTM
# only runs on a GPU with the TensorFlow backend.
# This rebinds `model`, replacing the model built in the earlier cell.
model = Sequential()
model.add(Embedding(num_words, embedding_vector_length, input_length=max_length))
model.add(Bidirectional(CuDNNLSTM(100)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

# The dev split is passed as validation data, so its loss/accuracy are
# reported after each epoch.
model.fit(X_train, y_train, validation_data=(X_dev, y_dev), epochs=5, batch_size=32)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 30, 32)            3200000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 200)               107200    
_________________________________________________________________
dropout_9 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 128)               25728     
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 129       
Total params: 3,333,057
Trainable params: 3,333,057
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x7fd588212ef0>

In [10]:
from hate.utils import print_evaluation

# Report metrics for the current `model` on the dev split first, then on
# the test split, each under its own heading.
for heading, features, labels in (
    ("Evaluación sobre dev", X_dev, y_dev),
    ("\n\nEvaluación sobre test", X_test, y_test),
):
    print(heading)
    print_evaluation(model, features, labels)



Evaluación sobre dev
Loss           : 1.1604
Accuracy       : 0.7120
Precision(1)   : 0.7120
Precision(1)   : 0.7120
Precision(avg) : 0.7120

Recall(1)      : 0.5901
Recall(0)      : 0.8094
Recall(avg)    : 0.6997

F1(1)          : 0.6453
F1(0)          : 0.7576
F1(avg)        : 0.7014


Evaluación sobre test
Loss           : 1.3687
Accuracy       : 0.6881
Precision(1)   : 0.6344
Precision(1)   : 0.7203
Precision(avg) : 0.6773

Recall(1)      : 0.5758
Recall(0)      : 0.7670
Recall(avg)    : 0.6714

F1(1)          : 0.6037
F1(0)          : 0.7429
F1(avg)        : 0.6733
