# LSTM con embeddings + double translated

Modelo básico con los embeddings de fasttext

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
# Read the Spanish train and dev splits (tab-separated), indexed by the "id" column.
df_train = pd.read_table("../../../data/es/train_es.tsv", index_col="id")
df_dev = pd.read_table("../../../data/es/dev_es.tsv", index_col="id")

# Inputs come from the "text" column, targets from the "HS" column.
text_train = df_train["text"]
y_train = df_train["HS"]

text_dev = df_dev["text"]
y_dev = df_dev["HS"]

In [2]:
# Read the two synthetic training sets (".synth.en" and ".synth.fr" files),
# indexed by the "id" column like the original splits.
train_synth_en = pd.read_table("../../../data/es/train_es.synth.en.tsv", index_col="id")
train_synth_fr = pd.read_table("../../../data/es/train_es.synth.fr.tsv", index_col="id")

# Same text/target split as for the original data.
text_train_synth_en = train_synth_en["text"]
y_train_synth_en = train_synth_en["HS"]

text_train_synth_fr = train_synth_fr["text"]
y_train_synth_fr = train_synth_fr["HS"]

Tengo que hacer dos cosas:

- Primero, convertir los tweets a secuencias de índices de palabras (enteros)
- Luego, rellenar (*padding*) las secuencias hasta una longitud fija (Keras necesita que todas tengan el mismo largo para poder procesarlas en lotes)

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap given to the tokenizer.
num_words = 200000
# Target length passed to pad_sequences for every sequence.
max_length = 30

# The vocabulary is fitted on the original training texts only; the dev and
# synthetic texts are encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(text_train)

# Encode each text collection as integer sequences and pad them in one step.
X_train = pad_sequences(tokenizer.texts_to_sequences(text_train), maxlen=max_length)
X_dev = pad_sequences(tokenizer.texts_to_sequences(text_dev), maxlen=max_length)
X_train_synth_en = pad_sequences(
    tokenizer.texts_to_sequences(text_train_synth_en), maxlen=max_length
)
X_train_synth_fr = pad_sequences(
    tokenizer.texts_to_sequences(text_train_synth_fr), maxlen=max_length
)

Using TensorFlow backend.


Carguemos embeddings

In [4]:
import os
from glob import glob

# Directory (under the user's home) where the pretrained word vectors live.
path_to_embeddings = os.path.expanduser("~/WordVectors/")

# Show every .vec file found in that directory.
available_vec_files = glob(os.path.join(path_to_embeddings, "*.vec"))
print("Available embeddings: ", available_vec_files)

Available embeddings:  ['/home/jmperez/WordVectors/UBA_w5_200.vec', '/home/jmperez/WordVectors/wiki.es.vec', '/home/jmperez/WordVectors/UBA_w5_300.vec']


# Twitter Embeddings

In [5]:
import numpy as np

# Map each word to its embedding vector, read from a text-format .vec file
# (one entry per line: the word followed by its whitespace-separated components).
word_to_vec = {}

# Read as UTF-8 explicitly so parsing does not depend on the machine's locale.
with open(os.path.join(path_to_embeddings, "UBA_w5_300.vec"), encoding="utf-8") as f:
    for line_number, line in enumerate(f):
        values = line.split()
        if not values:
            # Blank line: nothing to parse (values[0] would raise IndexError).
            continue
        if line_number == 0 and len(values) == 2 and all(v.isdigit() for v in values):
            # fastText-style "<vocab_size> <dimension>" header, not a word entry.
            # Storing it would create a bogus length-1 vector that numpy would
            # silently broadcast across a whole embedding row.
            continue
        word = values[0]
        try:
            vec = np.asarray(values[1:], dtype="float32")
        except ValueError:
            # Only a float-conversion failure is expected here: it happens when
            # the "word" itself contains whitespace (e.g. ". ..."), so split()
            # leaves a non-numeric token among the components. Catching just
            # ValueError keeps KeyboardInterrupt and real bugs visible.
            print(("*" * 80  + "\n")*3)
            print("Problema con la sig línea:")
            print(values[:10])
            # Fallback: treat the second token as the word and the rest as the vector.
            word = values[1]
            vec = np.asarray(values[2:], dtype="float32")
        word_to_vec[word] = vec

# Embedding dimensionality, taken from a word expected to be in the vocabulary.
embedding_size = len(word_to_vec["hola"])

********************************************************************************
********************************************************************************
********************************************************************************

Problema con la sig línea:
['.', '.', '-0.22232', '0.0052569', '0.47066', '0.13836', '0.15991', '0.19504', '0.00067885', '0.020299']
********************************************************************************
********************************************************************************
********************************************************************************

Problema con la sig línea:
['.', '...', '-0.11666', '-0.083768', '0.028919', '0.29973', '0.21017', '0.27808', '0.063251', '0.090223']
********************************************************************************
********************************************************************************
********************************************************************************

P

In [6]:
# Build the weight matrix for the Embedding layer: row i holds the pretrained
# vector of the word the tokenizer indexed as i. Words without a pretrained
# vector keep an all-zero row.
embedding_matrix = np.zeros((num_words, embedding_size))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        # word_index is not truncated to num_words, so an index can exceed the
        # matrix height; writing such a row would raise IndexError.
        continue
    embedding_vector = word_to_vec.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [7]:
# Sanity check: display the dimensions of the embedding weight matrix.
embedding_matrix.shape

(200000, 300)

# Sin Synths


In [8]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Dropout, Conv1D, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence


# Baseline classifier trained on the original tweets only:
# frozen pretrained embeddings -> LSTM -> dense head with one sigmoid output.
# NOTE(review): the second Dropout rate here is 0.25, while the synth-augmented
# models further down use 0.5 -- confirm this is intentional, since it makes the
# runs differ in more than their training data.
model = Sequential([
    Embedding(
        num_words,
        embedding_size,
        input_length=max_length,
        weights=[embedding_matrix],
        trainable=False,
    ),
    LSTM(100, return_sequences=False),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.25),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# The dev split is used as validation data during training.
model.fit(
    X_train,
    y_train,
    validation_data=(X_dev, y_dev),
    epochs=10,
    batch_size=32,
)


Train on 4469 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0b545207b8>

In [9]:
from hate.utils import print_evaluation

# Print the baseline model's metrics on the dev split (labels passed as a numpy array).
print_evaluation(model, X_dev, y_dev.values)

Loss        : 0.5496
Accuracy    : 0.7780
Precision   : 0.7303
Recall      : 0.7928
F1          : 0.7603


## Usando los synths generados de entrenamiento (sólo inglés)

In [10]:

# Training inputs: the original sequences followed by the English synthetic ones.
X_tr = np.concatenate([X_train, X_train_synth_en], axis=0)
# Targets in the same order, shaped as a single column.
y_tr = np.concatenate([y_train.values, y_train_synth_en.values]).reshape(-1, 1)

In [11]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Dropout, Conv1D, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence


# Classifier trained on the augmented set (X_tr / y_tr):
# frozen pretrained embeddings -> LSTM -> dense head with one sigmoid output.
model = Sequential([
    Embedding(
        num_words,
        embedding_size,
        input_length=max_length,
        weights=[embedding_matrix],
        trainable=False,
    ),
    LSTM(100),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Validation still uses the untouched dev split.
model.fit(
    X_tr,
    y_tr,
    validation_data=(X_dev, y_dev),
    epochs=10,
    batch_size=32,
)


Train on 5960 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0ba8cf5828>

In [12]:
from hate.utils import print_evaluation

# Print the metrics of the model trained with the English synthetic data, on the dev split.
print_evaluation(model, X_dev, y_dev.values)

Loss        : 0.6460
Accuracy    : 0.7680
Precision   : 0.7624
Recall      : 0.6937
F1          : 0.7264


## Usando Inglés + Francés

In [13]:

# Training inputs: original sequences, then the English and French synthetic ones.
X_tr = np.concatenate([X_train, X_train_synth_en, X_train_synth_fr], axis=0)
# Targets in the same order, shaped as a single column.
y_tr = np.concatenate([
    y_train.values,
    y_train_synth_en.values,
    y_train_synth_fr.values,
]).reshape(-1, 1)

In [14]:

# Classifier trained on the augmented set (X_tr / y_tr):
# frozen pretrained embeddings -> LSTM -> dense head with one sigmoid output.
model = Sequential([
    Embedding(
        num_words,
        embedding_size,
        input_length=max_length,
        weights=[embedding_matrix],
        trainable=False,
    ),
    LSTM(100),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Validation still uses the untouched dev split.
model.fit(
    X_tr,
    y_tr,
    validation_data=(X_dev, y_dev),
    epochs=10,
    batch_size=32,
)


Train on 7137 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0ba744a630>

In [15]:
from hate.utils import print_evaluation

# Print the metrics of the model trained with English + French synthetic data, on the dev split.
print_evaluation(model, X_dev, y_dev.values)

Loss        : 0.6197
Accuracy    : 0.7460
Precision   : 0.6971
Recall      : 0.7568
F1          : 0.7257
