# LSTM con embeddings

Modelo básico con los embeddings de fasttext

In [1]:
# List the files available in the dataset directory.
# NOTE(review): the loading cell reads from "../../../data/dev_es/" while this
# listing uses "../data/dev_es/" -- confirm which relative path is current.
%ls ../data/dev_es/

dev_es.tsv  train_es.tsv


In [2]:
import pandas as pd
# Read the train and dev splits from their TSV files, indexed by the "id" column.
df_train, df_dev = (
    pd.read_table(tsv_path, index_col="id")
    for tsv_path in (
        "../../../data/dev_es/train_es.tsv",
        "../../../data/dev_es/dev_es.tsv",
    )
)

# The "text" column is the model input; the "HS" column is the target.
text_train = df_train["text"]
y_train = df_train["HS"]
text_dev = df_dev["text"]
y_dev = df_dev["HS"]

Tengo que hacer dos cosas:

- Primero, convertir los tweets a secuencias de índices enteros (un índice por palabra del vocabulario)
- Luego, rellenar (paddear) o truncar las secuencias a una longitud fija (Keras necesita entradas del mismo tamaño para poder procesarlas en lotes y paralelizar el cálculo)

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Preprocessing settings: vocabulary cap handed to the Tokenizer and the fixed
# sequence length every tweet is padded/truncated to.
num_words = 200000
max_length = 30

# Fit the vocabulary on the training tweets only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(text_train)

# Map each tweet to a sequence of word indices and bring it to `max_length`.
X_train = pad_sequences(tokenizer.texts_to_sequences(text_train), maxlen=max_length)
X_dev = pad_sequences(tokenizer.texts_to_sequences(text_dev), maxlen=max_length)

Using TensorFlow backend.


Carguemos embeddings

In [4]:
import os
from glob import glob

# Directory holding the pre-trained word-vector files (*.vec).
# It can be overridden with the WORD_VECTORS_DIR environment variable so the
# notebook does not have to be edited on another machine; the default keeps the
# original location. expanduser() allows the override to start with "~".
path_to_embeddings = os.path.expanduser(
    os.environ.get("WORD_VECTORS_DIR", "/home/jmperez/WordVectors/")
)

print("Available embeddings: ", glob(os.path.join(path_to_embeddings, "*.vec")))

Available embeddings:  ['/home/jmperez/WordVectors/UBA_w5_200.vec', '/home/jmperez/WordVectors/wiki.es.vec', '/home/jmperez/WordVectors/UBA_w5_300.vec']


# Wikipedia Embeddings

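Me quedo sólo con los embeddings de unigramas: las entradas cuyo token contiene espacios fallan al convertirse a un vector de floats y se descartan.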

In [5]:
import numpy as np

# Load the pre-trained vectors from the text file into a {word: vector} dict.
word_to_vec = {}

# The encoding is given explicitly so that reading does not depend on the
# platform's default locale (fastText distributes its .vec files as UTF-8).
with open(os.path.join(path_to_embeddings, "wiki.es.vec"), encoding="utf-8") as f:
    for line_number, line in enumerate(f):
        values = line.split()
        if not values:
            # Blank line: nothing to parse (values[0] would raise IndexError).
            continue
        if line_number == 0 and len(values) == 2:
            # Header line of the .vec format ("<vocabulary size> <dimension>"),
            # not a word entry.
            continue
        word = values[0]
        try:
            vec = np.asarray(values[1:], dtype="float32")
        except ValueError:
            # The token itself contained whitespace, so part of it ended up
            # among the numeric fields; skip it to keep single-token entries
            # only. Any other kind of error is no longer silenced.
            continue
        word_to_vec[word] = vec

# Dimensionality of the vectors, read off a word expected to be in the file.
embedding_size = len(word_to_vec["hola"])

In [6]:
# Build the weight matrix for the Embedding layer: row i holds the pre-trained
# vector of the word whose tokenizer index is i. Rows of words that have no
# pre-trained vector are left as zeros.
embedding_matrix = np.zeros((num_words, embedding_size))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        # word_index lists every word seen while fitting, not just the
        # num_words most frequent ones; such indices have no row in the matrix
        # and indexing it with them would raise IndexError.
        continue
    embedding_vector = word_to_vec.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [7]:
# Sanity check: the matrix was allocated as (num_words, embedding_size).
embedding_matrix.shape

(200000, 300)

In [9]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence


# LSTM classifier on top of the pre-trained embeddings.
model = Sequential()
# The embedding layer starts from the pre-trained matrix and is frozen
# (trainable=False), so only the layers after it are trained.
model.add(Embedding(num_words, embedding_size, input_length=max_length, weights=[embedding_matrix], trainable=False))
model.add(LSTM(100))
model.add(Dropout(0.65))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.35))
# Single sigmoid unit: binary output, trained with binary cross-entropy.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

# Keep the History object so the per-epoch metrics can be inspected later,
# instead of letting its repr become the cell output.
lstm_history = model.fit(X_train, y_train,
                         validation_data=(X_dev, y_dev),
                         epochs=20, batch_size=32)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 300)           60000000  
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               12928     
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 60,173,457
Trainable params: 173,457
Non-trainable params: 60,000,000
__________________________________________________________

<keras.callbacks.History at 0x7f06445017f0>

In [9]:
# Score the current model on the dev split. The model is compiled with a single
# extra metric, so evaluate() yields the loss followed by the accuracy.
loss, accuracy = model.evaluate(X_dev, y_dev)

for template, value in (
    ("Loss function: {:.3f}", loss),
    ("Accuracy: {:.2f}%", accuracy*100),
):
    print(template.format(value))

Loss function: 0.577
Accuracy: 79.20%


## Bidirectional LSTM

In [10]:
from keras.layers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# NOTE(review): this value is not used by the model below (the Embedding layer
# is sized with embedding_size); it is kept only in case other cells read it.
embedding_vector_length = 32

# Bidirectional LSTM classifier on top of the pre-trained embeddings.
model = Sequential()
# The embedding layer starts from the pre-trained matrix and is frozen
# (trainable=False), so only the layers after it are trained.
model.add(Embedding(num_words, embedding_size, input_length=max_length,
                    weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# Single sigmoid unit: binary output, trained with binary cross-entropy.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

# Keep the History object so the per-epoch metrics can be inspected later,
# instead of letting its repr become the cell output.
bilstm_history = model.fit(X_train, y_train, validation_data=(X_dev, y_dev), epochs=10, batch_size=32)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 300)           60000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               320800    
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               25728     
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 60,346,657
Trainable params: 346,657
Non-trainable params: 60,000,000
__________________________________________________________

<keras.callbacks.History at 0x7f45bc2067f0>

In [12]:
# Score the current model on the dev split and report both numbers.
dev_scores = model.evaluate(X_dev, y_dev)
loss, accuracy = dev_scores

report_lines = [
    "Loss function: {:.3f}".format(loss),
    "Accuracy: {:.2f}%".format(accuracy*100),
]
print("\n".join(report_lines))

Loss function: 0.644
Accuracy: 76.60%
