Prerrequisitos: Preprocessing, featurize

En esta libreta cargamos los features textuales, calculamos los embeddings y entrenamos el modelo de deep learning.

### 0. Librerías, funciones, random seed, etc.

In [1]:
# Librerias

import pandas as pd
import numpy as np
import re
import nltk
import tensorflow.keras.models
from nltk.corpus import stopwords

from numpy import array
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM, Bidirectional, concatenate
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tokenizer import tokenizer as reddit_tokenizer

In [2]:
# Funciones utiles
import sys
import os
import pickle

# Directory holding the pickled artefacts read and written by this notebook.
# It can be overridden without editing the notebook by setting the
# DL_NOTEBOOKS_PICKLE_PATH environment variable; when the variable is unset
# the original hard-coded location is used.
pickle_path = os.environ.get(
    "DL_NOTEBOOKS_PICKLE_PATH",
    "/datos/ecampillo/jupyter/dl-notebooks/pickles",
)

def logger(message, debug_file="log.txt"):
    """Print ``message`` to standard output and append it to ``debug_file``.

    Args:
        message: Object to log; it is rendered the same way ``print`` renders it.
        debug_file: Path of the log file. It is opened in append mode, so
            earlier entries are preserved.
    """
    print(message)
    # Write directly to the file instead of temporarily rebinding sys.stdout:
    # if the write fails, the process-wide stdout is never left pointing at a
    # closed file.
    with open(debug_file, 'a') as f:
        print(message, file=f)
        
def save_pickle(filepath, filename, data):
    """Serialize ``data`` with pickle into ``filepath/filename``.

    Args:
        filepath: Destination directory; it is created (including any missing
            parents) when it does not exist yet.
        filename: Name of the pickle file inside ``filepath``.
        data: Any picklable object.
    """
    # exist_ok=True replaces the separate "exists?" check, so there is no
    # window in which another process can create the directory between the
    # check and the creation.
    os.makedirs(filepath, exist_ok=True)
    file = os.path.join(filepath, filename)
    with open(file, 'wb') as data_file:
        pickle.dump(data, data_file)
        
def load_pickle(filepath, filename):
    """Read back the object pickled at ``filepath/filename``.

    Args:
        filepath: Directory that contains the pickle file.
        filename: Name of the pickle file inside ``filepath``.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code, so only point this at
    # files you trust.
    with open(os.path.join(filepath, filename), 'rb') as handle:
        return pickle.load(handle)

In [3]:
import numpy
import tensorflow
import sys

from numpy.random import seed  # kept so the name `seed` stays available to other cells

# Pin NumPy's global random state and TensorFlow's global seed to the same
# value, then record that in the log.
numpy.random.seed(42)
tensorflow.random.set_seed(42)
logger("Initialized numpy random and tensorflow random seed at 42")

Initialized numpy random and tensorflow random seed at 42


### 1. Loading data

In [4]:
# Train/test inputs and their targets, loaded from the pickle directory.
X_train, X_test, y_train, y_test = [
    load_pickle(pickle_path, pkl_name)
    for pkl_name in ("X_train.pkl", "X_test.pkl", "y_train.pkl", "y_test.pkl")
]

# Additional features, fed to the model as its second input further down.
feats_train, feats_test = [
    load_pickle(pickle_path, pkl_name)
    for pkl_name in ("feats_train.pkl", "feats_test.pkl")
]

### 2. Preparing the Embeddings layer

In [5]:
%%time

# Vocabulary is fitted on the training texts only.
# (5000 was presumably an earlier num_words setting.)
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(X_train)

# Both splits are replaced by their integer-sequence encoding.
X_train, X_test = [
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_test)
]

CPU times: user 6.32 s, sys: 78.6 ms, total: 6.4 s
Wall time: 6.4 s


In [6]:
# Padding

# Index 0 is reserved, so the vocabulary needs one slot more than word_index has.
vocab_size = 1 + len(tokenizer.word_index)

# Fixed sequence length; it could be any value we choose
# (previously 10000, now trying 50000).
maxlen = 50000

# Pad at the end of each sequence up to maxlen, for both splits.
X_train, X_test = [
    pad_sequences(sequences, maxlen=maxlen, padding='post')
    for sequences in (X_train, X_test)
]

In [7]:
%%time
# Loading the GloVe embeddings

from numpy import array
from numpy import asarray
from numpy import zeros

# Maps each word of the GloVe file to its float32 vector.
embeddings_dictionary = dict()

# The context manager guarantees the file is closed even when a line fails to
# parse (the original only closed it after a fully successful loop).
with open('/datos/erisk/deep-learning/embeddings/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Blank line: nothing to index, and records[0] would raise IndexError.
            continue
        # First token is the word, the remaining tokens are its vector components.
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

CPU times: user 10.6 s, sys: 933 ms, total: 11.6 s
Wall time: 10.5 s


In [8]:
%%time
# Creating an embedding matrix

# One 100-dimensional row per vocabulary index, initialised to zeros.
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: its row stays all-zero.
        continue
    embedding_matrix[index] = embedding_vector

CPU times: user 163 ms, sys: 86.8 ms, total: 249 ms
Wall time: 247 ms


In [9]:
# Persist the embedding matrix next to the other pickled artefacts.
save_pickle(
    filepath=pickle_path,
    filename="embedding_matrix.pkl",
    data=embedding_matrix,
)

### 3. Preparing and training model

In [10]:
# Bidirectional LSTM model (takes too long to train)
# NOTE: the CNN cell further down rebinds `model`, so when both cells are run
# the later compile/fit steps operate on the CNN, not on this network.

# Two inputs: the padded token-id sequence and one extra scalar per sample.
meta_input = Input(shape=(1,))
nlp_input = Input(shape=(maxlen,))

# Frozen embedding layer whose weights come from embedding_matrix.
emb = Embedding(
    input_dim=vocab_size,
    output_dim=100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)(nlp_input)
nlp_out = Bidirectional(LSTM(units=128))(emb)

# Join the text representation with the extra input before classifying.
concat = concatenate([nlp_out, meta_input])
classifier = Dense(units=32, activation='relu')(concat)
output = Dense(units=1, activation='sigmoid')(classifier)
model = Model(inputs=[nlp_input, meta_input], outputs=[output])

In [15]:
# CNN model

# Two inputs: the padded token-id sequence and one extra scalar per sample.
meta_input = Input(shape=(1,))
nlp_input = Input(shape=(maxlen,))

# Frozen embedding layer whose weights come from embedding_matrix.
emb = Embedding(
    input_dim=vocab_size,
    output_dim=100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)(nlp_input)

# 1-D convolution over the sequence, reduced by global max pooling.
nlp_out = Conv1D(filters=64, kernel_size=5, activation='relu')(emb)
max_pool = GlobalMaxPooling1D()(nlp_out)

# Join the text representation with the extra input before classifying.
concat = concatenate([max_pool, meta_input])
classifier = Dense(units=32, activation='relu')(concat)
output = Dense(units=1, activation='sigmoid')(classifier)
model = Model(inputs=[nlp_input, meta_input], outputs=[output])

In [16]:
# Single sigmoid output, so binary cross-entropy is the loss; accuracy is tracked.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [17]:
# Train on [token sequences, extra features]; 20% of the training data is
# held out for validation.
# NOTE(review): `feats_train.values()` is a method call, which fits a dict-like
# object. If feats_train is a pandas object, `.values` is an attribute and this
# call would fail — confirm the pickled type.
history = model.fit(
    x=[X_train, feats_train.values()],
    y=y_train,
    batch_size=2,
    epochs=10,
    verbose=1,
    validation_split=0.2,
    shuffle=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
