Prerrequisitos: Preprocessing, featurize

En esta libreta cargamos los features textuales, calculamos los embeddings y entrenamos los modelos de deep learning.

### 0. Librerías, funciones, random seed, etc.

In [1]:
# Librerias

import pandas as pd
import numpy as np
import re
import nltk
import tensorflow.keras.models
from nltk.corpus import stopwords

from numpy import array
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM, Bidirectional, concatenate
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
#from tokenizer import tokenizer as reddit_tokenizer
# my modules
from utils import *

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [3]:
import numpy
import tensorflow
import sys
import random as rn

# Fix the seed of every random number generator used in this notebook.
# Note that Python's built-in `random` is seeded with 12345, while numpy
# and TensorFlow both use 42 (the log message only mentions the latter two).
rn.seed(12345)
np.random.seed(42)
tensorflow.random.set_seed(42)

logger("Initialized numpy random and tensorflow random seed at 42")

Initialized numpy random and tensorflow random seed at 42


### 1. Loading data

In [4]:
# Load the four pickled train/test splits in one pass; the order of the
# file names matches the order of the variables being assigned.
X_train, X_test, y_train, y_test = (
    load_pickle(pickle_path, file_name)
    for file_name in ("X_train.pkl", "X_test.pkl", "y_train.pkl", "y_test.pkl")
)

### 2. Preparing the Embeddings layer

In [5]:
# Build the vocabulary from the training texts only, then convert both splits
# to sequences of integer word indices. X_train / X_test are rebound here, so
# this cell must not be re-run without reloading the raw texts first.
tokenizer = Tokenizer(num_words=50000)  # 5000 appears to be an earlier setting
tokenizer.fit_on_texts(X_train)

X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [6]:
# Padding: bring every index sequence to the same fixed length

# Index 0 is reserved, hence one more row than there are known words
vocab_size = 1 + len(tokenizer.word_index)

# Free choice; it used to be 10000, currently trying 50000
maxlen = 50000

# padding='post' appends the filler values at the end of each sequence
X_train = pad_sequences(
    X_train,
    maxlen=maxlen,
    padding='post',
)
X_test = pad_sequences(
    X_test,
    maxlen=maxlen,
    padding='post',
)

In [7]:
# Loading the GloVe embeddings

from numpy import array
from numpy import asarray
from numpy import zeros

# Parse the GloVe text file into a {word: vector} lookup table.
# Each line holds a token followed by its vector components, separated by whitespace.
embeddings_dictionary = dict()

# The context manager guarantees the file is closed even if a line fails to parse.
with open('/datos/erisk/deep-learning/embeddings/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Skip blank lines instead of failing on records[0]
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [8]:
# Build the embedding matrix: row i receives the GloVe vector of the word whose
# tokenizer index is i. Rows of words that have no GloVe entry stay all-zero.

embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    if word in embeddings_dictionary:
        embedding_matrix[index] = embeddings_dictionary[word]

In [9]:
# Store the embedding matrix through the save_pickle helper (defined outside this
# notebook; presumably writes "embedding_matrix.pkl" under pickle_path — confirm).
save_pickle(pickle_path, "embedding_matrix.pkl", embedding_matrix)

### Non-DL models

In [None]:
# Classical (non deep-learning) classifiers, each with a fixed random_state.
# NOTE(review): none of them is referenced again in the visible part of the notebook.
log_clf = LogisticRegression(
    solver="lbfgs",
    random_state=42,
)
rnd_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
svm_clf = SVC(
    gamma="scale",
    probability=True,
    random_state=42,
)

### 3. Preparing and training model

In [9]:
# Load the precomputed feature tables and keep only their underlying arrays.
# NOTE(review): assumes the pickles hold pandas objects (they expose `.values`) — confirm.
feats_train = load_pickle(pickle_path, "feats_train.pkl").values
feats_test = load_pickle(pickle_path, "feats_test.pkl").values

In [10]:
# Width of the extra-feature input = number of columns of the feature array.
# Read it from the array shape rather than measuring row 1, which would fail
# when the array holds fewer than two rows.
meta_input_len = feats_train.shape[1]

In [12]:
# Bidirectional LSTM model (takes too long to train)
# Two branches: the token-index input goes through frozen pre-trained embeddings
# and a BiLSTM; its output is concatenated with the extra-feature input before
# the dense classification head.
meta_input = Input(shape=(meta_input_len,))
nlp_input = Input(shape=(maxlen,))

embedded_tokens = Embedding(
    vocab_size,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,  # keep the pre-trained vectors fixed
)(nlp_input)
text_features = Bidirectional(LSTM(128))(embedded_tokens)

merged = concatenate([text_features, meta_input])
hidden = Dense(32, activation='relu')(merged)
output = Dense(1, activation='sigmoid')(hidden)

model_lstm = Model(inputs=[nlp_input, meta_input], outputs=[output])
model_lstm.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [13]:
# BiLSTM training is disabled; feats_train is already an array, so it is passed as-is.
#history = model_lstm.fit([X_train, feats_train], y_train, batch_size=2, epochs=10, verbose=1, validation_split=0.2, shuffle=True)

In [11]:
# CNN model
# Two branches: the token-index input goes through frozen pre-trained embeddings,
# a 1-D convolution and global max pooling; the pooled vector is concatenated
# with the extra-feature input before the dense classification head.
meta_input = Input(shape=(meta_input_len,))
nlp_input = Input(shape=(maxlen,))

embedded_tokens = Embedding(
    vocab_size,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,  # keep the pre-trained vectors fixed
)(nlp_input)
conv_features = Conv1D(64, 5, activation='relu')(embedded_tokens)
pooled = GlobalMaxPooling1D()(conv_features)

merged = concatenate([pooled, meta_input])
hidden = Dense(32, activation='relu')(merged)
output = Dense(1, activation='sigmoid')(hidden)

model_cnn = Model(inputs=[nlp_input, meta_input], outputs=[output])
model_cnn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [None]:
# Train the CNN on both inputs: the padded token sequences and the feature array.
# feats_train is already an array here (`.values` was taken when it was loaded),
# so it is passed directly. validation_split=0.2 sets aside part of the training
# data for validation.
history = model_cnn.fit([X_train, feats_train], y_train, batch_size=2, epochs=10, verbose=1, validation_split=0.2, shuffle=True)

Epoch 1/10

### 4. Evaluating model

In [None]:
def evaluate_model(model, inputs=None, labels=None):
    """Evaluate a trained model on held-out data and log the results.

    Logs the first two values returned by ``model.evaluate`` (as "Test Score"
    and "Test Accuracy"), followed by scikit-learn's classification report and
    confusion matrix computed from ``model.predict``.

    Args:
        model: Compiled model exposing ``evaluate`` and ``predict``.
        inputs: Data fed to the model. When omitted, the notebook-level test
            data is used: ``X_test`` alone for a model that declares a single
            input, otherwise ``[X_test, feats_test]``.
        labels: Ground-truth labels. Defaults to the notebook-level ``y_test``.

    Returns:
        None. Everything is emitted through ``logger``.
    """
    from sklearn.metrics import classification_report, confusion_matrix

    if inputs is None:
        # Text-only models declare one input; handing them the two-element
        # list would not match their input structure.
        n_inputs = len(getattr(model, "inputs", None) or ())
        inputs = X_test if n_inputs == 1 else [X_test, feats_test]
    if labels is None:
        labels = y_test

    score = model.evaluate(inputs, labels, verbose=1)
    logger("Test Score: {}".format(score[0]))
    logger("Test Accuracy: {}".format(score[1]))

    y_pred = model.predict(inputs, batch_size=2, verbose=1)
    if y_pred.shape[-1] > 1:
        # One column per class: pick the most probable class
        y_pred_label = y_pred.argmax(axis=-1)
    else:
        # Single sigmoid output: threshold at 0.5 and flatten (n, 1) -> (n,)
        y_pred_label = (y_pred > 0.5).astype('int32').reshape(-1)

    logger(classification_report(labels, y_pred_label))
    logger(confusion_matrix(labels, y_pred_label))

In [None]:
# Log test metrics, classification report and confusion matrix for the CNN model.
evaluate_model(model_cnn)

In [None]:
# Log test metrics, classification report and confusion matrix for the BiLSTM model.
# NOTE(review): the only model_lstm.fit call visible in this notebook is commented
# out, so this may be evaluating an untrained model — confirm.
evaluate_model(model_lstm)

### 5. Text-only models (no extra features) for comparison

In [1]:
# Text-only CNN model: same architecture as the combined CNN, minus the
# extra-feature branch, so the pooled text features feed the dense head directly.
nlp_input = Input(shape=(maxlen,))
emb = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False)(nlp_input)
nlp_out = Conv1D(64, 5, activation='relu')(emb)
max_pool = GlobalMaxPooling1D()(nlp_out)
classifier = Dense(32, activation='relu')(max_pool)
output = Dense(1, activation='sigmoid')(classifier)
model_cnn_simple = Model(inputs=nlp_input, outputs=output)

model_cnn_simple.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Text-only Bidirectional LSTM model.
# It gets its own name (model_lstm_simple) so the combined model_lstm is not
# overwritten, and it takes nlp_input only: no extra-feature input is connected
# to this graph.
nlp_input = Input(shape=(maxlen,))
emb = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False)(nlp_input)
nlp_out = Bidirectional(LSTM(128))(emb)
classifier = Dense(32, activation='relu')(nlp_out)
output = Dense(1, activation='sigmoid')(classifier)
model_lstm_simple = Model(inputs=nlp_input, outputs=output)

model_lstm_simple.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

NameError: name 'Input' is not defined

In [None]:
# Train the text-only CNN on the padded token sequences alone
history = model_cnn_simple.fit(
    X_train,
    y_train,
    batch_size=2,
    epochs=10,
    verbose=1,
    validation_split=0.2,
    shuffle=True,
)

# Log the first two values returned by evaluate() on the test split
score = model_cnn_simple.evaluate(X_test, y_test, verbose=1)
for template, value in zip(("Test Score: {}", "Test Accuracy: {}"), score):
    logger(template.format(value))

Epoch 1/10
Epoch 2/10
Epoch 3/10

In [None]:
# Train the text-only BiLSTM on the padded token sequences alone
history = model_lstm_simple.fit(X_train, y_train, batch_size=2, epochs=10, verbose=1, validation_split=0.2, shuffle=True)

# Evaluate the model that was just trained (not the CNN) on the test split
score = model_lstm_simple.evaluate(X_test, y_test, verbose=1)
logger("Test Score: {}".format(score[0]))
logger("Test Accuracy: {}".format(score[1]))

In [None]:
# Log test metrics, classification report and confusion matrix for the text-only CNN.
evaluate_model(model_cnn_simple)

In [None]:
# Log test metrics, classification report and confusion matrix for the text-only BiLSTM.
evaluate_model(model_lstm_simple)