In [1]:
import numpy as np
import os
import pickle

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation, Concatenate
from keras.layers import Embedding, LSTM, Bidirectional, Flatten
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.optimizers import SGD, Adam
from keras.callbacks import ModelCheckpoint, CSVLogger
from keras.callbacks import LearningRateScheduler as LRS
from keras.models import load_model
from keras.utils import plot_model

import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import string
import re

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from transformers import TFBertForSequenceClassification

In [None]:
from numba import cuda
device = cuda.get_current_device()
device.reset()

In [3]:
def configure_callbacks(model_id, kpi_to_monitor='val_accuracy'):
    # Without log/ or models/ subfolder as not possible to access unexisting folders
    # If possible to train with jupyter revise
    path = "models"
    name = "ap"
    
    log_filename = '%s/%s-%s.log' % (path,name, model_id)
    csv_logger = CSVLogger(log_filename)
    
    chk_1_model_filename = '%s/%s-%s-{epoch:04d}-{%s:.6f}.h5' % (path, name, model_id, kpi_to_monitor)
    chk_2_model_filename = '%s/%s-%s.h5' % (path, name, model_id)
    
    # Save best model fully not only weights after each epoch (period=1) 
    # with best accuracy value (mode=max, save_best_only=True)
    checkpoint1 = ModelCheckpoint(
        chk_1_model_filename,
        monitor=kpi_to_monitor,
        save_best_only=True,
        save_weights_only=False, 
        verbose=1, mode='max', period=1
    )

    checkpoint2 = ModelCheckpoint(
        chk_2_model_filename, 
        monitor=kpi_to_monitor,
        save_best_only=False,
        save_weights_only=False, 
        verbose=1, mode='auto', period=1
    )

    callbacks = [csv_logger, checkpoint1, checkpoint2]
    return callbacks

In [4]:
######### AUTHOR PROFILING FUNCTION FOR JOINING PREDICTIONS #########

# de 20 0s y 20 1s

def author_profiling_report(author_predictions, number_authors=40):
    n = int(number_authors/2)
    a = np.zeros(n)
    b = np.ones(n)        
    author_profile = np.concatenate([a,b])
    
    # Check author profiling -> 8,000 predictions
    # Split into 40 authors -> 200 tweets per author
    author_predictions = np.average(np.array_split(author_predictions, number_authors), axis=1)
    author_predictions = np.array([1 if ap >= 0.5 else 0 for ap in author_predictions])

    print(classification_report(author_profile, author_predictions, labels=[0, 1], target_names=['not hate','hate']))
    return author_predictions

### Load Data

In [5]:
pickle_file = open('es_indv.pickle', 'rb')
es_indv = pickle.load(pickle_file)

embedding_matrix_es_indv = es_indv[0]
train_padded_es_indv, y_train_es_indv = es_indv[1], es_indv[2]
valid_padded_es_indv, y_valid_es_indv = es_indv[3], es_indv[4]
test_padded_es_indv = es_indv[5]

In [6]:
pickle_file = open('en_indv.pickle', 'rb')
en_indv = pickle.load(pickle_file)

embedding_matrix_en_indv = en_indv[0]
train_padded_en_indv, y_train_en_indv = en_indv[1], en_indv[2]
valid_padded_en_indv, y_valid_en_indv = en_indv[3], en_indv[4]
test_padded_en_indv = en_indv[5]

In [7]:
pickle_file = open('es_20.pickle', 'rb')
es_20 = pickle.load(pickle_file)

train_padded_es_20, y_train_es_20 = es_20[0], es_20[1]
valid_padded_es_20, y_valid_es_20 = es_20[2], es_20[3]
test_padded_es_20 = es_20[4]

In [8]:
pickle_file = open('en_20.pickle', 'rb')
en_20 = pickle.load(pickle_file)

train_padded_en_20, y_train_en_20 = en_20[0], en_20[1]
valid_padded_en_20, y_valid_en_20 = en_20[2], en_20[3]
test_padded_en_20 = en_20[4]

In [9]:
pickle_file = open('es_cml.pickle', 'rb')
es_cml = pickle.load(pickle_file)

x_train_es, y_train_es = es_cml[0], es_cml[1]
x_valid_es, y_valid_es = es_cml[2], es_cml[3]
x_test_es = es_cml[4]

In [10]:
pickle_file = open('en_cml.pickle', 'rb')
en_cml = pickle.load(pickle_file)

x_train_en, y_train_en = en_cml[0], en_cml[1]
x_valid_en, y_valid_en = en_cml[2], en_cml[3]
x_test_en = en_cml[4]

In [17]:
pickle_file = open('es_indv_bert.pickle', 'rb')
es_indv_bert = pickle.load(pickle_file)

train_padded_es_indv_bert, train_mask_es_indv, y_train_es = es_indv_bert[0], es_indv_bert[1], es_indv_bert[2]
valid_padded_es_indv_bert, valid_mask_es_indv, y_valid_es = es_indv_bert[3], es_indv_bert[4], es_indv_bert[5]
test_padded_es_indv_bert, test_mask_es_indv = es_indv_bert[6], es_indv_bert[7]

In [18]:
pickle_file = open('es_20_bert.pickle', 'rb')
es_20_bert = pickle.load(pickle_file)

train_padded_es_20_bert, train_mask_es_20, y_train_es = es_20_bert[0], es_20_bert[1], es_20_bert[2]
valid_padded_es_20_bert, valid_mask_es_20, y_valid_es = es_20_bert[3], es_20_bert[4], es_20_bert[5]
test_padded_es_20_bert, test_mask_es_20 = es_20_bert[6], es_20_bert[7]

### RNN Models

#### Individual tweets

In [6]:
# ES INDVIDUAL TWEETS - PRETRAINED EMB
max_words = 10000
word_embedding_size = 300
max_length = 15

model1 = Sequential()
model1.add(Embedding(max_words,
                    output_dim=word_embedding_size,
                    weights=[embedding_matrix_es_indv], # embeddings
                    input_length=max_length,
                    trainable=False,
                    mask_zero=True))
model1.add(LSTM(units=128))
model1.add(Dense(1, activation='sigmoid'))

model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 300)           3000000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               219648    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 3,219,777
Trainable params: 219,777
Non-trainable params: 3,000,000
_________________________________________________________________


In [8]:
model1.fit(x=train_padded_es_indv,
          y=y_train_es_indv,
          validation_data=(valid_padded_es_indv, y_valid_es_indv),
          shuffle=True,
          epochs=25,
          batch_size=64)

predictions = model1.predict(valid_padded_es_indv)
author_profiling_report(predictions)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
              precision    recall  f1-score   support

    not hate       0.86      0.60      0.71        20
        hate       0.69      0.90      0.78        20

    accuracy                           0.75        40
   macro avg       0.77      0.75      0.74        40
weighted avg       0.77      0.75      0.74        40



In [9]:
# ES INDVIDUAL TWEETS - PRETRAINED EMB TRAINABLE
max_words = 10000
word_embedding_size = 300
max_length = 15

model2 = Sequential()
model2.add(Embedding(max_words,
                    output_dim=word_embedding_size,
                    weights=[embedding_matrix_es_indv], # embeddings
                    input_length=max_length,
                    trainable=True,
                    mask_zero=True))
model2.add(LSTM(units=128))
model2.add(Dense(1, activation='sigmoid'))

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 15, 300)           3000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 3,219,777
Trainable params: 3,219,777
Non-trainable params: 0
_________________________________________________________________


In [10]:
model2.fit(x=train_padded_es_indv,
          y=y_train_es_indv,
          validation_data=(valid_padded_es_indv, y_valid_es_indv),
          shuffle=True,
          epochs=25,
          batch_size=64)

predictions = model2.predict(valid_padded_es_indv)
author_profiling_report(predictions)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
              precision    recall  f1-score   support

    not hate       0.83      0.75      0.79        20
        hate       0.77      0.85      0.81        20

    accuracy                           0.80        40
   macro avg       0.80      0.80      0.80        40
weighted avg       0.80      0.80      0.80        40



In [11]:
# ES INDVIDUAL TWEETS - TRAINABLE EMB
max_words = 10000
word_embedding_size = 300
max_length = 15

model3 = Sequential()
model3.add(Embedding(max_words,
                    output_dim=word_embedding_size,
                    #weights=[embedding_matrix_es_indv], # embeddings
                    input_length=max_length,
                    trainable=True, # Changed to True
                    mask_zero=True))
model3.add(LSTM(units=128))
model3.add(Dense(1, activation='sigmoid'))

model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model3.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 15, 300)           3000000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 3,219,777
Trainable params: 3,219,777
Non-trainable params: 0
_________________________________________________________________


In [12]:
model3.fit(x=train_padded_es_indv,
          y=y_train_es_indv,
          validation_data=(valid_padded_es_indv, y_valid_es_indv),
          shuffle=True,
          epochs=25,
          batch_size=64)

predictions = model3.predict(valid_padded_es_indv)
author_profiling_report(predictions)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
              precision    recall  f1-score   support

    not hate       0.79      0.75      0.77        20
        hate       0.76      0.80      0.78        20

    accuracy                           0.78        40
   macro avg       0.78      0.78      0.77        40
weighted avg       0.78      0.78      0.77        40



In [13]:
# ES INDVIDUAL TWEETS - TRAINABLE EMB BI
max_words = 10000
word_embedding_size = 300
max_length = 15

model4 = Sequential()
model4.add(Embedding(max_words,
                    output_dim=word_embedding_size,
                    #weights=[embedding_matrix_en_indv], # embeddings
                    input_length=max_length,
                    trainable=True, # Changed to True
                    mask_zero=True))
model4.add(Bidirectional(LSTM(units=128, dropout=0.05, recurrent_dropout=0.2)))
model4.add(Dense(1, activation='sigmoid'))

model4.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model4.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 15, 300)           3000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               439296    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 3,439,553
Trainable params: 3,439,553
Non-trainable params: 0
_________________________________________________________________


In [14]:
model4.fit(x=train_padded_es_indv,
          y=y_train_es_indv,
          validation_data=(valid_padded_es_indv, y_valid_es_indv),
          shuffle=True,
          epochs=25,
          batch_size=64)

predictions = model4.predict(valid_padded_es_indv)
author_profiling_report(predictions)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
              precision    recall  f1-score   support

    not hate       0.79      0.75      0.77        20
        hate       0.76      0.80      0.78        20

    accuracy                           0.78        40
   macro avg       0.78      0.78      0.77        40
weighted avg       0.78      0.78      0.77        40



In [6]:
# ES INDVIDUAL TWEETS - TRAINABLE EMB COMPLEXER NET
max_words = 10000
word_embedding_size = 300
max_length = 15

model7 = Sequential()
model7.add(Embedding(max_words,
                    output_dim=word_embedding_size,
                    #weights=[embedding_matrix_en_indv], # embeddings
                    input_length=max_length,
                    trainable=True, # Changed to True
                    mask_zero=True))
model7.add(LSTM(units=128, return_sequences=True))
model7.add(LSTM(units=128, return_sequences=True))
model7.add(LSTM(units=128, return_sequences=True))
model7.add(LSTM(units=128))
model7.add(Dense(1, activation='sigmoid'))

model7.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model7.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 300)           3000000   
_________________________________________________________________
lstm (LSTM)                  (None, 15, 128)           219648    
_________________________________________________________________
lstm_1 (LSTM)                (None, 15, 128)           131584    
_________________________________________________________________
lstm_2 (LSTM)                (None, 15, 128)           131584    
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 3,614,529
Trainable params: 3,614,529
Non-trainable params: 0
______________________________________________

In [7]:
model7.fit(x=train_padded_es_indv,
          y=y_train_es_indv,
          validation_data=(valid_padded_es_indv, y_valid_es_indv),
          shuffle=True,
          epochs=25,
          batch_size=64)

predictions = model7.predict(valid_padded_es_indv)
author_profiling_report(predictions)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
              precision    recall  f1-score   support

    not hate       0.75      0.75      0.75        20
        hate       0.75      0.75      0.75        20

    accuracy                           0.75        40
   macro avg       0.75      0.75      0.75        40
weighted avg       0.75      0.75      0.75        40



In [8]:
# ES INDVIDUAL TWEETS - TRAINABLE EMB COMPLEXER NET
max_words = 10000
word_embedding_size = 300
max_length = 15

model8 = Sequential()
model8.add(Embedding(max_words,
                    output_dim=word_embedding_size,
                    #weights=[embedding_matrix_en_indv], # embeddings
                    input_length=max_length,
                    trainable=True, # Changed to True
                    mask_zero=True))
model8.add(LSTM(units=512, return_sequences=True))
model8.add(LSTM(units=512, return_sequences=True))
model8.add(LSTM(units=512, return_sequences=True))
model8.add(LSTM(units=512, return_sequences=True))
model8.add(LSTM(units=512))
model8.add(Dense(1, activation='sigmoid'))

model8.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model8.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 15, 300)           3000000   
_________________________________________________________________
lstm_4 (LSTM)                (None, 15, 512)           1665024   
_________________________________________________________________
lstm_5 (LSTM)                (None, 15, 512)           2099200   
_________________________________________________________________
lstm_6 (LSTM)                (None, 15, 512)           2099200   
_________________________________________________________________
lstm_7 (LSTM)                (None, 15, 512)           2099200   
_________________________________________________________________
lstm_8 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dense_1 (Dense)              (None, 1)                

In [9]:
model8.fit(x=train_padded_es_indv,
          y=y_train_es_indv,
          validation_data=(valid_padded_es_indv, y_valid_es_indv),
          shuffle=True,
          epochs=25,
          batch_size=64)

predictions = model8.predict(valid_padded_es_indv)
author_profiling_report(predictions)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
              precision    recall  f1-score   support

    not hate       0.88      0.75      0.81        20
        hate       0.78      0.90      0.84        20

    accuracy                           0.82        40
   macro avg       0.83      0.82      0.82        40
weighted avg       0.83      0.82      0.82        40



#### 20 joined tweets

In [15]:
# ES 20 JOINED TWEETS - TRAINABLE EMB
max_words = 10000
word_embedding_size = 300
max_length = 250

model5 = Sequential()
model5.add(Embedding(max_words,
                    output_dim=word_embedding_size,
                    #weights=[embedding_matrix_es_indv], # embeddings
                    input_length=max_length,
                    trainable=False,
                    mask_zero=True))
model5.add(LSTM(units=128))
model5.add(Dense(1, activation='sigmoid'))

model5.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model5.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 250, 300)          3000000   
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 3,219,777
Trainable params: 219,777
Non-trainable params: 3,000,000
_________________________________________________________________


In [16]:
model5.fit(x=train_padded_es_20,
          y=y_train_es_20,
          validation_data=(valid_padded_es_20, y_valid_es_20),
          shuffle=True,
          epochs=25,
          batch_size=64)

predictions = model5.predict(valid_padded_es_20)
author_profiling_report(predictions)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
              precision    recall  f1-score   support

    not hate       0.74      1.00      0.85        20
        hate       1.00      0.65      0.79        20

    accuracy                           0.82        40
   macro avg       0.87      0.82      0.82        40
weighted avg       0.87      0.82      0.82        40



In [17]:
# ES INDVIDUAL TWEETS - TRAINABLE EMB BI
max_words = 10000
word_embedding_size = 300
max_length = 250

model6 = Sequential()
model6.add(Embedding(max_words,
                    output_dim=word_embedding_size,
                    #weights=[embedding_matrix_en_indv], # embeddings
                    input_length=max_length,
                    trainable=True, # Changed to True
                    mask_zero=True))
model6.add(Bidirectional(LSTM(units=128, dropout=0.05, recurrent_dropout=0.2)))
model6.add(Dense(1, activation='sigmoid'))

model6.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model6.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 250, 300)          3000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               439296    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 257       
Total params: 3,439,553
Trainable params: 3,439,553
Non-trainable params: 0
_________________________________________________________________


In [18]:
model6.fit(x=train_padded_es_20,
          y=y_train_es_20,
          validation_data=(valid_padded_es_20, y_valid_es_20),
          shuffle=True,
          epochs=25,
          batch_size=64)

predictions = model6.predict(valid_padded_es_20)
author_profiling_report(predictions)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
              precision    recall  f1-score   support

    not hate       0.68      0.95      0.79        20
        hate       0.92      0.55      0.69        20

    accuracy                           0.75        40
   macro avg       0.80      0.75      0.74        40
weighted avg       0.80      0.75      0.74        40



In [10]:
# ES INDVIDUAL TWEETS - TRAINABLE EMB COMPLEXER NET
max_words = 10000
word_embedding_size = 300
max_length = 250

model9 = Sequential()
model9.add(Embedding(max_words,
                    output_dim=word_embedding_size,
                    #weights=[embedding_matrix_en_indv], # embeddings
                    input_length=max_length,
                    trainable=True, # Changed to True
                    mask_zero=True))
model9.add(LSTM(units=128, return_sequences=True))
model9.add(LSTM(units=128, return_sequences=True))
model9.add(LSTM(units=128, return_sequences=True))
model9.add(LSTM(units=128))
model9.add(Dense(1, activation='sigmoid'))

model9.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model9.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 250, 300)          3000000   
_________________________________________________________________
lstm_9 (LSTM)                (None, 250, 128)          219648    
_________________________________________________________________
lstm_10 (LSTM)               (None, 250, 128)          131584    
_________________________________________________________________
lstm_11 (LSTM)               (None, 250, 128)          131584    
_________________________________________________________________
lstm_12 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 3,614,529
Trainable params: 3,614,529
Non-trainable params: 0
____________________________________________

In [11]:
model9.fit(x=train_padded_es_20,
          y=y_train_es_20,
          validation_data=(valid_padded_es_20, y_valid_es_20),
          shuffle=True,
          epochs=25,
          batch_size=64)

predictions = model9.predict(valid_padded_es_20)
author_profiling_report(predictions)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
              precision    recall  f1-score   support

    not hate       0.61      1.00      0.75        20
        hate       1.00      0.35      0.52        20

    accuracy                           0.68        40
   macro avg       0.80      0.68      0.64        40
weighted avg       0.80      0.68      0.64        40



In [20]:
# ES INDVIDUAL TWEETS - TRAINABLE EMB COMPLEXER NET
max_words = 10000
word_embedding_size = 300
max_length = 250

model10 = Sequential()
model10.add(Embedding(max_words,
                    output_dim=word_embedding_size,
                    #weights=[embedding_matrix_en_indv], # embeddings
                    input_length=max_length,
                    trainable=True, # Changed to True
                    mask_zero=True))
model10.add(LSTM(units=512, return_sequences=True))
model10.add(LSTM(units=512, return_sequences=True))
model10.add(LSTM(units=512, return_sequences=True))
model10.add(LSTM(units=512, return_sequences=True))
model10.add(LSTM(units=512))
model10.add(Dense(1, activation='sigmoid'))

model10.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model10.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 300)          3000000   
_________________________________________________________________
lstm (LSTM)                  (None, 250, 512)          1665024   
_________________________________________________________________
lstm_1 (LSTM)                (None, 250, 512)          2099200   
_________________________________________________________________
lstm_2 (LSTM)                (None, 250, 512)          2099200   
_________________________________________________________________
lstm_3 (LSTM)                (None, 250, 512)          2099200   
_________________________________________________________________
lstm_4 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dense (Dense)                (None, 1)                 5

In [21]:
model10.fit(x=train_padded_es_20,
          y=y_train_es_20,
          validation_data=(valid_padded_es_20, y_valid_es_20),
          shuffle=True,
          epochs=25,
          batch_size=32)

predictions = model10.predict(valid_padded_es_20)
author_profiling_report(predictions)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
              precision    recall  f1-score   support

    not hate       0.64      0.90      0.75        20
        hate       0.83      0.50      0.62        20

    accuracy                           0.70        40
   macro avg       0.74      0.70      0.69        40
weighted avg       0.74      0.70      0.69        40



### Bert

In [21]:
#  ES individual tweets

bert_model1 = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

bert_model1.compile(loss='binary_crossentropy',
                   optimizer=Adam(learning_rate=2e-5,epsilon=1e-08),
                   metrics=['accuracy'])
bert_model1.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_75 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [None]:
bert_model1.fit(x=[train_padded_es_indv_bert, train_mask_es_20],
               y=y_train_es,
               batch_size=32,
               epochs=4,
               validation_data=([valid_padded_es_20_bert, valid_mask_es_20], y_valid_es))

predictions = bert_model1.predict([test_padded_es_20_bert, test_mask_es_20], batch_size=32)
author_profiling_report(predictions)

In [None]:
# ES joined 20 tweets

bert_model2 = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

bert_model2.compile(loss='binary_crossentropy',
                   optimizer=Adam(learning_rate=2e-5,epsilon=1e-08),
                   metrics=['accuracy'])
bert_model2.summary()

In [None]:
bert_model2.fit(x=[train_inp, train_mask],
               y=train_label,
               batch_size=32,
               epochs=4,
               validation_data=([val_inp,val_mask],val_label))


predictions = bert_model2.predict([val_inp,val_mask],batch_size=32)
author_profiling_report(predictions)

### Classical ML

#### Train

In [22]:
# LogisticRegression

pipe = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression())
])
    
param_grid = {"tfidf__ngram_range" : [(1,2),(1,3),(2,3),(3,4),(3,5),(3,6),(4,5)],
              "tfidf__max_df":[0.3,0.4,0.5,0.6,0.7,0.8,0.9],
              "tfidf__min_df":[1,2,3,5], # or percentages
              "tfidf__analyzer":["char_wb"], # n-grams
              "clf__C":[1,10,100,1000,10000]
             }

clf_lr = GridSearchCV(pipe,
                      param_grid,
                      cv=5,
                      n_jobs=-1,
                      verbose=2,
                      scoring="f1_macro")

clf_lr.fit(x_train_es, y_train_es)
print(clf_lr.best_score_)
clf_lr.best_params_

Fitting 5 folds for each of 980 candidates, totalling 4900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 21.9min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 33.1min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 46.3min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 61.5min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 78.9min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 98.3min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 119.8min
[Parallel(n_jobs=-1)]: Done 4900 out of 4900 | elapsed: 120.3min finished


0.6550097877356043


{'clf__C': 1,
 'tfidf__analyzer': 'char_wb',
 'tfidf__max_df': 0.9,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (3, 6)}

#### Evaluation over validation

In [13]:
# Logistic Regression ES
''' BEST PARAMETERS

{'clf__C': 1,
 'tfidf__analyzer': 'char_wb',
 'tfidf__max_df': 0.9,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (3, 6)}
'''

pipe = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(3,6),
                                 max_df=0.9,
                                 min_df=1,
                                 analyzer="char_wb")),
        ("clf", LogisticRegression(C=1))
])

pipe.fit(x_train_es, y_train_es)
predictions = pipe.predict(x_valid_es)
author_profiling_report(predictions)

              precision    recall  f1-score   support

    not hate       0.76      0.80      0.78        20
        hate       0.79      0.75      0.77        20

    accuracy                           0.78        40
   macro avg       0.78      0.78      0.77        40
weighted avg       0.78      0.78      0.77        40



In [16]:
# Logistic Regression EN

pipe.fit(x_train_en, y_train_en)
predictions = pipe.predict(x_valid_en)
author_profiling_report(predictions)

              precision    recall  f1-score   support

    not hate       0.76      0.80      0.78        20
        hate       0.79      0.75      0.77        20

    accuracy                           0.78        40
   macro avg       0.78      0.78      0.77        40
weighted avg       0.78      0.78      0.77        40



### Final Training

#### Auxiliar Functions

In [9]:
# Define a learning rate scheduler
def scheduler(epoch):
    if epoch < 25:
        return 0.001
    elif epoch < 50:
        return 0.0005
    elif epoch < 75:
        return 0.0001
    else:
        return 0.00005

scheduler_lr = LRS(scheduler)

In [10]:
# Optimizer
adam = Adam(learning_rate=0.001)
epochs = 100

#### Selected models

In [11]:
# ES 20 JOINED TWEETS - TRAINABLE EMB
max_words = 10000
word_embedding_size = 300
max_length = 250

model5 = Sequential()
model5.add(Embedding(max_words,
                    output_dim=word_embedding_size,
                    #weights=[embedding_matrix_es_indv], # embeddings
                    input_length=max_length,
                    trainable=False,
                    mask_zero=True))
model5.add(LSTM(units=128))
model5.add(Dense(1, activation='sigmoid'))

model5.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model5.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 300)          3000000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               219648    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 3,219,777
Trainable params: 219,777
Non-trainable params: 3,000,000
_________________________________________________________________


In [12]:
# ES INDVIDUAL TWEETS - TRAINABLE EMB COMPLEXER NET
max_words = 10000
word_embedding_size = 300
max_length = 15

model8 = Sequential()
model8.add(Embedding(max_words,
                    output_dim=word_embedding_size,
                    #weights=[embedding_matrix_en_indv], # embeddings
                    input_length=max_length,
                    trainable=True, # Changed to True
                    mask_zero=True))
model8.add(LSTM(units=512, return_sequences=True))
model8.add(LSTM(units=512, return_sequences=True))
model8.add(LSTM(units=512, return_sequences=True))
model8.add(LSTM(units=512, return_sequences=True))
model8.add(LSTM(units=512))
model8.add(Dense(1, activation='sigmoid'))

model8.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model8.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 15, 300)           3000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 15, 512)           1665024   
_________________________________________________________________
lstm_2 (LSTM)                (None, 15, 512)           2099200   
_________________________________________________________________
lstm_3 (LSTM)                (None, 15, 512)           2099200   
_________________________________________________________________
lstm_4 (LSTM)                (None, 15, 512)           2099200   
_________________________________________________________________
lstm_5 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dense_1 (Dense)              (None, 1)                

In [13]:
# ES 20 JOINED TWEETS - TRAINABLE EMB COMPLEXER NET
max_words = 10000
word_embedding_size = 300
max_length = 250

model10 = Sequential()
model10.add(Embedding(max_words,
                    output_dim=word_embedding_size,
                    #weights=[embedding_matrix_en_indv], # embeddings
                    input_length=max_length,
                    trainable=True, # Changed to True
                    mask_zero=True))
model10.add(LSTM(units=512, return_sequences=True))
model10.add(LSTM(units=512, return_sequences=True))
model10.add(LSTM(units=512, return_sequences=True))
model10.add(LSTM(units=512, return_sequences=True))
model10.add(LSTM(units=512))
model10.add(Dense(1, activation='sigmoid'))

model10.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model10.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 250, 300)          3000000   
_________________________________________________________________
lstm_6 (LSTM)                (None, 250, 512)          1665024   
_________________________________________________________________
lstm_7 (LSTM)                (None, 250, 512)          2099200   
_________________________________________________________________
lstm_8 (LSTM)                (None, 250, 512)          2099200   
_________________________________________________________________
lstm_9 (LSTM)                (None, 250, 512)          2099200   
_________________________________________________________________
lstm_10 (LSTM)               (None, 512)               2099200   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                

#### Training and evaluation

##### ES 20 JOINED TWEETS - TRAINABLE EMB

In [14]:
model_5_es = model5
model_id = 'model_5_es'

model_5_es.fit(x=train_padded_es_20,
               y=y_train_es_20,
               batch_size=128,
               validation_data=(valid_padded_es_20, y_valid_es_20),
               shuffle=True,
               verbose=1,                
               epochs=epochs,
               callbacks=[configure_callbacks(model_id), scheduler_lr])

Epoch 1/100
Epoch 00001: val_accuracy improved from -inf to 0.71250, saving model to models/ap-model_5_es-0001-0.712500.h5

Epoch 00001: saving model to models/ap-model_5_es.h5
Epoch 2/100
Epoch 00002: val_accuracy improved from 0.71250 to 0.72750, saving model to models/ap-model_5_es-0002-0.727500.h5

Epoch 00002: saving model to models/ap-model_5_es.h5
Epoch 3/100
Epoch 00003: val_accuracy did not improve from 0.72750

Epoch 00003: saving model to models/ap-model_5_es.h5
Epoch 4/100
Epoch 00004: val_accuracy did not improve from 0.72750

Epoch 00004: saving model to models/ap-model_5_es.h5
Epoch 5/100
Epoch 00005: val_accuracy did not improve from 0.72750

Epoch 00005: saving model to models/ap-model_5_es.h5
Epoch 6/100
Epoch 00006: val_accuracy did not improve from 0.72750

Epoch 00006: saving model to models/ap-model_5_es.h5
Epoch 7/100
Epoch 00007: val_accuracy did not improve from 0.72750

Epoch 00007: saving model to models/ap-model_5_es.h5
Epoch 8/100
Epoch 00008: val_accuracy 

Epoch 24/100
Epoch 00024: val_accuracy did not improve from 0.73250

Epoch 00024: saving model to models/ap-model_5_es.h5
Epoch 25/100
Epoch 00025: val_accuracy did not improve from 0.73250

Epoch 00025: saving model to models/ap-model_5_es.h5
Epoch 26/100
Epoch 00026: val_accuracy did not improve from 0.73250

Epoch 00026: saving model to models/ap-model_5_es.h5
Epoch 27/100
Epoch 00027: val_accuracy did not improve from 0.73250

Epoch 00027: saving model to models/ap-model_5_es.h5
Epoch 28/100
Epoch 00028: val_accuracy did not improve from 0.73250

Epoch 00028: saving model to models/ap-model_5_es.h5
Epoch 29/100
Epoch 00029: val_accuracy did not improve from 0.73250

Epoch 00029: saving model to models/ap-model_5_es.h5
Epoch 30/100
Epoch 00030: val_accuracy did not improve from 0.73250

Epoch 00030: saving model to models/ap-model_5_es.h5
Epoch 31/100
Epoch 00031: val_accuracy did not improve from 0.73250

Epoch 00031: saving model to models/ap-model_5_es.h5
Epoch 32/100
Epoch 00032

Epoch 00072: val_accuracy did not improve from 0.73250

Epoch 00072: saving model to models/ap-model_5_es.h5
Epoch 73/100
Epoch 00073: val_accuracy did not improve from 0.73250

Epoch 00073: saving model to models/ap-model_5_es.h5
Epoch 74/100
Epoch 00074: val_accuracy did not improve from 0.73250

Epoch 00074: saving model to models/ap-model_5_es.h5
Epoch 75/100
Epoch 00075: val_accuracy did not improve from 0.73250

Epoch 00075: saving model to models/ap-model_5_es.h5
Epoch 76/100
Epoch 00076: val_accuracy did not improve from 0.73250

Epoch 00076: saving model to models/ap-model_5_es.h5
Epoch 77/100
Epoch 00077: val_accuracy did not improve from 0.73250

Epoch 00077: saving model to models/ap-model_5_es.h5
Epoch 78/100
Epoch 00078: val_accuracy did not improve from 0.73250

Epoch 00078: saving model to models/ap-model_5_es.h5
Epoch 79/100
Epoch 00079: val_accuracy did not improve from 0.73250

Epoch 00079: saving model to models/ap-model_5_es.h5
Epoch 80/100
Epoch 00080: val_accurac

<tensorflow.python.keras.callbacks.History at 0x7f4a546c0d68>

In [8]:
model_5_es = load_model('models/ap-model_5_es-0008-0.732500.h5')
predictions = model_5_es.predict(valid_padded_es_20)
author_profiling_report(predictions)

              precision    recall  f1-score   support

    not hate       0.83      1.00      0.91        20
        hate       1.00      0.80      0.89        20

    accuracy                           0.90        40
   macro avg       0.92      0.90      0.90        40
weighted avg       0.92      0.90      0.90        40



##### EN 20 JOINED TWEETS - TRAINABLE EMB

In [15]:
model_5_en = model5
model_id = 'model_5_en'

model_5_en.fit(x=train_padded_en_20,
               y=y_train_es_20,
               batch_size=128,
               validation_data=(valid_padded_en_20, y_valid_en_20),
               shuffle=True,
               verbose=1,                
               epochs=epochs,
               callbacks=[configure_callbacks(model_id), scheduler_lr])

Epoch 1/100
Epoch 00001: val_accuracy improved from -inf to 0.66750, saving model to models/ap-model_5_en-0001-0.667500.h5

Epoch 00001: saving model to models/ap-model_5_en.h5
Epoch 2/100
Epoch 00002: val_accuracy improved from 0.66750 to 0.68750, saving model to models/ap-model_5_en-0002-0.687500.h5

Epoch 00002: saving model to models/ap-model_5_en.h5
Epoch 3/100
Epoch 00003: val_accuracy did not improve from 0.68750

Epoch 00003: saving model to models/ap-model_5_en.h5
Epoch 4/100
Epoch 00004: val_accuracy did not improve from 0.68750

Epoch 00004: saving model to models/ap-model_5_en.h5
Epoch 5/100
Epoch 00005: val_accuracy did not improve from 0.68750

Epoch 00005: saving model to models/ap-model_5_en.h5
Epoch 6/100
Epoch 00006: val_accuracy did not improve from 0.68750

Epoch 00006: saving model to models/ap-model_5_en.h5
Epoch 7/100
Epoch 00007: val_accuracy did not improve from 0.68750

Epoch 00007: saving model to models/ap-model_5_en.h5
Epoch 8/100
Epoch 00008: val_accuracy 

Epoch 00024: val_accuracy did not improve from 0.68750

Epoch 00024: saving model to models/ap-model_5_en.h5
Epoch 25/100
Epoch 00025: val_accuracy did not improve from 0.68750

Epoch 00025: saving model to models/ap-model_5_en.h5
Epoch 26/100
Epoch 00026: val_accuracy did not improve from 0.68750

Epoch 00026: saving model to models/ap-model_5_en.h5
Epoch 27/100
Epoch 00027: val_accuracy did not improve from 0.68750

Epoch 00027: saving model to models/ap-model_5_en.h5
Epoch 28/100
Epoch 00028: val_accuracy did not improve from 0.68750

Epoch 00028: saving model to models/ap-model_5_en.h5
Epoch 29/100
Epoch 00029: val_accuracy did not improve from 0.68750

Epoch 00029: saving model to models/ap-model_5_en.h5
Epoch 30/100
Epoch 00030: val_accuracy did not improve from 0.68750

Epoch 00030: saving model to models/ap-model_5_en.h5
Epoch 31/100
Epoch 00031: val_accuracy did not improve from 0.68750

Epoch 00031: saving model to models/ap-model_5_en.h5
Epoch 32/100
Epoch 00032: val_accurac

Epoch 00072: val_accuracy did not improve from 0.68750

Epoch 00072: saving model to models/ap-model_5_en.h5
Epoch 73/100
Epoch 00073: val_accuracy did not improve from 0.68750

Epoch 00073: saving model to models/ap-model_5_en.h5
Epoch 74/100
Epoch 00074: val_accuracy did not improve from 0.68750

Epoch 00074: saving model to models/ap-model_5_en.h5
Epoch 75/100
Epoch 00075: val_accuracy did not improve from 0.68750

Epoch 00075: saving model to models/ap-model_5_en.h5
Epoch 76/100
Epoch 00076: val_accuracy did not improve from 0.68750

Epoch 00076: saving model to models/ap-model_5_en.h5
Epoch 77/100
Epoch 00077: val_accuracy did not improve from 0.68750

Epoch 00077: saving model to models/ap-model_5_en.h5
Epoch 78/100
Epoch 00078: val_accuracy did not improve from 0.68750

Epoch 00078: saving model to models/ap-model_5_en.h5
Epoch 79/100
Epoch 00079: val_accuracy did not improve from 0.68750

Epoch 00079: saving model to models/ap-model_5_en.h5
Epoch 80/100
Epoch 00080: val_accurac

<tensorflow.python.keras.callbacks.History at 0x7f4a1c2e6d30>

In [12]:
model_5_en = load_model('models/ap-model_5_en-0002-0.687500.h5')
predictions = model_5_en.predict(valid_padded_en_20)
author_profiling_report(predictions)

              precision    recall  f1-score   support

    not hate       0.65      1.00      0.78        20
        hate       1.00      0.45      0.62        20

    accuracy                           0.73        40
   macro avg       0.82      0.72      0.70        40
weighted avg       0.82      0.72      0.70        40



##### ES INDIVIDUAL TWEETS - TRAINABLE EMB COMPLEXER NET

In [16]:
model_8_es = model8
model_id = 'model_8_es'

model_8_es.fit(x=train_padded_es_indv,
               y=y_train_es_indv,               
               batch_size=128,
               validation_data=(valid_padded_es_indv, y_valid_es_indv),
               shuffle=True,
               verbose=1,                
               epochs=epochs,
               callbacks=[configure_callbacks(model_id), scheduler_lr])

Epoch 1/100
Epoch 00001: val_accuracy improved from -inf to 0.47462, saving model to models/ap-model_8_es-0001-0.474625.h5

Epoch 00001: saving model to models/ap-model_8_es.h5
Epoch 2/100
Epoch 00002: val_accuracy improved from 0.47462 to 0.53962, saving model to models/ap-model_8_es-0002-0.539625.h5

Epoch 00002: saving model to models/ap-model_8_es.h5
Epoch 3/100
Epoch 00003: val_accuracy did not improve from 0.53962

Epoch 00003: saving model to models/ap-model_8_es.h5
Epoch 4/100
Epoch 00004: val_accuracy did not improve from 0.53962

Epoch 00004: saving model to models/ap-model_8_es.h5
Epoch 5/100
Epoch 00005: val_accuracy improved from 0.53962 to 0.56775, saving model to models/ap-model_8_es-0005-0.567750.h5

Epoch 00005: saving model to models/ap-model_8_es.h5
Epoch 6/100
Epoch 00006: val_accuracy improved from 0.56775 to 0.58050, saving model to models/ap-model_8_es-0006-0.580500.h5

Epoch 00006: saving model to models/ap-model_8_es.h5
Epoch 7/100
Epoch 00007: val_accuracy imp

Epoch 00047: val_accuracy did not improve from 0.61387

Epoch 00047: saving model to models/ap-model_8_es.h5
Epoch 48/100
Epoch 00048: val_accuracy did not improve from 0.61387

Epoch 00048: saving model to models/ap-model_8_es.h5
Epoch 49/100
Epoch 00049: val_accuracy did not improve from 0.61387

Epoch 00049: saving model to models/ap-model_8_es.h5
Epoch 50/100
Epoch 00050: val_accuracy did not improve from 0.61387

Epoch 00050: saving model to models/ap-model_8_es.h5
Epoch 51/100
Epoch 00051: val_accuracy did not improve from 0.61387

Epoch 00051: saving model to models/ap-model_8_es.h5
Epoch 52/100
Epoch 00052: val_accuracy did not improve from 0.61387

Epoch 00052: saving model to models/ap-model_8_es.h5
Epoch 53/100
Epoch 00053: val_accuracy did not improve from 0.61387

Epoch 00053: saving model to models/ap-model_8_es.h5
Epoch 54/100
Epoch 00054: val_accuracy did not improve from 0.61387

Epoch 00054: saving model to models/ap-model_8_es.h5
Epoch 55/100
Epoch 00055: val_accurac

Epoch 00095: val_accuracy did not improve from 0.61387

Epoch 00095: saving model to models/ap-model_8_es.h5
Epoch 96/100
Epoch 00096: val_accuracy did not improve from 0.61387

Epoch 00096: saving model to models/ap-model_8_es.h5
Epoch 97/100
Epoch 00097: val_accuracy did not improve from 0.61387

Epoch 00097: saving model to models/ap-model_8_es.h5
Epoch 98/100
Epoch 00098: val_accuracy did not improve from 0.61387

Epoch 00098: saving model to models/ap-model_8_es.h5
Epoch 99/100
Epoch 00099: val_accuracy did not improve from 0.61387

Epoch 00099: saving model to models/ap-model_8_es.h5
Epoch 100/100
Epoch 00100: val_accuracy did not improve from 0.61387

Epoch 00100: saving model to models/ap-model_8_es.h5


<tensorflow.python.keras.callbacks.History at 0x7f4a1c2b4ac8>

In [13]:
model_8_es = load_model('models/ap-model_8_es-0010-0.613875.h5')
predictions = model_8_es.predict(valid_padded_es_indv)
author_profiling_report(predictions)

              precision    recall  f1-score   support

    not hate       0.87      0.65      0.74        20
        hate       0.72      0.90      0.80        20

    accuracy                           0.78        40
   macro avg       0.79      0.78      0.77        40
weighted avg       0.79      0.78      0.77        40



##### EN INDIVIDUAL TWEETS - TRAINABLE EMB COMPLEXER NET

In [17]:
model_8_en = model8
model_id = 'model_8_en'

model_8_en.fit(x=train_padded_en_indv,
               y=y_train_en_indv,               
               batch_size=128,
               validation_data=(valid_padded_en_indv, y_valid_en_indv),
               shuffle=True,
               verbose=1,                
               epochs=epochs,
               callbacks=[configure_callbacks(model_id), scheduler_lr])

Epoch 1/100
Epoch 00001: val_accuracy improved from -inf to 0.58025, saving model to models/ap-model_8_en-0001-0.580250.h5

Epoch 00001: saving model to models/ap-model_8_en.h5
Epoch 2/100
Epoch 00002: val_accuracy did not improve from 0.58025

Epoch 00002: saving model to models/ap-model_8_en.h5
Epoch 3/100
Epoch 00003: val_accuracy improved from 0.58025 to 0.58300, saving model to models/ap-model_8_en-0003-0.583000.h5

Epoch 00003: saving model to models/ap-model_8_en.h5
Epoch 4/100
Epoch 00004: val_accuracy did not improve from 0.58300

Epoch 00004: saving model to models/ap-model_8_en.h5
Epoch 5/100
Epoch 00005: val_accuracy did not improve from 0.58300

Epoch 00005: saving model to models/ap-model_8_en.h5
Epoch 6/100
Epoch 00006: val_accuracy did not improve from 0.58300

Epoch 00006: saving model to models/ap-model_8_en.h5
Epoch 7/100
Epoch 00007: val_accuracy did not improve from 0.58300

Epoch 00007: saving model to models/ap-model_8_en.h5
Epoch 8/100
Epoch 00008: val_accuracy 

Epoch 24/100
Epoch 00024: val_accuracy did not improve from 0.58300

Epoch 00024: saving model to models/ap-model_8_en.h5
Epoch 25/100
Epoch 00025: val_accuracy did not improve from 0.58300

Epoch 00025: saving model to models/ap-model_8_en.h5
Epoch 26/100
Epoch 00026: val_accuracy did not improve from 0.58300

Epoch 00026: saving model to models/ap-model_8_en.h5
Epoch 27/100
Epoch 00027: val_accuracy did not improve from 0.58300

Epoch 00027: saving model to models/ap-model_8_en.h5
Epoch 28/100
Epoch 00028: val_accuracy did not improve from 0.58300

Epoch 00028: saving model to models/ap-model_8_en.h5
Epoch 29/100
Epoch 00029: val_accuracy did not improve from 0.58300

Epoch 00029: saving model to models/ap-model_8_en.h5
Epoch 30/100
Epoch 00030: val_accuracy did not improve from 0.58300

Epoch 00030: saving model to models/ap-model_8_en.h5
Epoch 31/100
Epoch 00031: val_accuracy did not improve from 0.58300

Epoch 00031: saving model to models/ap-model_8_en.h5
Epoch 32/100
Epoch 00032

Epoch 00048: val_accuracy did not improve from 0.58300

Epoch 00048: saving model to models/ap-model_8_en.h5
Epoch 49/100
Epoch 00049: val_accuracy did not improve from 0.58300

Epoch 00049: saving model to models/ap-model_8_en.h5
Epoch 50/100
Epoch 00050: val_accuracy did not improve from 0.58300

Epoch 00050: saving model to models/ap-model_8_en.h5
Epoch 51/100
Epoch 00051: val_accuracy did not improve from 0.58300

Epoch 00051: saving model to models/ap-model_8_en.h5
Epoch 52/100
Epoch 00052: val_accuracy did not improve from 0.58300

Epoch 00052: saving model to models/ap-model_8_en.h5
Epoch 53/100
Epoch 00053: val_accuracy did not improve from 0.58300

Epoch 00053: saving model to models/ap-model_8_en.h5
Epoch 54/100
Epoch 00054: val_accuracy did not improve from 0.58300

Epoch 00054: saving model to models/ap-model_8_en.h5
Epoch 55/100
Epoch 00055: val_accuracy did not improve from 0.58300

Epoch 00055: saving model to models/ap-model_8_en.h5
Epoch 56/100
Epoch 00056: val_accurac

Epoch 00096: val_accuracy did not improve from 0.58300

Epoch 00096: saving model to models/ap-model_8_en.h5
Epoch 97/100
Epoch 00097: val_accuracy did not improve from 0.58300

Epoch 00097: saving model to models/ap-model_8_en.h5
Epoch 98/100
Epoch 00098: val_accuracy did not improve from 0.58300

Epoch 00098: saving model to models/ap-model_8_en.h5
Epoch 99/100
Epoch 00099: val_accuracy did not improve from 0.58300

Epoch 00099: saving model to models/ap-model_8_en.h5
Epoch 100/100
Epoch 00100: val_accuracy did not improve from 0.58300

Epoch 00100: saving model to models/ap-model_8_en.h5


<tensorflow.python.keras.callbacks.History at 0x7f4a0c479588>

In [14]:
model_8_en = load_model('models/ap-model_8_en-0003-0.583000.h5')
predictions = model_8_en.predict(valid_padded_en_indv)
author_profiling_report(predictions)

              precision    recall  f1-score   support

    not hate       0.83      0.75      0.79        20
        hate       0.77      0.85      0.81        20

    accuracy                           0.80        40
   macro avg       0.80      0.80      0.80        40
weighted avg       0.80      0.80      0.80        40



##### ES 20 JOINED TWEETS - TRAINABLE EMB COMPLEXER NET

In [None]:
model_10_es = model10
model_id = 'model_10_es'

model_10_es.fit(x=train_padded_es_20,
                y=y_train_es_20,
                batch_size=128,
                validation_data=(valid_padded_es_20, y_valid_es_20),
                shuffle=True,
                verbose=1,                
                epochs=epochs,
                callbacks=[configure_callbacks(model_id), scheduler_lr])

In [15]:
model_10_es = load_model('models/ap-model_10_es-0002-0.727500.h5')
predictions = model_10_es.predict(valid_padded_es_20)
author_profiling_report(predictions)

              precision    recall  f1-score   support

    not hate       0.67      1.00      0.80        20
        hate       1.00      0.50      0.67        20

    accuracy                           0.75        40
   macro avg       0.83      0.75      0.73        40
weighted avg       0.83      0.75      0.73        40



##### EN 20 JOINED TWEETS - TRAINABLE EMB COMPLEXER NET

In [None]:
model_10_en = model10
model_id = 'model_10_en'

model_10_en.fit(x=train_padded_en_20,
                y=y_train_en_20,
                batch_size=128
                validation_data=(valid_padded_en_20, y_valid_en_20),
                shuffle=True,
                verbose=1,                
                epochs=epochs,
                callbacks=[configure_callbacks(model_id), scheduler])

In [16]:
model_10_en = load_model('models/ap-model_10_en-0078-0.660000.h5')
predictions = model_10_en.predict(valid_padded_en_20)
author_profiling_report(predictions)

              precision    recall  f1-score   support

    not hate       0.65      1.00      0.78        20
        hate       1.00      0.45      0.62        20

    accuracy                           0.73        40
   macro avg       0.82      0.72      0.70        40
weighted avg       0.82      0.72      0.70        40



### Test predictions

In [11]:
# Logistic Regression

pipe = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(3,6),
                                 max_df=0.9,
                                 min_df=1,
                                 analyzer="char_wb")),
        ("clf", LogisticRegression(C=1))
])

pipe.fit(x_train_es, y_train_es)
predictions = pipe.predict(x_test_es)
author_profiling_report(predictions, number_authors=100)

pipe.fit(x_train_en, y_train_en)
predictions = pipe.predict(x_test_en)
author_profiling_report(predictions, number_authors=100)

              precision    recall  f1-score   support

    not hate       0.43      0.38      0.40        50
        hate       0.45      0.50      0.47        50

    accuracy                           0.44       100
   macro avg       0.44      0.44      0.44       100
weighted avg       0.44      0.44      0.44       100

              precision    recall  f1-score   support

    not hate       0.43      0.38      0.40        50
        hate       0.45      0.50      0.47        50

    accuracy                           0.44       100
   macro avg       0.44      0.44      0.44       100
weighted avg       0.44      0.44      0.44       100



array([0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0])

In [12]:
# Model 5 indv
model_5_es = load_model('models/ap-model_5_es-0008-0.732500.h5')
predictions = model_5_es.predict(test_padded_es_20)
author_profiling_report(predictions, number_authors=100)

model_5_en = load_model('models/ap-model_5_en-0002-0.687500.h5')
predictions = model_5_en.predict(test_padded_en_20)
author_profiling_report(predictions, number_authors=100)

              precision    recall  f1-score   support

    not hate       0.50      0.16      0.24        50
        hate       0.50      0.84      0.63        50

    accuracy                           0.50       100
   macro avg       0.50      0.50      0.43       100
weighted avg       0.50      0.50      0.43       100

              precision    recall  f1-score   support

    not hate       0.55      0.48      0.51        50
        hate       0.54      0.60      0.57        50

    accuracy                           0.54       100
   macro avg       0.54      0.54      0.54       100
weighted avg       0.54      0.54      0.54       100



array([0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1])

In [13]:
# Model 8 20 joined
model_8_es = load_model('models/ap-model_8_es-0010-0.613875.h5')
predictions = model_8_es.predict(test_padded_en_indv)
author_profiling_report(predictions, number_authors=100)

model_8_en = load_model('models/ap-model_8_en-0003-0.583000.h5')
predictions = model_8_en.predict(test_padded_en_indv)
author_profiling_report(predictions, number_authors=100)

              precision    recall  f1-score   support

    not hate       0.51      0.86      0.64        50
        hate       0.56      0.18      0.27        50

    accuracy                           0.52       100
   macro avg       0.54      0.52      0.46       100
weighted avg       0.54      0.52      0.46       100

              precision    recall  f1-score   support

    not hate       0.48      0.66      0.55        50
        hate       0.45      0.28      0.35        50

    accuracy                           0.47       100
   macro avg       0.46      0.47      0.45       100
weighted avg       0.46      0.47      0.45       100



array([1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [14]:
pickle_file = open('test_es_20.pickle', 'rb')
test_es_20 = pickle.load(pickle_file)

test_id_es, x_test_es = test_es_20[0], test_es_20[1]

pickle_file = open('test_en_20.pickle', 'rb')
test_en_20 = pickle.load(pickle_file)

test_id_en, x_test_en = test_en_20[0], test_en_20[1]

In [15]:
# Model 5 indv
model_5_es = load_model('models/ap-model_5_es-0008-0.732500.h5')
predictions_es = model_5_es.predict(x_test_es)

model_5_en = load_model('models/ap-model_5_en-0002-0.687500.h5')
predictions_en = model_5_en.predict(x_test_en)

In [16]:
len(x_test_en), len(predictions_es)

(1000, 1000)

In [17]:
predictions_es = author_profiling_report(predictions_es, number_authors=100)
predictions_en = author_profiling_report(predictions_en, number_authors=100)

              precision    recall  f1-score   support

    not hate       0.50      0.16      0.24        50
        hate       0.50      0.84      0.63        50

    accuracy                           0.50       100
   macro avg       0.50      0.50      0.43       100
weighted avg       0.50      0.50      0.43       100

              precision    recall  f1-score   support

    not hate       0.55      0.48      0.51        50
        hate       0.54      0.60      0.57        50

    accuracy                           0.54       100
   macro avg       0.54      0.54      0.54       100
weighted avg       0.54      0.54      0.54       100



In [21]:
aux = []
for i in range(0, len(test_id_es), 10):
    aux.append(test_id_es[i])
test_id_es = aux
len(test_id_es), len(predictions_es)

(100, 100)

In [23]:
aux = []
for i in range(0, len(test_id_en), 10):
    aux.append(test_id_en[i])
test_id_en = aux
len(test_id_en), len(predictions_en)

(100, 100)

In [18]:
# 100 authors 100 tweets in test
# txt output
# <author id="author-id" lang="en|es" type="0|1"/>
def format_output(author_predictions, authors, lang):
    for i in range(len(author_predictions)):
        root = ET.Element("author")
        root.set("id", str(authors[i]))
        root.set("lang", lang)
        root.set("type", str(author_predictions[i]))

        tree = ET.ElementTree(root)
        xml_str = ET.tostring(root).decode('utf8')

        save_path_file = f'results/{lang}/{authors[i]}.xml'

        with open(save_path_file, "w") as f:
            f.write(xml_str)

In [24]:
format_output(predictions_es, test_id_es, "es")
format_output(predictions_en, test_id_en, "en")