In [1]:
import numpy as np
import os
import pickle

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation, Concatenate
from keras.layers import Embedding, LSTM, Bidirectional, Flatten
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.optimizers import SGD, Adam
from keras.callbacks import ModelCheckpoint, CSVLogger
from keras.callbacks import LearningRateScheduler as LRS
from keras.models import load_model
from keras.utils import plot_model

import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import string
import re

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from transformers import TFBertForSequenceClassification

In [2]:
# Read-only input dataset holding the preprocessed pickles, and the writable
# output root of the Kaggle session.
kaggle_path_pickle = '/kaggle/input/bert-preprocesed-author-profiling/'
kaggle_path_models = '/kaggle/working/'

# Create the folder that configure_callbacks writes logs/checkpoints into.
# os.makedirs(..., exist_ok=True) is idempotent (a plain `mkdir models` errors
# when the cell is re-run) and does not depend on the current working directory.
os.makedirs(kaggle_path_models + 'models', exist_ok=True)

In [3]:
def configure_callbacks(model_id, kpi_to_monitor='val_accuracy'):
    """Build the logging and checkpointing callbacks for one training run.

    Files are written to ``<kaggle_path_models>models/`` with the prefix ``ap``:

    * ``ap-<model_id>.log`` -- per-epoch metrics written by ``CSVLogger``.
    * ``ap-<model_id>-<epoch>-<kpi>.h5`` -- full model, saved only on epochs
      where the monitored KPI improves.
    * ``ap-<model_id>.h5`` -- full model, overwritten after every epoch.

    Args:
        model_id: Identifier embedded in every file name.
        kpi_to_monitor: Name of the logged metric used to decide which epoch
            is the best one. Metrics whose name contains ``"loss"`` are
            minimised; every other metric is maximised.

    Returns:
        list: ``[csv_logger, best_checkpoint, latest_checkpoint]``.
    """
    path = kaggle_path_models + "models"
    name = "ap"

    # Make sure the target folder exists before any callback writes to it.
    os.makedirs(path, exist_ok=True)

    log_filename = '%s/%s-%s.log' % (path, name, model_id)
    csv_logger = CSVLogger(log_filename)

    chk_1_model_filename = '%s/%s-%s-{epoch:04d}-{%s:.6f}.h5' % (path, name, model_id, kpi_to_monitor)
    chk_2_model_filename = '%s/%s-%s.h5' % (path, name, model_id)

    # A hard-coded 'max' would keep the *worst* epoch when a loss is monitored,
    # so pick the direction from the KPI name (default KPI still uses 'max').
    best_mode = 'min' if 'loss' in kpi_to_monitor else 'max'

    # Best model so far (full model, not only weights). The deprecated
    # `period=1` argument is dropped: saving once per epoch is the default.
    checkpoint1 = ModelCheckpoint(
        chk_1_model_filename,
        monitor=kpi_to_monitor,
        save_best_only=True,
        save_weights_only=False,
        verbose=1, mode=best_mode
    )

    # Latest model, rewritten after every epoch regardless of the KPI.
    checkpoint2 = ModelCheckpoint(
        chk_2_model_filename,
        monitor=kpi_to_monitor,
        save_best_only=False,
        save_weights_only=False,
        verbose=1, mode='auto'
    )

    callbacks = [csv_logger, checkpoint1, checkpoint2]
    return callbacks

In [None]:
######### AUTHOR PROFILING FUNCTION FOR JOINING PREDICTIONS #########

def author_profiling_report(author_profile, number_authors=40):
    """Print an author-level classification report from per-sample scores.

    The samples are assumed to be ordered by author, with the first half of
    the authors belonging to class 0 ("not hate") and the second half to
    class 1 ("hate") -- e.g. 20 zeros followed by 20 ones for 40 authors.

    Args:
        author_profile: Per-sample prediction scores in [0, 1] (any shape, it
            is flattened). The parameter name is kept for compatibility; the
            previous implementation discarded this argument and silently read
            a global ``predictions`` variable instead.
        number_authors: Number of authors the samples are split into. Must be
            even, because the ground truth is built as half 0s / half 1s.

    Raises:
        ValueError: If ``number_authors`` is not a positive even number.
    """
    if number_authors <= 0 or number_authors % 2:
        raise ValueError("number_authors must be a positive even number")

    half = number_authors // 2
    # Ground truth: `half` authors of class 0 followed by `half` of class 1.
    true_labels = np.concatenate([np.zeros(half), np.ones(half)])

    scores = np.asarray(author_profile, dtype=float).ravel()

    # One contiguous chunk of samples per author (e.g. 8,000 predictions over
    # 40 authors -> 200 per author). Averaging chunk by chunk also works when
    # the samples do not divide evenly between the authors.
    author_scores = np.array(
        [chunk.mean() for chunk in np.array_split(scores, number_authors)]
    )
    author_predictions = (author_scores >= 0.5).astype(int)

    print(classification_report(true_labels, author_predictions, labels=[0, 1], target_names=['not hate','hate']))

In [4]:
# NOTE: pickle.load can execute arbitrary code -- only load pickles you trust.
# The `with` blocks close the file handles, which the bare open() calls leaked.

# Spanish data. Each pickle is indexed as
# [train inputs, train labels, validation inputs, validation labels, test inputs].
with open(kaggle_path_pickle + 'es_20.pickle', 'rb') as pickle_file:
    es_20 = pickle.load(pickle_file)

train_padded_es_20, y_train_es_20 = es_20[0], es_20[1]
valid_padded_es_20, y_valid_es_20 = es_20[2], es_20[3]
test_padded_es_20 = es_20[4]

# English data, same layout.
with open(kaggle_path_pickle + 'en_20.pickle', 'rb') as pickle_file:
    en_20 = pickle.load(pickle_file)

train_padded_en_20, y_train_en_20 = en_20[0], en_20[1]
valid_padded_en_20, y_valid_en_20 = en_20[2], en_20[3]
test_padded_en_20 = en_20[4]

# Step-wise learning-rate schedule keyed on the epoch index
def scheduler(epoch):
    """Return the learning rate to use for ``epoch``.

    The rate is 0.001 before epoch 25, 0.0005 before epoch 50, 0.0001 before
    epoch 75 and 0.00005 from epoch 75 onwards.
    """
    # (exclusive upper epoch bound, learning rate), checked in ascending order.
    steps = (
        (25, 0.001),
        (50, 0.0005),
        (75, 0.0001),
    )
    for upper_bound, rate in steps:
        if epoch < upper_bound:
            return rate
    return 0.00005

# Number of epochs used by the LSTM training runs.
epochs = 100

# Optimizer handed to model.compile; its starting rate equals the first step
# of `scheduler`.
adam = Adam(learning_rate=0.001)

# LearningRateScheduler callback (imported as LRS) wrapping `scheduler`.
scheduler_lr = LRS(scheduler)

In [5]:
# ES 20 JOINED TWEETS - TRAINABLE EMB COMPLEXER NET
# Vocabulary size, embedding width and padded sequence length of the input.
max_words = 10000
word_embedding_size = 300
max_length = 250

# Trainable embedding (no pretrained matrix is passed; index 0 is masked),
# five stacked 512-unit LSTMs -- the first four return full sequences so the
# next LSTM can consume them -- and one sigmoid unit for the binary label.
model10 = Sequential([
    Embedding(max_words,
              output_dim=word_embedding_size,
              input_length=max_length,
              trainable=True,
              mask_zero=True),
    *[LSTM(units=512, return_sequences=True) for _ in range(4)],
    LSTM(units=512),
    Dense(1, activation='sigmoid'),
])

model10.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model10.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 300)          3000000   
_________________________________________________________________
lstm (LSTM)                  (None, 250, 512)          1665024   
_________________________________________________________________
lstm_1 (LSTM)                (None, 250, 512)          2099200   
_________________________________________________________________
lstm_2 (LSTM)                (None, 250, 512)          2099200   
_________________________________________________________________
lstm_3 (LSTM)                (None, 250, 512)          2099200   
_________________________________________________________________
lstm_4 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dense (Dense)                (None, 1)                 5

In [6]:
# Spanish run of the stacked-LSTM network.
model_10_es = model10
model_id = 'model_10_es'

# configure_callbacks already returns a list, so concatenate instead of nesting
# it inside another list; this does not rely on Keras flattening nested
# callback lists. Callback order is unchanged.
model_10_es.fit(x=train_padded_es_20,
                y=y_train_es_20,
                batch_size=128,
                validation_data=(valid_padded_es_20, y_valid_es_20),
                shuffle=True,
                verbose=1,
                epochs=epochs,
                callbacks=configure_callbacks(model_id) + [scheduler_lr])

Epoch 1/100

Epoch 00001: val_accuracy improved from -inf to 0.68750, saving model to /kaggle/working/models/ap-model_10_es-0001-0.687500.h5

Epoch 00001: saving model to /kaggle/working/models/ap-model_10_es.h5
Epoch 2/100

Epoch 00002: val_accuracy improved from 0.68750 to 0.72500, saving model to /kaggle/working/models/ap-model_10_es-0002-0.725000.h5

Epoch 00002: saving model to /kaggle/working/models/ap-model_10_es.h5
Epoch 3/100

Epoch 00003: val_accuracy did not improve from 0.72500

Epoch 00003: saving model to /kaggle/working/models/ap-model_10_es.h5
Epoch 4/100

Epoch 00004: val_accuracy did not improve from 0.72500

Epoch 00004: saving model to /kaggle/working/models/ap-model_10_es.h5
Epoch 5/100

Epoch 00005: val_accuracy did not improve from 0.72500

Epoch 00005: saving model to /kaggle/working/models/ap-model_10_es.h5
Epoch 6/100

Epoch 00006: val_accuracy did not improve from 0.72500

Epoch 00006: saving model to /kaggle/working/models/ap-model_10_es.h5
Epoch 7/100

Epoc

<tensorflow.python.keras.callbacks.History at 0x7fdfe07bf750>

In [7]:
# English run of the stacked-LSTM network.
# `model_10_en = model10` would only alias the model that was just trained on
# the Spanish data (same weights, same optimizer state), so the "English" model
# would really be the Spanish one trained further. clone_model rebuilds the
# same architecture with freshly initialised weights; a clone is not compiled,
# so compile it again with its own optimizer.
model_10_en = keras.models.clone_model(model10)
model_10_en.compile(loss='binary_crossentropy',
                    optimizer=Adam(learning_rate=0.001),
                    metrics=['accuracy'])
model_id = 'model_10_en'

# configure_callbacks already returns a list, so concatenate instead of nesting.
model_10_en.fit(x=train_padded_en_20,
                y=y_train_en_20,
                batch_size=128,
                validation_data=(valid_padded_en_20, y_valid_en_20),
                shuffle=True,
                verbose=1,
                epochs=epochs,
                callbacks=configure_callbacks(model_id) + [scheduler_lr])

Epoch 1/100

Epoch 00001: val_accuracy improved from -inf to 0.65000, saving model to /kaggle/working/models/ap-model_10_en-0001-0.650000.h5

Epoch 00001: saving model to /kaggle/working/models/ap-model_10_en.h5
Epoch 2/100

Epoch 00002: val_accuracy improved from 0.65000 to 0.67500, saving model to /kaggle/working/models/ap-model_10_en-0002-0.675000.h5

Epoch 00002: saving model to /kaggle/working/models/ap-model_10_en.h5
Epoch 3/100

Epoch 00003: val_accuracy did not improve from 0.67500

Epoch 00003: saving model to /kaggle/working/models/ap-model_10_en.h5
Epoch 4/100

Epoch 00004: val_accuracy did not improve from 0.67500

Epoch 00004: saving model to /kaggle/working/models/ap-model_10_en.h5
Epoch 5/100

Epoch 00005: val_accuracy did not improve from 0.67500

Epoch 00005: saving model to /kaggle/working/models/ap-model_10_en.h5
Epoch 6/100

Epoch 00006: val_accuracy did not improve from 0.67500

Epoch 00006: saving model to /kaggle/working/models/ap-model_10_en.h5
Epoch 7/100

Epoc

<tensorflow.python.keras.callbacks.History at 0x7fdb8e26dc10>

In [8]:
# Recursively pack the checkpoint/log folder into models_kaggle.zip
# (created in the current working directory).
!zip -r models_kaggle.zip '/kaggle/working/models/'

  adding: kaggle/working/models/ (stored 0%)
  adding: kaggle/working/models/ap-model_10_es-0002-0.725000.h5 (deflated 6%)
  adding: kaggle/working/models/ap-model_10_en-0002-0.675000.h5 (deflated 6%)
  adding: kaggle/working/models/ap-model_10_es.log (deflated 61%)
  adding: kaggle/working/models/ap-model_10_en-0008-0.690000.h5 (deflated 6%)
  adding: kaggle/working/models/ap-model_10_es-0001-0.687500.h5 (deflated 6%)
  adding: kaggle/working/models/ap-model_10_es.h5 (deflated 6%)
  adding: kaggle/working/models/ap-model_10_en.h5 (deflated 6%)
  adding: kaggle/working/models/ap-model_10_en-0001-0.650000.h5 (deflated 6%)
  adding: kaggle/working/models/ap-model_10_en.log (deflated 65%)
  adding: kaggle/working/models/ap-model_10_en-0007-0.680000.h5 (deflated 6%)


In [None]:
# BERT inputs for the individual Spanish tweets.
# `kaggle_path` is never defined in this notebook; the pickles are read from the
# input dataset folder `kaggle_path_pickle`. The `with` block closes the file.
# NOTE: pickle.load can execute arbitrary code -- only load pickles you trust.
with open(kaggle_path_pickle + 'es_indv_bert.pickle', 'rb') as pickle_file:
    es_indv_bert = pickle.load(pickle_file)

# Indexed as [train ids, train mask, train labels,
#             valid ids, valid mask, valid labels, test ids, test mask].
train_padded_es_indv_bert, train_mask_es_indv, y_train_es_indv = es_indv_bert[0], es_indv_bert[1], es_indv_bert[2]
valid_padded_es_indv_bert, valid_mask_es_indv, y_valid_es_indv = es_indv_bert[3], es_indv_bert[4], es_indv_bert[5]
test_padded_es_indv_bert, test_mask_es_indv = es_indv_bert[6], es_indv_bert[7]

In [None]:
# BERT inputs for the Spanish tweets joined in groups of 20.
# `kaggle_path` is never defined in this notebook; the pickles are read from the
# input dataset folder `kaggle_path_pickle`. The `with` block closes the file.
# NOTE: pickle.load can execute arbitrary code -- only load pickles you trust.
with open(kaggle_path_pickle + 'es_20_bert.pickle', 'rb') as pickle_file:
    es_20_bert = pickle.load(pickle_file)

# Indexed as [train ids, train mask, train labels,
#             valid ids, valid mask, valid labels, test ids, test mask].
# NOTE: y_train_es_20 / y_valid_es_20 rebind the names that were loaded earlier
# from es_20.pickle.
train_padded_es_20_bert, train_mask_es_20, y_train_es_20 = es_20_bert[0], es_20_bert[1], es_20_bert[2]
valid_padded_es_20_bert, valid_mask_es_20, y_valid_es_20 = es_20_bert[3], es_20_bert[4], es_20_bert[5]
test_padded_es_20_bert, test_mask_es_20 = es_20_bert[6], es_20_bert[7]

In [None]:
#  ES individual tweets (learning rate 2e-5)

# NOTE(review): 'bert-base-uncased' is the English checkpoint; confirm it matches
# the tokenizer that produced the preprocessed Spanish inputs.
bert_model1 = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# With num_labels=1 the head emits one raw logit per sample (the evaluation
# cell applies the sigmoid itself). The string loss 'binary_crossentropy'
# expects probabilities, so the loss must be built with from_logits=True, and
# accuracy must threshold at logit 0 (= probability 0.5). The metric keeps the
# name 'accuracy' so logged keys ('accuracy', 'val_accuracy') do not change.
bert_model1.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                   optimizer=Adam(learning_rate=2e-5, epsilon=1e-08),
                   metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])
bert_model1.summary()

In [None]:
# Fine-tune bert_model1: inputs are [token ids, attention mask]; the validation
# split is evaluated after every epoch.
bert_model1.fit(
    x=[train_padded_es_indv_bert, train_mask_es_indv],
    y=y_train_es_indv,
    validation_data=([valid_padded_es_indv_bert, valid_mask_es_indv], y_valid_es_indv),
    epochs=25,
    batch_size=128,
)

In [None]:
# Model outputs for the validation split; the next cell turns the logits into
# per-author decisions.
predictions1 = bert_model1.predict(
    [valid_padded_es_indv_bert, valid_mask_es_indv],
    batch_size=128,
)

In [None]:
# Ground truth per author: the first half of the authors is class 0
# ("not hate"), the second half class 1 ("hate").
number_authors = 40
n = number_authors // 2
author_profile = np.concatenate([np.zeros(n), np.ones(n)])

# Sigmoid over the raw logits, vectorised with NumPy. The previous per-element
# math.exp loop needed a `math` import the notebook never had, raised
# OverflowError on large logits, and discarded the array it built at the end.
logits = np.asarray(predictions1.logits, dtype=np.float64).ravel()
p = 1.0 / (1.0 + np.exp(-logits))

# Check author profiling -> e.g. 8,000 predictions
# Split into 40 authors -> 200 tweets per author
# Averaging chunk by chunk also works when the split is not perfectly even.
author_scores = np.array([chunk.mean() for chunk in np.array_split(p, number_authors)])
author_predictions = (author_scores >= 0.5).astype(int)

print(classification_report(author_profile, author_predictions, labels=[0, 1], target_names=['not hate','hate']))

In [None]:
#  ES individual tweets (learning rate 6e-5)

# NOTE(review): 'bert-base-uncased' is the English checkpoint; confirm it matches
# the tokenizer that produced the preprocessed Spanish inputs.
bert_model2 = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# With num_labels=1 the head emits one raw logit per sample (the evaluation
# cell applies the sigmoid itself). The string loss 'binary_crossentropy'
# expects probabilities, so the loss must be built with from_logits=True, and
# accuracy must threshold at logit 0 (= probability 0.5). The metric keeps the
# name 'accuracy' so logged keys ('accuracy', 'val_accuracy') do not change.
bert_model2.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                   optimizer=Adam(learning_rate=6e-5, epsilon=1e-08),
                   metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])
bert_model2.summary()

In [None]:
# Fine-tune bert_model2: inputs are [token ids, attention mask]; the validation
# split is evaluated after every epoch.
bert_model2.fit(
    x=[train_padded_es_indv_bert, train_mask_es_indv],
    y=y_train_es_indv,
    validation_data=([valid_padded_es_indv_bert, valid_mask_es_indv], y_valid_es_indv),
    epochs=25,
    batch_size=128,
)

# Validation outputs, stored in the generic `predictions` name.
predictions = bert_model2.predict(
    [valid_padded_es_indv_bert, valid_mask_es_indv],
    batch_size=128,
)

In [None]:
predictions2 = bert_model2.predict([valid_padded_es_indv_bert, valid_mask_es_indv], batch_size=128)

# Sigmoid over the raw logits, vectorised with NumPy. The previous per-element
# math.exp loop needed a `math` import the notebook never had, raised
# OverflowError on large logits, and discarded the array it built at the end.
logits = np.asarray(predictions2.logits, dtype=np.float64).ravel()
p = 1.0 / (1.0 + np.exp(-logits))

# Check author profiling -> e.g. 8,000 predictions
# Split into 40 authors -> 200 tweets per author
# (`number_authors` and `author_profile` come from the first evaluation cell.)
# Averaging chunk by chunk also works when the split is not perfectly even.
author_scores = np.array([chunk.mean() for chunk in np.array_split(p, number_authors)])
author_predictions = (author_scores >= 0.5).astype(int)

print(classification_report(author_profile, author_predictions, labels=[0, 1], target_names=['not hate','hate']))

In [None]:
#  ES individual tweets (learning rate 1e-4)

# NOTE(review): 'bert-base-uncased' is the English checkpoint; confirm it matches
# the tokenizer that produced the preprocessed Spanish inputs.
bert_model3 = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# With num_labels=1 the head emits one raw logit per sample (the evaluation
# cell applies the sigmoid itself). The string loss 'binary_crossentropy'
# expects probabilities, so the loss must be built with from_logits=True, and
# accuracy must threshold at logit 0 (= probability 0.5). The metric keeps the
# name 'accuracy' so logged keys ('accuracy', 'val_accuracy') do not change.
bert_model3.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                   optimizer=Adam(learning_rate=1e-4, epsilon=1e-08),
                   metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])
bert_model3.summary()

In [None]:
# Fine-tune bert_model3: inputs are [token ids, attention mask]; the validation
# split is evaluated after every epoch.
bert_model3.fit(
    x=[train_padded_es_indv_bert, train_mask_es_indv],
    y=y_train_es_indv,
    validation_data=([valid_padded_es_indv_bert, valid_mask_es_indv], y_valid_es_indv),
    epochs=25,
    batch_size=128,
)

# Validation outputs, stored in the generic `predictions` name.
predictions = bert_model3.predict(
    [valid_padded_es_indv_bert, valid_mask_es_indv],
    batch_size=128,
)

In [None]:
predictions3 = bert_model3.predict([valid_padded_es_indv_bert, valid_mask_es_indv], batch_size=128)

# Sigmoid over the raw logits, vectorised with NumPy. The previous per-element
# math.exp loop needed a `math` import the notebook never had, raised
# OverflowError on large logits, and discarded the array it built at the end.
logits = np.asarray(predictions3.logits, dtype=np.float64).ravel()
p = 1.0 / (1.0 + np.exp(-logits))

# Check author profiling -> e.g. 8,000 predictions
# Split into 40 authors -> 200 tweets per author
# (`number_authors` and `author_profile` come from the first evaluation cell.)
# Averaging chunk by chunk also works when the split is not perfectly even.
author_scores = np.array([chunk.mean() for chunk in np.array_split(p, number_authors)])
author_predictions = (author_scores >= 0.5).astype(int)

print(classification_report(author_profile, author_predictions, labels=[0, 1], target_names=['not hate','hate']))

In [None]:
# ES joined 20 tweets (learning rate 2e-5)

# NOTE(review): 'bert-base-uncased' is the English checkpoint; confirm it matches
# the tokenizer that produced the preprocessed Spanish inputs.
bert_model4 = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# With num_labels=1 the head emits one raw logit per sample (the evaluation
# cell applies the sigmoid itself). The string loss 'binary_crossentropy'
# expects probabilities, so the loss must be built with from_logits=True, and
# accuracy must threshold at logit 0 (= probability 0.5). The metric keeps the
# name 'accuracy' so logged keys ('accuracy', 'val_accuracy') do not change.
bert_model4.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                   optimizer=Adam(learning_rate=2e-5, epsilon=1e-08),
                   metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])
bert_model4.summary()

In [None]:
# Fine-tune bert_model4 on the joined-tweet inputs: [token ids, attention mask];
# the validation split is evaluated after every epoch.
bert_model4.fit(
    x=[train_padded_es_20_bert, train_mask_es_20],
    y=y_train_es_20,
    validation_data=([valid_padded_es_20_bert, valid_mask_es_20], y_valid_es_20),
    epochs=5,
    batch_size=16,
)

In [None]:
# NOTE(review): unlike the other evaluation cells this one predicts on the TEST
# split, yet scores the result against `author_profile` (half 0s / half 1s).
# Confirm the test split has that author ordering, or evaluate on the
# validation split instead.
predictions4 = bert_model4.predict([test_padded_es_20_bert, test_mask_es_20], batch_size=16)

# Sigmoid over the raw logits, vectorised with NumPy. The previous per-element
# math.exp loop needed a `math` import the notebook never had, raised
# OverflowError on large logits, and discarded the array it built at the end.
logits = np.asarray(predictions4.logits, dtype=np.float64).ravel()
p = 1.0 / (1.0 + np.exp(-logits))

# Split the joined-tweet predictions into one contiguous chunk per author and
# average each chunk (`number_authors` and `author_profile` come from the first
# evaluation cell). Averaging chunk by chunk also works on an uneven split.
author_scores = np.array([chunk.mean() for chunk in np.array_split(p, number_authors)])
author_predictions = (author_scores >= 0.5).astype(int)

print(classification_report(author_profile, author_predictions, labels=[0, 1], target_names=['not hate','hate']))