In [71]:
import math
import numpy as np
import os
import pickle

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation, Concatenate
from keras.layers import Embedding, LSTM, Bidirectional, Flatten
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.optimizers import SGD, Adam
from keras.callbacks import ModelCheckpoint, CSVLogger
from keras.models import load_model
from keras.utils import plot_model

import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import string
import re

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from transformers import TFBertForSequenceClassification

In [4]:
######### AUTHOR PROFILING FUNCTION FOR JOINING PREDICTIONS #########

# de 20 0s y 20 1s

def author_profiling_report(author_profile, number_authors=40):
    n = int(number_authors/2)
    a = np.zeros(n)
    b = np.ones(n)        
    author_profile = np.concatenate([a,b])
    
    # Check author profiling -> 8,000 predictions
    # Split into 40 authors -> 200 tweets per author
    author_predictions = np.average(np.array_split(predictions, number_authors), axis=1)
    author_predictions = np.array([1 if ap >= 0.5 else 0 for ap in author_predictions])

    print(classification_report(author_profile, author_predictions, labels=[0, 1], target_names=['not hate','hate']))

In [5]:
kaggle_path = '/kaggle/input/bert-preprocesed-author-profiling/'

In [6]:
pickle_file = open(kaggle_path+'es_indv_bert.pickle', 'rb')
es_indv_bert = pickle.load(pickle_file)

train_padded_es_indv_bert, train_mask_es_indv, y_train_es_indv = es_indv_bert[0], es_indv_bert[1], es_indv_bert[2]
valid_padded_es_indv_bert, valid_mask_es_indv, y_valid_es_indv = es_indv_bert[3], es_indv_bert[4], es_indv_bert[5]
test_padded_es_indv_bert, test_mask_es_indv = es_indv_bert[6], es_indv_bert[7]

In [83]:
pickle_file = open(kaggle_path+'es_20_bert.pickle', 'rb')
es_20_bert = pickle.load(pickle_file)

train_padded_es_20_bert, train_mask_es_20, y_train_es_20 = es_20_bert[0], es_20_bert[1], es_20_bert[2]
valid_padded_es_20_bert, valid_mask_es_20, y_valid_es_20 = es_20_bert[3], es_20_bert[4], es_20_bert[5]
test_padded_es_20_bert, test_mask_es_20 = es_20_bert[6], es_20_bert[7]

In [17]:
#  ES individual tweets

bert_model1 = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

bert_model1.compile(loss='binary_crossentropy',
                   optimizer=Adam(learning_rate=2e-5,epsilon=1e-08),
                   metrics=['accuracy'])
bert_model1.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_189 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  769       
Total params: 109,483,009
Trainable params: 109,483,009
Non-trainable params: 0
_________________________________________________________________


In [18]:
bert_model1.fit(x=[train_padded_es_indv_bert, train_mask_es_indv],
               y=y_train_es_indv,
               batch_size=128,
               epochs=25,
               validation_data=([valid_padded_es_indv_bert, valid_mask_es_indv], y_valid_es_indv))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [72]:
predictions1 = bert_model1.predict([valid_padded_es_indv_bert, valid_mask_es_indv], batch_size=128)
#author_profiling_report(predictions)

In [74]:
number_authors = 40
n = int(number_authors/2)
a = np.zeros(n)
b = np.ones(n)        
author_profile = np.concatenate([a,b])

p = []
for logit in predictions1.logits:
    p.append(math.exp(logit)/(1+math.exp(logit)))
np.array(p)

# Check author profiling -> 8,000 predictions
# Split into 40 authors -> 200 tweets per author
author_predictions = np.average(np.array_split(p, number_authors), axis=1)
author_predictions = np.array([1 if ap >= 0.5 else 0 for ap in author_predictions])

print(classification_report(author_profile, author_predictions, labels=[0, 1], target_names=['not hate','hate']))

              precision    recall  f1-score   support

    not hate       1.00      0.10      0.18        20
        hate       0.53      1.00      0.69        20

    accuracy                           0.55        40
   macro avg       0.76      0.55      0.44        40
weighted avg       0.76      0.55      0.44        40



In [19]:
#  ES individual tweets

bert_model2 = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

bert_model2.compile(loss='binary_crossentropy',
                   optimizer=Adam(learning_rate=6e-5,epsilon=1e-08),
                   metrics=['accuracy'])
bert_model2.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_227 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  769       
Total params: 109,483,009
Trainable params: 109,483,009
Non-trainable params: 0
_________________________________________________________________


In [20]:
bert_model2.fit(x=[train_padded_es_indv_bert, train_mask_es_indv],
               y=y_train_es_indv,
               batch_size=128,
               epochs=25,
               validation_data=([valid_padded_es_indv_bert, valid_mask_es_indv], y_valid_es_indv))

predictions = bert_model2.predict([valid_padded_es_indv_bert, valid_mask_es_indv], batch_size=128)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [77]:
predictions2 = bert_model2.predict([valid_padded_es_indv_bert, valid_mask_es_indv], batch_size=128)
p = []
for logit in predictions2.logits:
    p.append(math.exp(logit)/(1+math.exp(logit)))
np.array(p)

# Check author profiling -> 8,000 predictions
# Split into 40 authors -> 200 tweets per author
author_predictions = np.average(np.array_split(p, number_authors), axis=1)
author_predictions = np.array([1 if ap >= 0.5 else 0 for ap in author_predictions])

print(classification_report(author_profile, author_predictions, labels=[0, 1], target_names=['not hate','hate']))

              precision    recall  f1-score   support

    not hate       0.00      0.00      0.00        20
        hate       0.50      1.00      0.67        20

    accuracy                           0.50        40
   macro avg       0.25      0.50      0.33        40
weighted avg       0.25      0.50      0.33        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
#  ES individual tweets

bert_model3 = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

bert_model3.compile(loss='binary_crossentropy',
                   optimizer=Adam(learning_rate=1e-4,epsilon=1e-08),
                   metrics=['accuracy'])
bert_model3.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_265 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  769       
Total params: 109,483,009
Trainable params: 109,483,009
Non-trainable params: 0
_________________________________________________________________


In [22]:
bert_model3.fit(x=[train_padded_es_indv_bert, train_mask_es_indv],
               y=y_train_es_indv,
               batch_size=128,
               epochs=25,
               validation_data=([valid_padded_es_indv_bert, valid_mask_es_indv], y_valid_es_indv))

predictions = bert_model3.predict([valid_padded_es_indv_bert, valid_mask_es_indv], batch_size=128)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [79]:
predictions3 = bert_model3.predict([valid_padded_es_indv_bert, valid_mask_es_indv], batch_size=128)
p = []
for logit in predictions3.logits:
    p.append(math.exp(logit)/(1+math.exp(logit)))
np.array(p)

# Check author profiling -> 8,000 predictions
# Split into 40 authors -> 200 tweets per author
author_predictions = np.average(np.array_split(p, number_authors), axis=1)
author_predictions = np.array([1 if ap >= 0.5 else 0 for ap in author_predictions])

print(classification_report(author_profile, author_predictions, labels=[0, 1], target_names=['not hate','hate']))

              precision    recall  f1-score   support

    not hate       0.00      0.00      0.00        20
        hate       0.50      1.00      0.67        20

    accuracy                           0.50        40
   macro avg       0.25      0.50      0.33        40
weighted avg       0.25      0.50      0.33        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [80]:
# ES joined 20 tweets

bert_model4 = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

bert_model4.compile(loss='binary_crossentropy',
                   optimizer=Adam(learning_rate=2e-5,epsilon=1e-08),
                   metrics=['accuracy'])
bert_model4.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_303 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  769       
Total params: 109,483,009
Trainable params: 109,483,009
Non-trainable params: 0
_________________________________________________________________


In [88]:
bert_model4.fit(x=[train_padded_es_20_bert, train_mask_es_20],
               y=y_train_es_20,
               batch_size=16,
               epochs=5,
               validation_data=([valid_padded_es_20_bert, valid_mask_es_20], y_valid_es_20))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


AxisError: axis1: axis 0 is out of bounds for array of dimension 0

In [90]:
predictions4= bert_model4.predict([test_padded_es_20_bert, test_mask_es_20], batch_size=16)
p = []
for logit in predictions4.logits:
    p.append(math.exp(logit)/(1+math.exp(logit)))
np.array(p)

# Check author profiling -> 8,000 predictions
# Split into 40 authors -> 200 tweets per author
author_predictions = np.average(np.array_split(p, number_authors), axis=1)
author_predictions = np.array([1 if ap >= 0.5 else 0 for ap in author_predictions])

print(classification_report(author_profile, author_predictions, labels=[0, 1], target_names=['not hate','hate']))

              precision    recall  f1-score   support

    not hate       0.00      0.00      0.00        20
        hate       0.50      1.00      0.67        20

    accuracy                           0.50        40
   macro avg       0.25      0.50      0.33        40
weighted avg       0.25      0.50      0.33        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
