# Import libs

In [2]:
import pandas as pd
import numpy as np
from statistics import mean
from tqdm import tqdm
import matplotlib.pyplot as plt

from nltk import word_tokenize
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

import tensorflow as tf
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, GlobalMaxPooling1D, Activation, LSTM
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasClassifier

from statistics import mean
from sklearn.metrics import classification_report, f1_score, accuracy_score

# Load dataset

In [3]:
# pre_processed_text = 'gemini_embedding'
# pre_processed_text = 'text_embed'
pre_processed_text = 'pre_processed_text'

In [4]:
df = pd.read_csv('./dataset/hsd_pre_processed.csv')

In [5]:
df

Unnamed: 0,text,hatespeech_comb,hatespeech_G1,annotator_G1,hatespeech_G2,annotator_G2,hatespeech_G3,annotator_G3,pre_processed_text
0,@__andrea__b \nO cara vive em outro mundo\nNão...,1,1,A,1.0,V,0,E,cara vive outro mundo mundo real refugiados vi...
1,@_carmeloneto Estes incompetentes não cuidam n...,0,1,D,0.0,V,0,C,incompetentes cuidam povo brasileiro poucos re...
2,@_carmeloneto \nOs 'cumpanhero' quebraram toda...,0,1,A,0.0,B,0,E,cumpanhero quebraram toda regras
3,@_GlitteryKisses é isso não conseguem pensar n...,0,0,C,0.0,V,0,D,conseguem pensar sentido lato além vê frente o...
4,@_iglira bom dia macaco branco haha,1,0,A,1.0,I,1,E,bom dia macaco branco haha
...,...,...,...,...,...,...,...,...,...
5665,@zecarlosantos2 é o unico que nao se corrompe....,0,1,C,0.0,B,0,A,unico nao corrompenao vende chega aroporto apl...
5666,"@zqkitowz sei das cotas, mas não sabia disso, ...",1,1,D,1.0,It,0,A,sei cotas sabia disso putaria porra
5667,"@zqkitowz sim, a maioria do eleitorado é mulhe...",0,0,C,0.0,V,0,C,sim maioria eleitorado mulher
5668,"@zurcju seguir no tt é facíl, apresentar as am...",1,1,C,1.0,S,0,A,seguir tt facíl apresentar amigas sapatão ngm ...


# Word Embedding

## GloVe

In [6]:
GLOVE_MODEL_FILE = './dataset/glove.twitter.27B/glove.twitter.27B.100d.txt'
max_len = 128
embedding_dim = 100

# Tokenize
token = Tokenizer()
token.fit_on_texts(df['pre_processed_text'])
seq = token.texts_to_sequences(df['pre_processed_text'])

# Padding
pad_seq = pad_sequences(seq,maxlen=embedding_dim)

# Vocab size
vocab_size = len(token.word_index)+1

# Load embedding vector
embedding_vector = {}
f = open(GLOVE_MODEL_FILE)
for line in tqdm(f):
    value = line.split(' ')
    word = value[0]
    coef = np.array(value[1:],dtype = 'float32')
    embedding_vector[word] = coef

1193514it [00:11, 102395.03it/s]


In [7]:
# Keep a out of vocabullary dict
oov_dict = {}

# Generate embedding matrix
embedding_matrix = np.zeros((vocab_size,embedding_dim))
for word,i in tqdm(token.word_index.items()):
    embedding_value = embedding_vector.get(word)
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value
    else:
        oov_dict[word] = np.random.uniform(-1., 1., (embedding_dim,)) # Generate new random vector
        embedding_matrix[i] = oov_dict[word]


# Transform text into embed vector
embedded_sequences = np.zeros((len(pad_seq), max_len, embedding_dim))
for i, seq in enumerate(pad_seq):
    for j, idx in enumerate(seq):
        if idx > 0:  # Skip padding index
            embedded_sequences[i, j] = embedding_matrix[idx]

100%|█████████████████████████████████| 15340/15340 [00:00<00:00, 594277.33it/s]


# Split into training and tests sets

In [73]:
# X = df[pre_processed_text]
X = embedded_sequences
y = df['hatespeech_comb']


RANDOM_STATE = 42

# Hold out
sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, train_size = 0.8, random_state = RANDOM_STATE)
for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Training

## MLP

In [80]:
activation_func = 'relu'
epochs = 10
batch_size = 128
learning_rate = 0.001

def mlp(learning_rate=0.001, activation_func='relu', clipvalue=0.5):
    model = Sequential()
    model.add(InputLayer(shape=(max_len*embedding_dim,)))
    model.add(Dense(units = 1000, activation = activation_func, kernel_initializer = 'random_uniform'))
    model.add(Dropout(0.2))
    model.add(Dense(units = 500, activation = activation_func, kernel_initializer = 'random_uniform'))
    model.add(Dropout(0.2))
    model.add(Dense(units = 1, activation = 'sigmoid'))
    
    adam = Adam(learning_rate = learning_rate, clipvalue = clipvalue)
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['binary_accuracy'])    
    return model


early_stopping_mlp = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

mlp_model = KerasClassifier(model = mlp,
                        epochs = epochs,
                        batch_size = batch_size,
                        callbacks=[early_stopping_mlp])

## CNN

In [81]:
activation_func = 'relu'
epochs = 10
batch_size = 128
learning_rate = 0.001

filters = 100 
kernel_size = 1

def cnn():
    model = Sequential()
    
    # Input Layer (adjust input shape based on your data)
    model.add(InputLayer(shape=(max_len,embedding_dim)))
    
    # Convolutional and Pooling Layers
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation=activation_func))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    
    # Fully Connected Layers
    model.add(Flatten())
    model.add(Dense(units=1000, activation=activation_func))
    model.add(Dropout(0.2))
    model.add(Dense(units=1, activation='sigmoid'))
    
    # Compile the Model
    adam = Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['binary_accuracy'])
    model.summary()
    return model


# Define EarlyStopping callback
early_stopping_cnn = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

# Model
cnn_model = KerasClassifier(model=cnn,
                    epochs=epochs,
                    batch_size=batch_size,
                    callbacks=[early_stopping_cnn])

## LSTM

In [82]:
epochs = 10
batch_size = 128
learning_rate=0.001

def lstm_model():
    model = Sequential()
    model.add(InputLayer(shape=(max_len,embedding_dim)))
    model.add(Dropout(0.2))
    model.add(LSTM(200, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(1000,activation = 'relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1,activation = 'sigmoid'))
    adam = Adam(learning_rate=learning_rate, clipvalue=0.5)
    model.compile(optimizer=adam,loss='binary_crossentropy',metrics = ['binary_accuracy'])
    model.summary()
    return model


# Define EarlyStopping callback
early_stopping_lstm = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

# Model
lstm_model = KerasClassifier(model=lstm_model,
                    epochs=epochs,
                    batch_size=batch_size,
                    callbacks=[early_stopping_lstm])

# Voting classifier

In [77]:
from sklearn.ensemble import VotingClassifier

model = VotingClassifier(estimators=[
        ('cnn', cnn_model), ('mlp', mlp_model), ('lstm', lstm_model)], voting='soft')

In [83]:
mlp_model.fit(np.array([matrix.ravel() for matrix in X_train]), y_train)
pred_mlp = mlp_model.predict(np.array([matrix.ravel() for matrix in X_test]))

cnn_model.fit(X_train, y_train)
pred_cnn = cnn_model.predict(X_test)


lstm_model.fit(X_train, y_train)
pred_lstm = lstm_model.predict(X_test)


pred = pred_mlp + pred_cnn + pred_lstm

Epoch 1/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - binary_accuracy: 0.6446 - loss: 0.6774
Epoch 2/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - binary_accuracy: 0.7797 - loss: 0.4496
Epoch 3/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - binary_accuracy: 0.9124 - loss: 0.2448
Epoch 4/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - binary_accuracy: 0.9661 - loss: 0.1111
Epoch 5/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - binary_accuracy: 0.9879 - loss: 0.0481
Epoch 6/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - binary_accuracy: 0.9907 - loss: 0.0361
Epoch 7/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - binary_accuracy: 0.9970 - loss: 0.0195
Epoch 8/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - binary_accuracy: 0.9970 - loss:

Epoch 1/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - binary_accuracy: 0.6518 - loss: 0.6541
Epoch 2/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6891 - loss: 0.6064
Epoch 3/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7186 - loss: 0.5757
Epoch 4/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7089 - loss: 0.5817
Epoch 5/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - binary_accuracy: 0.7106 - loss: 0.5699
Epoch 6/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7398 - loss: 0.5495
Epoch 7/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7448 - loss: 0.5353
Epoch 8/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7490 - loss: 0.5183


Epoch 1/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 193ms/step - binary_accuracy: 0.6646 - loss: 0.6611
Epoch 2/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 188ms/step - binary_accuracy: 0.7142 - loss: 0.5831
Epoch 3/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 186ms/step - binary_accuracy: 0.7426 - loss: 0.5402
Epoch 4/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 182ms/step - binary_accuracy: 0.7323 - loss: 0.5268
Epoch 5/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 188ms/step - binary_accuracy: 0.7417 - loss: 0.5309
Epoch 6/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 187ms/step - binary_accuracy: 0.7587 - loss: 0.5011
Epoch 7/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 185ms/step - binary_accuracy: 0.7643 - loss: 0.5040
Epoch 8/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 193ms/step - binary_accuracy: 0.7712

In [94]:
p = [1 if i > 1 else 0 for i in pred]

In [95]:
result = classification_report(y_test, p)
f1 = f1_score(y_test, p)
accuracy = accuracy_score(y_test, p)

print(result)

              precision    recall  f1-score   support

           0       0.82      0.82      0.82       776
           1       0.61      0.62      0.62       358

    accuracy                           0.76      1134
   macro avg       0.72      0.72      0.72      1134
weighted avg       0.76      0.76      0.76      1134

