# Import Libs

In [1]:
import pandas as pd
import numpy as np
from statistics import mean
from tqdm import tqdm
import matplotlib.pyplot as plt

from nltk import word_tokenize
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer


from sklearn.model_selection import StratifiedShuffleSplit

import tensorflow as tf
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, GlobalMaxPooling1D, Activation
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasClassifier

from statistics import mean
from sklearn.metrics import classification_report, f1_score, accuracy_score

# Load dataset

In [2]:
# Name of the dataframe column selected as model input (see the commented-out
# `X = df[pre_processed_text]` in the split cell).
# Alternative column names kept for experiments
# (presumably precomputed embedding columns -- TODO confirm they exist in the CSV):
# pre_processed_text = 'gemini_embedding'
# pre_processed_text = 'text_embed'
pre_processed_text = 'pre_processed_text'

In [3]:
# Load the pre-processed hate-speech dataset (path is relative to the working directory).
# NOTE(review): the frame shows an 'Unnamed: 0' column, i.e. the index saved in the CSV
# is read back as data; pass index_col=0 here if that column is not needed.
df = pd.read_csv('./dataset/hsd_pre_processed.csv')

In [4]:
# Preview the loaded frame: as the last expression of the cell it is rendered as output.
df

Unnamed: 0,text,hatespeech_comb,hatespeech_G1,annotator_G1,hatespeech_G2,annotator_G2,hatespeech_G3,annotator_G3,pre_processed_text
0,@__andrea__b \nO cara vive em outro mundo\nNão...,1,1,A,1.0,V,0,E,cara vive outro mundo mundo real refugiados vi...
1,@_carmeloneto Estes incompetentes não cuidam n...,0,1,D,0.0,V,0,C,incompetentes cuidam povo brasileiro poucos re...
2,@_carmeloneto \nOs 'cumpanhero' quebraram toda...,0,1,A,0.0,B,0,E,cumpanhero quebraram toda regras
3,@_GlitteryKisses é isso não conseguem pensar n...,0,0,C,0.0,V,0,D,conseguem pensar sentido lato além vê frente o...
4,@_iglira bom dia macaco branco haha,1,0,A,1.0,I,1,E,bom dia macaco branco haha
...,...,...,...,...,...,...,...,...,...
5665,@zecarlosantos2 é o unico que nao se corrompe....,0,1,C,0.0,B,0,A,unico nao corrompenao vende chega aroporto apl...
5666,"@zqkitowz sei das cotas, mas não sabia disso, ...",1,1,D,1.0,It,0,A,sei cotas sabia disso putaria porra
5667,"@zqkitowz sim, a maioria do eleitorado é mulhe...",0,0,C,0.0,V,0,C,sim maioria eleitorado mulher
5668,"@zurcju seguir no tt é facíl, apresentar as am...",1,1,C,1.0,S,0,A,seguir tt facíl apresentar amigas sapatão ngm ...


# Word Embedding

## GloVe

In [5]:
GLOVE_MODEL_FILE = './dataset/glove.twitter.27B/glove.twitter.27B.100d.txt'
max_len = 128        # number of token positions per text (CNN input length)
embedding_dim = 100  # width of each word vector in GLOVE_MODEL_FILE

# Tokenize: build the vocabulary and turn every text into a list of word indices
token = Tokenizer()
token.fit_on_texts(df['pre_processed_text'])
seq = token.texts_to_sequences(df['pre_processed_text'])

# Padding: bring every sequence to max_len positions.
# (The sequence length is max_len, not embedding_dim -- the latter is the
# size of one word vector and is unrelated to how many tokens a text has.)
pad_seq = pad_sequences(seq, maxlen=max_len)

# Vocab size (+1 because Tokenizer indices start at 1; index 0 is the padding value)
vocab_size = len(token.word_index) + 1

# Load embedding vectors: each line of the file is "<word> <v1> <v2> ... <vN>"
embedding_vector = {}
# Explicit UTF-8 so the load does not depend on the platform's default encoding;
# the context manager guarantees the file handle is closed afterwards.
with open(GLOVE_MODEL_FILE, encoding='utf-8') as f:
    for line in tqdm(f):
        value = line.rstrip().split(' ')
        word = value[0]
        coef = np.array(value[1:], dtype='float32')
        embedding_vector[word] = coef

1193514it [00:11, 103440.67it/s]


In [6]:
# Keep an out-of-vocabulary dict: word -> random vector assigned to it below
oov_dict = {}

# Seeded generator so the random OOV vectors are the same on every run
OOV_SEED = 42
oov_rng = np.random.default_rng(OOV_SEED)

# Generate embedding matrix.
# Row 0 is never written and stays all-zero: Tokenizer indices start at 1,
# and 0 is the value pad_sequences uses for padding.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tqdm(token.word_index.items()):
    embedding_value = embedding_vector.get(word)
    if embedding_value is None:
        # Word missing from the pretrained vectors: draw a random vector for it
        embedding_value = oov_rng.uniform(-1., 1., (embedding_dim,))
        oov_dict[word] = embedding_value
    embedding_matrix[i] = embedding_value


# Transform text into embedded vectors: (n_texts, max_len, embedding_dim).
# Each row of indices is looked up in one NumPy gather; padding (index 0)
# maps to the all-zero row 0, which matches leaving those positions at zero.
embedded_sequences = np.zeros((len(pad_seq), max_len, embedding_dim))
for i, row in enumerate(pad_seq):
    embedded_sequences[i, :len(row)] = embedding_matrix[row]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:00<00:00, 638534.22it/s]


## Bag of Words

In [7]:
def bag_of_words(X_train, X_test, n_grams):
    """Encode two text collections as dense n-gram count matrices.

    A `CountVectorizer` covering 1-grams up to `n_grams`-grams is fitted on
    `X_train` only and the same fitted vocabulary is applied to `X_test`.

    Args:
        X_train: texts used to fit the vectorizer and to be transformed.
        X_test: texts transformed with the already-fitted vectorizer.
        n_grams: upper bound of the n-gram range (lower bound is 1).

    Returns:
        Tuple `(train_counts, test_counts)` of dense arrays (`.toarray()`).
    """
    counter = CountVectorizer(ngram_range=(1, n_grams))
    train_counts = counter.fit_transform(X_train).toarray()
    test_counts = counter.transform(X_test).toarray()
    return train_counts, test_counts

## CHI-2

In [8]:
# Text column and combined hate-speech label.
# NOTE(review): both names are reassigned in the train/test split cell, and
# `feature_selection_chi2` below is not called in the visible part of the notebook.
X = df['pre_processed_text']
y = df['hatespeech_comb']


def feature_selection_chi2(X, y, k=241):
    """Select the `k` best feature columns of `X` by chi-squared score.

    `X` is first rescaled with `MinMaxScaler` and then scored against `y`
    with `SelectKBest(chi2)`.

    Args:
        X: feature matrix accepted by `MinMaxScaler.fit_transform`.
        y: target labels, one per row of `X`.
        k: number of features to keep (default 241, the previously
            hard-coded value). Capped at the number of columns in `X`.

    Returns:
        1-D integer array with the column indices of the selected features.
    """
    normalizer = MinMaxScaler()
    X_norm = normalizer.fit_transform(X)

    # Never request more features than the matrix actually has
    n_features = X_norm.shape[1]
    chi_selector = SelectKBest(chi2, k=min(k, n_features))
    chi_selector.fit(X_norm, y)

    # Boolean support mask -> indices of the selected columns
    chi_support = chi_selector.get_support()
    selected_features = np.where(chi_support)[0]
    return selected_features


# Split into training and test sets

In [9]:
# Alternative input (raw text column): X = df[pre_processed_text]
X = embedded_sequences
y = df['hatespeech_comb']

RANDOM_STATE = 42

# Hold out: a single stratified 80/20 train/test split
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8, random_state=RANDOM_STATE)
train_index, test_index = next(sss.split(X, y))

X_train, X_test = X[train_index], X[test_index]
# split() returns row positions, so select by position (.iloc) rather than
# by index label; this stays correct even if df does not have a 0..n-1 index.
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Training

## CNN

In [10]:
# --- Training hyperparameters ---
epochs = 10            # passed to KerasClassifier as the maximum number of epochs
batch_size = 128       # passed to KerasClassifier
learning_rate = 0.001  # passed to the Adam optimizer inside cnn()

# --- Network hyperparameters (read by cnn()) ---
activation_func = 'relu'  # activation of the Conv1D and hidden Dense layers
filters = 100             # number of Conv1D filters
kernel_size = 1           # Conv1D window width, in token positions

def cnn():
    """Build and compile the 1D-convolutional binary classifier.

    Takes no arguments so it can be passed directly as `model=cnn` to
    `KerasClassifier`. All hyperparameters are read at call time from the
    notebook-level variables `max_len`, `embedding_dim`, `filters`,
    `kernel_size`, `activation_func` and `learning_rate`.

    Returns:
        The compiled `Sequential` model (binary cross-entropy loss, Adam
        optimizer, `binary_accuracy` metric). The model summary is printed
        as a side effect.
    """
    model = Sequential()

    # Input Layer: one (max_len, embedding_dim) matrix of word vectors per text
    model.add(InputLayer(shape=(max_len, embedding_dim)))

    # Convolutional and Pooling Layers
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation=activation_func))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))

    # Fully Connected Layers.
    # GlobalMaxPooling1D already yields a flat (batch, filters) tensor, so the
    # Flatten layer that used to sit here was a no-op and has been dropped.
    model.add(Dense(units=1000, activation=activation_func))
    model.add(Dropout(0.2))
    model.add(Dense(units=1, activation='sigmoid'))

    # Compile the Model
    adam = Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['binary_accuracy'])
    model.summary()
    return model

In [11]:
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, train_size=0.8, random_state=RANDOM_STATE)

results = []   # classification report (text) of every fold
f1 = []        # F1 score of every fold
accuracy = []  # accuracy of every fold

n_gram = 2

print("# Training")

# The folds are drawn from the hold-out *training* split only.
# sss.split(X_train, y_train) yields row positions inside X_train / y_train,
# so they must be applied to those objects. Applying them to the full X / y
# would pick unrelated rows and leak test samples into cross-validation.
for i, (train_index_cv, val_index) in enumerate(sss.split(X_train, y_train)):
    print(f"Folder :{i}")
    X_train_cv, X_val = X_train[train_index_cv], X_train[val_index]
    # y_train keeps the original dataframe labels as its index -> select by position
    y_train_cv, y_val = y_train.iloc[train_index_cv], y_train.iloc[val_index]

    # Define EarlyStopping callback.
    # NOTE(review): fit() below receives no validation data, so this monitors
    # the training loss; consider monitoring a validation loss instead.
    early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

    # Model: a fresh, untrained wrapper for every fold
    model = KerasClassifier(model=cnn,
                        epochs=epochs,
                        batch_size=batch_size,
                        callbacks=[early_stopping])

    # Fit
    model.fit(X_train_cv, y_train_cv)
    pred = model.predict(X_val)

    result = classification_report(y_val, pred)
    results.append(result)

    # Named fold_f1 (not `f`) so the file-handle name from the GloVe cell is not clobbered
    fold_f1 = f1_score(y_val, pred)
    f1.append(fold_f1)
    print(f"# F1: {fold_f1}")

    acc = accuracy_score(y_val, pred)
    accuracy.append(acc)
    print(f"# Accuracy: {acc}")
    print("===============")

print("# Mean Accuracy: ", mean(accuracy))
print("# Mean F1: ", mean(f1))

# Training
Folder :0


Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6189 - loss: 0.6852
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6782 - loss: 0.6248
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6877 - loss: 0.6086
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6926 - loss: 0.5959
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7344 - loss: 0.5618
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7286 - loss: 0.5501
Epoch 7/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7303 - loss: 0.5519
Epoch 8/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7266 - loss: 0.5442


Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - binary_accuracy: 0.6522 - loss: 0.6684
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6789 - loss: 0.6175
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6862 - loss: 0.6072
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7120 - loss: 0.5979
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7154 - loss: 0.5714
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7243 - loss: 0.5523
Epoch 7/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7389 - loss: 0.5514
Epoch 8/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7239 - loss: 0.5444


Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6545 - loss: 0.6526
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6929 - loss: 0.6206
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.7234 - loss: 0.5809
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.7135 - loss: 0.5726
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7406 - loss: 0.5460
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7347 - loss: 0.5413
Epoch 7/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7430 - loss: 0.5352
Epoch 8/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7707 - loss: 0.4923


Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6417 - loss: 0.6735
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6772 - loss: 0.6187
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6940 - loss: 0.6036
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.7170 - loss: 0.5761
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7101 - loss: 0.5806
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7253 - loss: 0.5474
Epoch 7/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7354 - loss: 0.5410
Epoch 8/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7164 - loss: 0.5535


Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6475 - loss: 0.6726
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6690 - loss: 0.6278
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.7119 - loss: 0.5920
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.7276 - loss: 0.5636
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7202 - loss: 0.5759
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7353 - loss: 0.5494
Epoch 7/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7414 - loss: 0.5364
Epoch 8/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.7420 - loss: 0.5381


# Evaluation

In [12]:
# Early stopping on the training loss ('loss') with patience 3 and
# restore_best_weights=True -- same settings as in the cross-validation cell.
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

# Train one model on the whole training split (fit() returns the fitted wrapper itself)
model = KerasClassifier(
    model=cnn,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping],
).fit(X_train, y_train)

# Score it once on the held-out test split
pred = model.predict(X_test)
accuracy = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred)
result = classification_report(y_test, pred)

print(result)

Epoch 1/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - binary_accuracy: 0.6378 - loss: 0.6755
Epoch 2/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.6845 - loss: 0.6226
Epoch 3/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - binary_accuracy: 0.7060 - loss: 0.5923
Epoch 4/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7182 - loss: 0.5800
Epoch 5/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7258 - loss: 0.5582
Epoch 6/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7421 - loss: 0.5370
Epoch 7/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7381 - loss: 0.5418
Epoch 8/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - binary_accuracy: 0.7458 - loss: 0.5341
