# Import Libs

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from statistics import mean

from nltk import word_tokenize
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score, confusion_matrix


import tensorflow as tf
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import InputLayer, Dense, Dropout, Conv1D, GlobalMaxPooling1D, MaxPooling2D, Flatten, Embedding, Activation, LSTM
from keras.optimizers import Adam
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

# Load dataset

In [2]:
# Name of the DataFrame column holding the text representation to experiment with.
# In the cells shown here it is only referenced by the commented-out
# `X = df[pre_processed_text]` alternatives; the active pipeline feeds GloVe embeddings.
# NOTE(review): the frame displayed below has no 'gemini_embedding' column -- confirm
# the column exists before switching X to df[pre_processed_text].
pre_processed_text = 'gemini_embedding'
# pre_processed_text = 'text_embed'
# pre_processed_text = 'pre_processed_text'

In [3]:
# Load the pre-processed dataset; the path is relative to the working directory.
# NOTE(review): the rendered frame shows an 'Unnamed: 0' column, so the CSV appears to
# carry a saved index -- consider index_col=0 if that column is not needed.
df = pd.read_csv('./dataset/hsd_pre_processed.csv')

In [4]:
# Preview the loaded frame; the rendered output below shows its first and last rows.
df

Unnamed: 0,text,hatespeech_comb,hatespeech_G1,annotator_G1,hatespeech_G2,annotator_G2,hatespeech_G3,annotator_G3,pre_processed_text
0,@__andrea__b \nO cara vive em outro mundo\nNão...,1,1,A,1.0,V,0,E,cara vive outro mundo mundo real refugiados vi...
1,@_carmeloneto Estes incompetentes não cuidam n...,0,1,D,0.0,V,0,C,incompetentes cuidam povo brasileiro poucos re...
2,@_carmeloneto \nOs 'cumpanhero' quebraram toda...,0,1,A,0.0,B,0,E,cumpanhero quebraram toda regras
3,@_GlitteryKisses é isso não conseguem pensar n...,0,0,C,0.0,V,0,D,conseguem pensar sentido lato além vê frente o...
4,@_iglira bom dia macaco branco haha,1,0,A,1.0,I,1,E,bom dia macaco branco haha
...,...,...,...,...,...,...,...,...,...
5665,@zecarlosantos2 é o unico que nao se corrompe....,0,1,C,0.0,B,0,A,unico nao corrompenao vende chega aroporto apl...
5666,"@zqkitowz sei das cotas, mas não sabia disso, ...",1,1,D,1.0,It,0,A,sei cotas sabia disso putaria porra
5667,"@zqkitowz sim, a maioria do eleitorado é mulhe...",0,0,C,0.0,V,0,C,sim maioria eleitorado mulher
5668,"@zurcju seguir no tt é facíl, apresentar as am...",1,1,C,1.0,S,0,A,seguir tt facíl apresentar amigas sapatão ngm ...


# Word Embedding

## GloVe

In [5]:
GLOVE_MODEL_FILE = './dataset/glove.twitter.27B/glove.twitter.27B.100d.txt'
max_len = 128        # tokens (time steps) kept per text
embedding_dim = 100  # vector size; must match the GloVe file above

# Tokenize: learn the vocabulary and map every text to a list of word indices
token = Tokenizer()
token.fit_on_texts(df['pre_processed_text'])
seq = token.texts_to_sequences(df['pre_processed_text'])

# Padding: pad/truncate every sequence to max_len tokens.
# The sequence length is max_len, not embedding_dim: the model input is
# (max_len, embedding_dim), so padding to embedding_dim would leave the
# remaining time steps permanently empty.
pad_seq = pad_sequences(seq, maxlen=max_len)

# Vocab size (+1 because word indices start at 1; index 0 is the padding value)
vocab_size = len(token.word_index) + 1

# Load embedding vector: word -> float32 vector parsed from the GloVe text file.
# The context manager closes the file even if parsing fails, and the explicit
# encoding keeps the load independent of the platform default.
embedding_vector = {}
with open(GLOVE_MODEL_FILE, encoding='utf-8') as f:
    for line in tqdm(f):
        value = line.rstrip('\n').split(' ')
        word = value[0]
        coef = np.array(value[1:], dtype='float32')
        embedding_vector[word] = coef

1193514it [00:15, 77301.84it/s]


In [6]:
# Keep a out of vocabullary dict
oov_dict = {}

# Generate embedding matrix
embedding_matrix = np.zeros((vocab_size,embedding_dim))
for word,i in tqdm(token.word_index.items()):
    embedding_value = embedding_vector.get(word)
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value
    else:
        oov_dict[word] = np.random.uniform(-1., 1., (embedding_dim,)) # Generate new random vector
        embedding_matrix[i] = oov_dict[word]


# Transform text into embed vector
embedded_sequences = np.zeros((len(pad_seq), max_len, embedding_dim))
for i, seq in enumerate(pad_seq):
    for j, idx in enumerate(seq):
        if idx > 0:  # Skip padding index
            embedded_sequences[i, j] = embedding_matrix[idx]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:00<00:00, 207034.15it/s]


## Bag of Words

In [7]:
def bag_of_words(X_train, X_test, n_grams):
    """Encode two text collections as dense n-gram count matrices.

    The vocabulary is fitted on ``X_train`` only and then reused to encode
    ``X_test``, so both outputs share the same columns.

    Args:
        X_train: texts used to fit the vocabulary and to build the first matrix.
        X_test: texts encoded with the fitted vocabulary.
        n_grams: upper n-gram size; n-grams of length 1 up to ``n_grams`` are counted.

    Returns:
        Tuple ``(train_counts, test_counts)`` of dense count arrays.
    """
    counter = CountVectorizer(ngram_range=(1, n_grams))
    train_sparse = counter.fit_transform(X_train)
    test_sparse = counter.transform(X_test)
    return train_sparse.toarray(), test_sparse.toarray()

## CHI-2

In [8]:
# Text column and `hatespeech_comb` label column taken from the loaded frame.
# Both names are reassigned in the train/test split cell further down.
X = df['pre_processed_text']
y = df['hatespeech_comb']


def feature_selection_chi2(X, y, k=241):
    """Return the column indices of the ``k`` best features by chi-squared score.

    ``X`` is min-max scaled before scoring, so every feature is non-negative
    when it reaches ``chi2``.

    Args:
        X: numeric feature matrix, one row per sample and one column per feature.
        y: class label for each row of ``X``.
        k: number of features to keep, forwarded to ``SelectKBest``. Defaults
            to 241, the value that used to be hard-coded.

    Returns:
        1-D integer array with the positions of the selected columns in
        ascending order.
    """
    X_norm = MinMaxScaler().fit_transform(X)
    chi_selector = SelectKBest(chi2, k=k)
    chi_selector.fit(X_norm, y)

    # Boolean mask over the columns -> positions of the kept columns
    chi_support = chi_selector.get_support()
    return np.flatnonzero(chi_support)


# Split into training and test sets

In [9]:
# X = df[pre_processed_text]
X = embedded_sequences
y = df['hatespeech_comb']

RANDOM_STATE = 42

# Hold out: a single stratified 80/20 split of the full dataset
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8, random_state=RANDOM_STATE)
train_index, test_index = next(sss.split(X, y))

X_train, X_test = X[train_index], X[test_index]
# The splitter yields row positions, so select by position (.iloc); plain
# y[...] is a label lookup and only matches while the index is 0..n-1.
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Training

## LSTM

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import pad_sequences
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report, f1_score, accuracy_score
from tensorflow.keras.optimizers import RMSprop
from statistics import mean

In [11]:
# Training hyper-parameters read by lstm_model() and by the KerasClassifier
# wrappers in the cross-validation and evaluation cells.
epochs = 10  # passed as `epochs` to KerasClassifier
batch_size = 128  # passed as `batch_size` to KerasClassifier
learning_rate=0.001  # passed to the Adam optimizer in lstm_model()

def lstm_model():
    """Build and compile the LSTM binary classifier.

    Reads the notebook-level ``max_len``, ``embedding_dim`` and
    ``learning_rate``. The network takes already-embedded sequences of shape
    ``(max_len, embedding_dim)`` and ends in a single sigmoid unit. The model
    summary is printed as a side effect.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential([
        InputLayer(shape=(max_len, embedding_dim)),
        Dropout(0.2),
        LSTM(200, return_sequences=False),
        Dropout(0.5),
        Dense(1000, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ])

    # Adam with element-wise gradient clipping at 0.5
    optimizer = Adam(learning_rate=learning_rate, clipvalue=0.5)
    network.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['binary_accuracy'],
    )
    network.summary()
    return network

In [12]:
# 5 stratified 80/20 resamplings of the TRAINING split only; the held-out test
# split is not touched until the evaluation cell.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, train_size=0.8, random_state=RANDOM_STATE)

results = []   # classification_report text per fold
f1 = []        # F1 score per fold
accuracy = []  # accuracy per fold

# X = df[pre_processed_text]
# Kept as in the hold-out cell; the folds below index X_train / y_train, not X / y.
X = embedded_sequences
y = df['hatespeech_comb']

n_gram = 2

print("# Training")

for i, (train_index_cv, val_index) in enumerate(sss.split(X_train, y_train)):
    print(f"Folder :{i}")
    # The fold indices are positions inside X_train / y_train. Indexing the
    # full X / y with them would pick unrelated rows and leak test samples
    # into the folds. y_train keeps its original index labels, so it must be
    # selected by position with .iloc.
    X_train_cv, X_val = X_train[train_index_cv], X_train[val_index]
    y_train_cv, y_val = y_train.iloc[train_index_cv], y_train.iloc[val_index]

    # Model: a fresh, untrained network for every fold
    model = KerasClassifier(model=lstm_model,
                            epochs=epochs,
                            batch_size=batch_size)

    # Fit
    model.fit(X_train_cv, y_train_cv)
    pred = model.predict(X_val)

    result = classification_report(y_val, pred)
    results.append(result)

    # Named fold_f1 so it does not shadow `f` used for the GloVe file handle
    fold_f1 = f1_score(y_val, pred)
    f1.append(fold_f1)
    print(f"# F1: {fold_f1}")

    acc = accuracy_score(y_val, pred)
    accuracy.append(acc)
    print(f"# Accuracy: {acc}")
    print("===============")

print("# Mean Accuracy: ", mean(accuracy))
print("# Mean F1: ", mean(f1))

# Training
Folder :0


Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 302ms/step - binary_accuracy: 0.6553 - loss: 0.6505
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 290ms/step - binary_accuracy: 0.7352 - loss: 0.5511
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 291ms/step - binary_accuracy: 0.7313 - loss: 0.5473
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 288ms/step - binary_accuracy: 0.7306 - loss: 0.5382
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 295ms/step - binary_accuracy: 0.7582 - loss: 0.5001
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 319ms/step - binary_accuracy: 0.7540 - loss: 0.5054
Epoch 7/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 310ms/step - binary_accuracy: 0.7622 - loss: 0.4953
Epoch 8/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 317ms/step - binary_accuracy: 0.767

Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 361ms/step - binary_accuracy: 0.6630 - loss: 0.6573
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 335ms/step - binary_accuracy: 0.7059 - loss: 0.5762
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 298ms/step - binary_accuracy: 0.7076 - loss: 0.5726
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 367ms/step - binary_accuracy: 0.7444 - loss: 0.5250
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 392ms/step - binary_accuracy: 0.7475 - loss: 0.5154
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 392ms/step - binary_accuracy: 0.7441 - loss: 0.5230
Epoch 7/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 382ms/step - binary_accuracy: 0.7587 - loss: 0.5069
Epoch 8/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 402ms/step - binary_accuracy:

Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 373ms/step - binary_accuracy: 0.6625 - loss: 0.6467
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 362ms/step - binary_accuracy: 0.7412 - loss: 0.5409
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 350ms/step - binary_accuracy: 0.7464 - loss: 0.5204
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 349ms/step - binary_accuracy: 0.7698 - loss: 0.5071
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 364ms/step - binary_accuracy: 0.7593 - loss: 0.4998
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 362ms/step - binary_accuracy: 0.7624 - loss: 0.4849
Epoch 7/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 368ms/step - binary_accuracy: 0.7715 - loss: 0.4938
Epoch 8/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 369ms/step - binary_accuracy

Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 343ms/step - binary_accuracy: 0.6741 - loss: 0.6487
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 348ms/step - binary_accuracy: 0.7125 - loss: 0.5638
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 335ms/step - binary_accuracy: 0.7269 - loss: 0.5374
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 335ms/step - binary_accuracy: 0.7373 - loss: 0.5298
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 355ms/step - binary_accuracy: 0.7537 - loss: 0.5201
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 368ms/step - binary_accuracy: 0.7559 - loss: 0.5103
Epoch 7/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 384ms/step - binary_accuracy: 0.7624 - loss: 0.4805
Epoch 8/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 342ms/step - binary_accuracy

Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 351ms/step - binary_accuracy: 0.6820 - loss: 0.6459
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 368ms/step - binary_accuracy: 0.7319 - loss: 0.5427
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 365ms/step - binary_accuracy: 0.7229 - loss: 0.5474
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 386ms/step - binary_accuracy: 0.7591 - loss: 0.5055
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 380ms/step - binary_accuracy: 0.7356 - loss: 0.5346
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 348ms/step - binary_accuracy: 0.7588 - loss: 0.4941
Epoch 7/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 377ms/step - binary_accuracy: 0.7563 - loss: 0.5083
Epoch 8/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 372ms/step - binary_accuracy

# Evaluation

In [13]:
# Refit a fresh model on the whole training split and score it once on the
# held-out test split.
model = KerasClassifier(model=lstm_model, epochs=epochs, batch_size=batch_size)
model.fit(X_train, y_train)

pred = model.predict(X_test)

# NOTE: `f1` and `accuracy` now hold single scores, replacing the per-fold
# lists built in the cross-validation cell.
accuracy = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred)
result = classification_report(y_test, pred)

print(result)

Epoch 1/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 352ms/step - binary_accuracy: 0.6393 - loss: 0.6509
Epoch 2/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 333ms/step - binary_accuracy: 0.7117 - loss: 0.5651
Epoch 3/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 348ms/step - binary_accuracy: 0.7494 - loss: 0.5242
Epoch 4/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 370ms/step - binary_accuracy: 0.7384 - loss: 0.5233
Epoch 5/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 351ms/step - binary_accuracy: 0.7061 - loss: 0.5595
Epoch 6/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 355ms/step - binary_accuracy: 0.7450 - loss: 0.5161
Epoch 7/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 366ms/step - binary_accuracy: 0.7573 - loss: 0.5092
Epoch 8/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 347ms/step - binary_accuracy