# Import Libs

In [1]:
import pandas as pd
import numpy as np
from statistics import mean

from nltk import word_tokenize
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

from sklearn.metrics import classification_report, f1_score, precision_recall_fscore_support, accuracy_score, confusion_matrix
from sklearn import svm

# Load dataset

In [2]:
pre_processed_text = 'gemini_embedding'
# pre_processed_text = 'text_embed'
# pre_processed_text = 'pre_processed_text'

In [3]:
df = pd.read_csv('./dataset/hsd_pre_processed_gemini.csv')

In [4]:
df

Unnamed: 0,text,hatespeech_comb,hatespeech_G1,annotator_G1,hatespeech_G2,annotator_G2,hatespeech_G3,annotator_G3,pre_processed_text,text_glove,gemini_embedding
0,@__andrea__b \nO cara vive em outro mundo\nNão...,1,1,A,1.0,V,0,E,cara vive outro mundo mundo real refugiados vi...,[[-0.11667 -0.5588 0.55324 ... -0.68018 ...,"[-0.015683623, -0.03517303, -0.01821586, -0.00..."
1,@_carmeloneto Estes incompetentes não cuidam n...,0,1,D,0.0,V,0,C,incompetentes cuidam povo brasileiro poucos re...,[[ 2.9807e-01 -8.7606e-01 5.7167e-01 -1.5141e...,"[0.024728414, 0.013021446, -0.039535552, 0.015..."
2,@_carmeloneto \nOs 'cumpanhero' quebraram toda...,0,1,A,0.0,B,0,E,cumpanhero quebraram toda regras,[[ 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e...,"[-0.02794479, 0.012976087, -0.06279022, -0.012..."
3,@_GlitteryKisses é isso não conseguem pensar n...,0,0,C,0.0,V,0,D,conseguem pensar sentido lato além vê frente o...,[[ 2.8124e-01 -3.2116e-01 1.0204e+00 5.7950e...,"[-0.020720925, 0.055354297, -0.028093176, -0.0..."
4,@_iglira bom dia macaco branco haha,1,0,A,1.0,I,1,E,bom dia macaco branco haha,[[ 3.6336e-01 -2.3872e-01 1.2917e-01 -5.1607e...,"[-0.07149509, -0.0039677643, 0.014008735, -0.0..."
...,...,...,...,...,...,...,...,...,...,...,...
5665,@zecarlosantos2 é o unico que nao se corrompe....,0,1,C,0.0,B,0,A,unico nao corrompenao vende chega aroporto apl...,[[-4.3466e-01 -2.6563e-01 -1.7129e-01 4.0573e...,"[0.0044391938, 0.023174051, 0.020349946, -0.00..."
5666,"@zqkitowz sei das cotas, mas não sabia disso, ...",1,1,D,1.0,It,0,A,sei cotas sabia disso putaria porra,[[-1.1048e-01 -1.0303e+00 5.2820e-01 7.5377e...,"[-0.06403098, 0.0318813, -0.017783472, -0.0311..."
5667,"@zqkitowz sim, a maioria do eleitorado é mulhe...",0,0,C,0.0,V,0,C,sim maioria eleitorado mulher,[[ 5.4497e-01 -1.0734e+00 4.8874e-03 6.1676e...,"[0.015019704, 0.018858356, 0.029902609, -0.020..."
5668,"@zurcju seguir no tt é facíl, apresentar as am...",1,1,C,1.0,S,0,A,seguir tt facíl apresentar amigas sapatão ngm ...,[[ 1.9611e-01 -2.5271e-01 1.7712e-02 -8.3988e...,"[-0.04061569, 0.021848962, 0.013160055, -0.010..."


In [5]:
df['gemini_embedding'] = df['gemini_embedding'].apply(lambda t: np.asarray(t.split(',')[1:-1], dtype='float32'))

# Word Embedding

## CHI-2

In [6]:
X = df[pre_processed_text]
y = df['hatespeech_comb']


def feature_selection_chi2(X,y):
  normalizer = MinMaxScaler()
  X_norm = normalizer.fit_transform(X)
  chi_selector = SelectKBest(chi2, k=241)
  chi_selector.fit(X_norm, y)

  chi_support = chi_selector.get_support()
  selected_features = np.where(chi_support)[0]
  #chi_feature = X.loc[:,chi_support].columns.tolist()
  #print(str(len(selected_features)), 'selected features')
  return selected_features


## Bag of Words

In [7]:
def bag_of_words(X_train, X_test, n_grams):
    vectorizer = CountVectorizer(ngram_range=(1, n_grams))
    X_train = vectorizer.fit_transform(X_train).toarray()
    X_test = vectorizer.transform(X_test).toarray()
    return X_train,X_test

## Word2Vec - GloVe

In [8]:
GLOVE_MODEL_FILE = './dataset/glove.twitter.27B/glove.twitter.27B.100d.txt'

# Load the GloVe embeddings into a dictionary
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]  # The word
            vector = np.asarray(values[1:], dtype='float32')  # The embedding vector
            embeddings_index[word] = vector
    return embeddings_index

embeddings_index = load_glove_embeddings(GLOVE_MODEL_FILE)
print(f"Loaded {len(embeddings_index)} word vectors.")

Loaded 1193514 word vectors.


In [9]:
df['text_embed'] = df['pre_processed_text'].apply(word_tokenize)

def embed_text(tokens, embeddings_index, embedding_dim=100):
    embeddings = []
    for word in tokens:
        vector = embeddings_index.get(word)
        if vector is not None:
            embeddings.append(vector)
        else:
            # Handle out-of-vocabulary (OOV) words
            embeddings.append(np.zeros(embedding_dim))
    return np.array(embeddings)

df['text_embed'] = df['text_embed'].apply(lambda text: embed_text(text, embeddings_index))

In [10]:
def aggregate_embeddings(embeddings):
    return np.mean(embeddings, axis=0)  # Average of embeddings

df['text_embed'] = df['text_embed'].apply(aggregate_embeddings)

In [11]:
df

Unnamed: 0,text,hatespeech_comb,hatespeech_G1,annotator_G1,hatespeech_G2,annotator_G2,hatespeech_G3,annotator_G3,pre_processed_text,text_glove,gemini_embedding,text_embed
0,@__andrea__b \nO cara vive em outro mundo\nNão...,1,1,A,1.0,V,0,E,cara vive outro mundo mundo real refugiados vi...,[[-0.11667 -0.5588 0.55324 ... -0.68018 ...,"[-0.03517303, -0.01821586, -0.0073445886, 0.00...","[-0.018029999, -0.41711682, 0.19664483, 0.1132..."
1,@_carmeloneto Estes incompetentes não cuidam n...,0,1,D,0.0,V,0,C,incompetentes cuidam povo brasileiro poucos re...,[[ 2.9807e-01 -8.7606e-01 5.7167e-01 -1.5141e...,"[0.013021446, -0.039535552, 0.015361794, 0.027...","[0.27201715, -0.2679503, 0.5175529, -0.2773625..."
2,@_carmeloneto \nOs 'cumpanhero' quebraram toda...,0,1,A,0.0,B,0,E,cumpanhero quebraram toda regras,[[ 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e...,"[0.012976087, -0.06279022, -0.012183318, 0.030...","[0.2307099997997284, -0.21351999789476395, 0.6..."
3,@_GlitteryKisses é isso não conseguem pensar n...,0,0,C,0.0,V,0,D,conseguem pensar sentido lato além vê frente o...,[[ 2.8124e-01 -3.2116e-01 1.0204e+00 5.7950e...,"[0.055354297, -0.028093176, -0.033058286, -0.0...","[0.28198162, -0.2943199, 0.4647464, 0.0873385,..."
4,@_iglira bom dia macaco branco haha,1,0,A,1.0,I,1,E,bom dia macaco branco haha,[[ 3.6336e-01 -2.3872e-01 1.2917e-01 -5.1607e...,"[-0.0039677643, 0.014008735, -0.03555642, -0.0...","[0.127281, -0.2522532, -0.064265996, -0.005243..."
...,...,...,...,...,...,...,...,...,...,...,...,...
5665,@zecarlosantos2 é o unico que nao se corrompe....,0,1,C,0.0,B,0,A,unico nao corrompenao vende chega aroporto apl...,[[-4.3466e-01 -2.6563e-01 -1.7129e-01 4.0573e...,"[0.023174051, 0.020349946, -0.005635532, 0.024...","[-0.16767250001430511, -0.21430375147610903, 0..."
5666,"@zqkitowz sei das cotas, mas não sabia disso, ...",1,1,D,1.0,It,0,A,sei cotas sabia disso putaria porra,[[-1.1048e-01 -1.0303e+00 5.2820e-01 7.5377e...,"[0.0318813, -0.017783472, -0.031150207, -0.008...","[0.3789598, -0.52185357, 0.3604633, 0.38309836..."
5667,"@zqkitowz sim, a maioria do eleitorado é mulhe...",0,0,C,0.0,V,0,C,sim maioria eleitorado mulher,[[ 5.4497e-01 -1.0734e+00 4.8874e-03 6.1676e...,"[0.018858356, 0.029902609, -0.02058325, 0.0388...","[0.10914001, -0.79044, 0.32813936, -0.037151, ..."
5668,"@zurcju seguir no tt é facíl, apresentar as am...",1,1,C,1.0,S,0,A,seguir tt facíl apresentar amigas sapatão ngm ...,[[ 1.9611e-01 -2.5271e-01 1.7712e-02 -8.3988e...,"[0.021848962, 0.013160055, -0.0104468195, 0.01...","[-0.023928672, -0.32563555, 0.03620155, 0.0947..."


# Split into training and testing sets

In [12]:
X = df[pre_processed_text]
y = df['hatespeech_comb']

RANDOM_STATE = 42

# Hold out
sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, train_size = 0.8, random_state = RANDOM_STATE)
for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Training

## SVM

In [13]:
sss = StratifiedShuffleSplit(n_splits=5, test_size = 0.2, train_size = 0.8, random_state = 42)
model = svm.SVC(kernel='linear')

results = []
f1 = []
precision = []
recall = []
accuracy = []


n_gram = 2


# Define the model and parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],   # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel types
    'gamma': ['scale', 'auto']    # Kernel coefficient
}

grid_search = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=param_grid,
    scoring='f1',  # Metric for evaluation
    cv=sss,
    verbose=1,  # Shows progress of the grid search
    n_jobs=-1  # Parallelize computations
)

print("# Performing Grid Search")
grid_search.fit(pd.DataFrame(X_train.tolist()), y_train)

# Best parameters and model after Grid Search
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

score = grid_search.score(pd.DataFrame(X_train.tolist()), y_train)

print("Best Parameters:", best_params)
print("Best Model:", best_model)

# Performing Grid Search
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Model: SVC(C=10)


# Evaluation

In [14]:
model = svm.SVC(kernel='rbf', gamma='scale', C=10)


# X_train, X_test = bag_of_words(X_train, X_test, n_gram)
# selected_feature_list = feature_selection_chi2(X_train, y_train)

# X_train = X_train[:,[i for i in selected_feature_list]]
# X_test = X_test[:,[i for i in selected_feature_list]]

model.fit(pd.DataFrame(X_train.tolist()), y_train)
pred = model.predict(pd.DataFrame(X_test.tolist()))

result = classification_report(y_test, pred)
f1 = f1_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(result)

              precision    recall  f1-score   support

           0       0.78      0.85      0.81       776
           1       0.59      0.48      0.53       358

    accuracy                           0.73      1134
   macro avg       0.69      0.66      0.67      1134
weighted avg       0.72      0.73      0.72      1134

