# Import Libs

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from statistics import mean

from nltk import word_tokenize
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

from sklearn.metrics import classification_report, f1_score, precision_recall_fscore_support, accuracy_score, confusion_matrix
from sklearn import svm

# Load dataset

In [2]:
# pre_processed_text = 'gemini_embedding'
# pre_processed_text = 'text_embed'
pre_processed_text = 'pre_processed_text'

In [3]:
df = pd.read_csv('./dataset/hsd_pre_processed.csv')

In [4]:
df

Unnamed: 0,text,hatespeech_comb,hatespeech_G1,annotator_G1,hatespeech_G2,annotator_G2,hatespeech_G3,annotator_G3,pre_processed_text
0,@__andrea__b \nO cara vive em outro mundo\nNão...,1,1,A,1.0,V,0,E,cara vive outro mundo mundo real refugiados vi...
1,@_carmeloneto Estes incompetentes não cuidam n...,0,1,D,0.0,V,0,C,incompetentes cuidam povo brasileiro poucos re...
2,@_carmeloneto \nOs 'cumpanhero' quebraram toda...,0,1,A,0.0,B,0,E,cumpanhero quebraram toda regras
3,@_GlitteryKisses é isso não conseguem pensar n...,0,0,C,0.0,V,0,D,conseguem pensar sentido lato além vê frente o...
4,@_iglira bom dia macaco branco haha,1,0,A,1.0,I,1,E,bom dia macaco branco haha
...,...,...,...,...,...,...,...,...,...
5665,@zecarlosantos2 é o unico que nao se corrompe....,0,1,C,0.0,B,0,A,unico nao corrompenao vende chega aroporto apl...
5666,"@zqkitowz sei das cotas, mas não sabia disso, ...",1,1,D,1.0,It,0,A,sei cotas sabia disso putaria porra
5667,"@zqkitowz sim, a maioria do eleitorado é mulhe...",0,0,C,0.0,V,0,C,sim maioria eleitorado mulher
5668,"@zurcju seguir no tt é facíl, apresentar as am...",1,1,C,1.0,S,0,A,seguir tt facíl apresentar amigas sapatão ngm ...


# Word Embedding

## GloVe

In [5]:
GLOVE_MODEL_FILE = './dataset/glove.twitter.27B/glove.twitter.27B.100d.txt'
max_len = 128
embedding_dim = 100

# Tokenize
token = Tokenizer()
token.fit_on_texts(df['pre_processed_text'])
seq = token.texts_to_sequences(df['pre_processed_text'])

# Padding
pad_seq = pad_sequences(seq,maxlen=embedding_dim)

# Vocab size
vocab_size = len(token.word_index)+1

# Load embedding vector
embedding_vector = {}
f = open(GLOVE_MODEL_FILE)
for line in tqdm(f):
    value = line.split(' ')
    word = value[0]
    coef = np.array(value[1:],dtype = 'float32')
    embedding_vector[word] = coef

1193514it [00:11, 102314.67it/s]


In [6]:
# Keep a out of vocabullary dict
oov_dict = {}

# Generate embedding matrix
embedding_matrix = np.zeros((vocab_size,embedding_dim))
for word,i in tqdm(token.word_index.items()):
    embedding_value = embedding_vector.get(word)
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value
    else:
        oov_dict[word] = np.random.uniform(-1., 1., (embedding_dim,)) # Generate new random vector
        embedding_matrix[i] = oov_dict[word]


# Transform text into embed vector
embedded_sequences = np.zeros((len(pad_seq), max_len, embedding_dim))
for i, seq in enumerate(pad_seq):
    for j, idx in enumerate(seq):
        if idx > 0:  # Skip padding index
            embedded_sequences[i, j] = embedding_matrix[idx]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:00<00:00, 610987.25it/s]


## Bag of Words

In [7]:
def bag_of_words(X_train, X_test, n_grams):
    vectorizer = CountVectorizer(ngram_range=(1, n_grams))
    X_train = vectorizer.fit_transform(X_train).toarray()
    X_test = vectorizer.transform(X_test).toarray()
    return X_train,X_test

## CHI-2

In [8]:
X = df[pre_processed_text]
y = df['hatespeech_comb']


def feature_selection_chi2(X,y):
  normalizer = MinMaxScaler()
  X_norm = normalizer.fit_transform(X)
  chi_selector = SelectKBest(chi2, k=241)
  chi_selector.fit(X_norm, y)

  chi_support = chi_selector.get_support()
  selected_features = np.where(chi_support)[0]
  #chi_feature = X.loc[:,chi_support].columns.tolist()
  #print(str(len(selected_features)), 'selected features')
  return selected_features


# Split into training and testing sets

In [10]:
# X = df[pre_processed_text]
X = embedded_sequences
y = df['hatespeech_comb']

# Flatten
X = np.array([matrix.ravel() for matrix in X])

RANDOM_STATE = 42

# Hold out
sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, train_size = 0.8, random_state = RANDOM_STATE)
for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Training

## SVM

In [11]:
sss = StratifiedShuffleSplit(n_splits=5, test_size = 0.2, train_size = 0.8, random_state = 42)
model = svm.SVC(kernel='linear')

results = []
f1 = []
precision = []
recall = []
accuracy = []


n_gram = 2


# Define the model and parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],   # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel types
    'gamma': ['scale', 'auto']    # Kernel coefficient
}

grid_search = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=param_grid,
    scoring='f1',  # Metric for evaluation
    cv=sss,
    verbose=1,  # Shows progress of the grid search
    n_jobs=-1  # Parallelize computations
)

print("# Performing Grid Search")
grid_search.fit(pd.DataFrame(X_train.tolist()), y_train)

# Best parameters and model after Grid Search
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

score = grid_search.score(pd.DataFrame(X_train.tolist()), y_train)

print("Best Parameters:", best_params)
print("Best Model:", best_model)

# Performing Grid Search
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}
Best Model: SVC(C=100, gamma='auto')


# Evaluation

In [12]:
model = svm.SVC(kernel='rbf', gamma='auto', C=100)

model.fit(X_train, y_train)
pred = model.predict(X_test)

result = classification_report(y_test, pred)
f1 = f1_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(result)

              precision    recall  f1-score   support

           0       0.77      0.85      0.81       776
           1       0.58      0.46      0.51       358

    accuracy                           0.73      1134
   macro avg       0.68      0.65      0.66      1134
weighted avg       0.71      0.73      0.72      1134

