# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from statistics import mean

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score, confusion_matrix


import tensorflow as tf
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import InputLayer, Dense, Dropout
from keras.optimizers import Adam
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

# Load dataset

In [2]:
# Load the pre-processed dataset CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='./dataset/hsd_pre_processed.csv')

In [3]:
# Bare last expression: the notebook renders the loaded DataFrame as this cell's output.
df

Unnamed: 0.1,Unnamed: 0,text,hatespeech_comb,hatespeech_G1,annotator_G1,hatespeech_G2,annotator_G2,hatespeech_G3,annotator_G3,pre_processed_text
0,0,@__andrea__b \nO cara vive em outro mundo\nNão...,1,1,A,1.0,V,0,E,cara vive outro mundo mundo real refugiados vi...
1,1,@_carmeloneto Estes incompetentes não cuidam n...,0,1,D,0.0,V,0,C,incompetentes cuidam povo brasileiro poucos re...
2,2,@_carmeloneto \nOs 'cumpanhero' quebraram toda...,0,1,A,0.0,B,0,E,cumpanhero quebraram toda regras
3,3,@_GlitteryKisses é isso não conseguem pensar n...,0,0,C,0.0,V,0,D,conseguem pensar sentido lato além vê frente o...
4,4,@_iglira bom dia macaco branco haha,1,0,A,1.0,I,1,E,bom dia macaco branco haha
...,...,...,...,...,...,...,...,...,...,...
5665,5665,@zecarlosantos2 é o unico que nao se corrompe....,0,1,C,0.0,B,0,A,unico nao corrompenao vende chega aroporto apl...
5666,5666,"@zqkitowz sei das cotas, mas não sabia disso, ...",1,1,D,1.0,It,0,A,sei cotas sabia disso putaria porra
5667,5667,"@zqkitowz sim, a maioria do eleitorado é mulhe...",0,0,C,0.0,V,0,C,sim maioria eleitorado mulher
5668,5668,"@zurcju seguir no tt é facíl, apresentar as am...",1,1,C,1.0,S,0,A,seguir tt facíl apresentar amigas sapatão ngm ...


# Feature Selection

## CHI-2

In [4]:
# Text column used as model input and the combined label column used as target.
X, y = df['pre_processed_text'], df['hatespeech_comb']


def feature_selection_chi2(X, y, k=241):
    """Return the column indices of the ``k`` best features by chi-squared score.

    The feature matrix is first rescaled with ``MinMaxScaler`` and then
    ``SelectKBest(chi2, k=k)`` is fitted on the rescaled matrix and ``y``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric feature matrix (it is passed straight to ``MinMaxScaler``).
        NOTE(review): the module-level ``X`` defined beside this function is
        the raw ``pre_processed_text`` column; it has to be vectorized (e.g.
        with a bag-of-words step) before it can be passed here.
    y : array-like of shape (n_samples,)
        Target labels used to score the features.
    k : int or "all", default 241
        Number of top-scoring features to keep; forwarded to ``SelectKBest``.
        The default preserves the previously hard-coded value.

    Returns
    -------
    numpy.ndarray
        Positional indices (ascending) of the selected feature columns.
    """
    # Rescale every feature before scoring it with chi2.
    X_norm = MinMaxScaler().fit_transform(X)

    chi_selector = SelectKBest(chi2, k=k)
    chi_selector.fit(X_norm, y)

    # get_support() is a boolean mask over the columns; convert it to indices.
    return np.flatnonzero(chi_selector.get_support())


## Bag of Words

In [5]:
def bag_of_words(X_train, X_test, n_grams):
    """Encode train and test texts as dense n-gram count matrices.

    A ``CountVectorizer`` covering 1-grams up to ``n_grams``-grams is fitted
    on ``X_train`` only; the same fitted vocabulary is then applied to
    ``X_test``, so both outputs share the same columns.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix)`` as dense arrays.
    """
    counter = CountVectorizer(ngram_range=(1, n_grams))
    train_matrix = counter.fit_transform(X_train).toarray()
    test_matrix = counter.transform(X_test).toarray()
    return train_matrix, test_matrix

# Training

## MLP

In [6]:
# Hyper-parameters shared by the model factory and the training cell.
activation_func = 'relu'  # activation of the hidden Dense layers
epochs = 50               # epochs per fit, handed to KerasClassifier
learning_rate = 1e-3      # learning rate given to the Adam optimizer
max_len = 50_000          # network input width; also the padding target length


def mlp():
    """Build and compile the feed-forward binary classifier.

    Layout: input of width ``max_len`` -> Dense(100) -> Dropout(0.2) ->
    Dense(50) -> Dropout(0.2) -> Dense(1, sigmoid). The hidden layers use the
    module-level ``activation_func`` and a ``random_uniform`` kernel
    initializer. The model is compiled with Adam (module-level
    ``learning_rate``, ``clipvalue=0.5``), binary cross-entropy loss and the
    ``binary_accuracy`` metric.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    hidden_kwargs = dict(activation=activation_func, kernel_initializer='random_uniform')

    network = Sequential([
        InputLayer(shape=(max_len,)),
        Dense(units=100, **hidden_kwargs),
        Dropout(0.2),
        Dense(units=50, **hidden_kwargs),
        Dropout(0.2),
        Dense(units=1, activation='sigmoid'),
    ])

    network.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=learning_rate, clipvalue=0.5),
        metrics=['binary_accuracy'],
    )
    return network

# Scikit-learn style wrapper around the `mlp` factory; trains for `epochs` epochs in batches of 5.
model = KerasClassifier(model=mlp, epochs=epochs, batch_size=5)

In [7]:
# Five stratified random 80/20 train/test splits; the fixed random_state keeps
# the splits themselves reproducible across runs.
sss = StratifiedShuffleSplit(n_splits=5, test_size = 0.2, train_size = 0.8, random_state=42)

# Per-split outputs. NOTE: `precision` and `recall` are declared here but are
# not filled anywhere in this cell.
results = []
f1 = []
precision = []
recall = []
accuracy = []

X = df['pre_processed_text']
y = df['hatespeech_comb']
n_gram = 2


print("# Training")


folders = sss.split(X, y)
for train_index, test_index in folders:
    # split() yields *positional* indices, so rows are selected with .iloc.
    # Plain X[train_index] is a label lookup and is only correct while the
    # frame keeps its default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # bag of words (vocabulary is fitted on the training split only)
    X_train, X_test = bag_of_words(X_train, X_test, n_gram)

    # Padding: force both matrices to exactly `max_len` columns so they match
    # the network's input width (zeros are appended on the right).
    # NOTE(review): if the vocabulary has more than `max_len` columns,
    # pad_sequences truncates instead (default truncating='pre', i.e. the
    # leading columns are dropped) -- confirm this is intended.
    X_train = pad_sequences(X_train, maxlen=max_len, padding='post')
    X_test = pad_sequences(X_test, maxlen=max_len, padding='post')

    # Fit
    # NOTE(review): the same wrapper is re-fitted on every split; this relies
    # on KerasClassifier building a fresh model per fit() call -- confirm.
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    result = classification_report(y_test, pred)
    results.append(result)

    f = f1_score(y_test, pred)
    f1.append(f)
    print(f"# F1: {f}")

    acc = accuracy_score(y_test, pred)
    accuracy.append(acc)
    print(f"# Accuracy: {acc}")
    print("===============")



print("# Mean Accuracy: ", mean(accuracy))
print("# Mean F1: ", mean(f1))


# Training
Epoch 1/50
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - binary_accuracy: 0.7006 - loss: 0.6146
Epoch 2/50
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - binary_accuracy: 0.9313 - loss: 0.1831
Epoch 3/50
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - binary_accuracy: 0.9882 - loss: 0.0408
Epoch 4/50
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - binary_accuracy: 0.9908 - loss: 0.0255
Epoch 5/50
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - binary_accuracy: 0.9934 - loss: 0.0160
Epoch 6/50
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - binary_accuracy: 0.9957 - loss: 0.0111
Epoch 7/50
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - binary_accuracy: 0.9985 - loss: 0.0045
Epoch 8/50
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - binary_accur