In [8]:
import pandas as pd
import os

# Folder that holds one CSV file per annotated dataset.
pasta_dados = 'dados'

dataframes = []

# Numeric code written to the `type_text` column for each known dataset file.
mapeamento_arquivos = {
    'aggression.csv': 1,
    'attack.csv': 2,
    'racism.csv': 3,
    'sexism.csv': 4,
    'toxicity.csv': 5
}

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of df_final identical on every run.
for arquivo in sorted(os.listdir(pasta_dados)):
    if not arquivo.endswith('.csv'):
        continue

    caminho_arquivo = os.path.join(pasta_dados, arquivo)
    df = pd.read_csv(caminho_arquivo)

    # Files that are not in the mapping get no `type_text` value
    # (the column ends up NaN for them after the concat).
    if arquivo in mapeamento_arquivos:
        df['type_text'] = mapeamento_arquivos[arquivo]

    # Shrink numeric columns to the smallest integer dtype that fits, where possible.
    colunas_numericas = df.select_dtypes(include='number').columns
    df[colunas_numericas] = df[colunas_numericas].apply(pd.to_numeric, downcast='integer', errors='coerce')

    # Strip only the extension (str.replace would also remove a ".csv" found mid-name).
    df['type_text_label'] = os.path.splitext(arquivo)[0]
    dataframes.append(df)

# ignore_index gives every row a unique label; without it each file restarts at 0
# and label-based lookups on df_final become ambiguous.
df_final = pd.concat(dataframes, ignore_index=True)
df_final.head()

Unnamed: 0,index,Text,ed_label_0,ed_label_1,oh_label,type_text,type_text_label,id,Annotation
0,0,`- This is not ``creative``. Those are the di...,0.9,0.1,0.0,1,aggression,,
1,1,` :: the term ``standard model`` is itself le...,1.0,0.0,0.0,1,aggression,,
2,2,"True or false, the situation as of March 200...",1.0,0.0,0.0,1,aggression,,
3,3,"Next, maybe you could work on being less cond...",0.555556,0.444444,0.0,1,aggression,,
4,4,This page will need disambiguation.,1.0,0.0,0.0,1,aggression,,


In [9]:
import pandas as pd

# Drop rows that have no text or no label.
df_filtered = df_final.dropna(subset=['Text', 'oh_label'])

# Count rows per (label, dataset) pair.
grouped_counts = df_filtered.groupby(['oh_label', 'type_text_label']).size().reset_index(name='count')

# Keep only the counts of the positive class (oh_label == 1).
filtro = grouped_counts['oh_label'] == 1
grouped_counts = grouped_counts.loc[filtro]

# Build a class-balanced sample per dataset: the same number of rows with
# oh_label == 0 and oh_label == 1. Parts are collected in a list and
# concatenated once, instead of growing a DataFrame inside the loop.
balanced_parts = []
for type_text_label, count_yes_bullying in zip(grouped_counts['type_text_label'], grouped_counts['count']):
    in_dataset = df_filtered['type_text_label'] == type_text_label
    rows_not_bullying = df_filtered[in_dataset & (df_filtered['oh_label'] == 0)]
    rows_yes_bullying = df_filtered[in_dataset & (df_filtered['oh_label'] == 1)]

    # Never ask for more rows than the smaller class has; sample(n=...) raises
    # ValueError when n exceeds the number of available rows.
    n_per_class = min(int(count_yes_bullying), len(rows_not_bullying))

    group_not_bullying = rows_not_bullying.sample(n=n_per_class, random_state=42)
    group_yes_bullying = rows_yes_bullying.sample(n=n_per_class, random_state=42)

    # Same concatenation order as before: label-0 rows first, then label-1 rows.
    balanced_parts.append(group_not_bullying)
    balanced_parts.append(group_yes_bullying)

df_new = pd.concat(balanced_parts) if balanced_parts else pd.DataFrame()

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset. Work on a copy so that the columns added further down do not
# silently modify df_new, keeping the balanced sample intact for re-runs.
df = df_new.copy()
print("Conjunto de dados carregado.")

# Text preprocessing
# Cache for the stop-word set and the lemmatizer, filled on first use.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Tokenize `text`, drop non-alphabetic tokens and English stop words, and lemmatize.

    Args:
        text: The string to clean; it is passed as-is to `word_tokenize`.

    Returns:
        The surviving tokens, lemmatized and joined with single spaces.
        Token casing is preserved (only the stop-word check is case-insensitive).
    """
    # The stop-word list and the lemmatizer do not depend on `text`; rebuilding
    # them on every call is wasted work, so they are loaded once and reused.
    stop_words, lemmatizer = _text_resources()

    tokens = [
        token
        for token in word_tokenize(text)
        if token.isalpha() and token.lower() not in stop_words
    ]
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

df['preprocessed_text'] = df['Text'].apply(preprocess_text)
print("Pré-processamento de texto concluído.")

df['preprocessed_text_rnn'] = df['preprocessed_text']
print("adicionando coluna de pre processamento para o rnn, pois a Vetorização e diferente")

y = df['oh_label']

# Split the texts BEFORE vectorizing: 20% test, then 12.5% of the remaining 80%
# (10% overall) as validation. The stratified split depends only on y, the sample
# count and random_state, so the row partition is the same as when the TF-IDF
# matrix itself was split.
text_train, text_test, y_train, y_test = train_test_split(
    df['preprocessed_text'], y, test_size=0.2, stratify=y, random_state=42
)
text_train, text_val, y_train, y_val = train_test_split(
    text_train, y_train, test_size=0.125, stratify=y_train, random_state=42
)

# TF-IDF vectorization. The vocabulary and IDF weights are learned from the
# training texts only; fitting on the whole corpus would leak information from
# the validation and test texts into the features used for training.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_val = vectorizer.transform(text_val)
X_test = vectorizer.transform(text_test)

# Full feature matrix, kept under its original name in case later cells use it
# (no such use is visible here -- TODO confirm and drop if unused).
X = vectorizer.transform(df['preprocessed_text'])

# Both messages are printed in their original order so the cell output is unchanged.
print("Vetorização TF-IDF concluída.")
print("Conjunto de dados dividido em treinamento, teste e validação.")

Conjunto de dados carregado.
Pré-processamento de texto concluído.
adicionando coluna de pre processamento para o rnn, pois a Vetorização e diferente
Vetorização TF-IDF concluída.
Conjunto de dados dividido em treinamento, teste e validação.


In [11]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training split
# (fit() returns the estimator itself, so construction and training chain).
nb_classifier = MultinomialNB().fit(X_train, y_train)
print("Treinamento do modelo Naive Bayes concluído.")

Treinamento do modelo Naive Bayes concluído.


In [12]:
# Train / test / validation split for the RNN, using the same proportions,
# stratification and seed as the TF-IDF split.
X_rnn = df['preprocessed_text_rnn']
X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
    X_rnn,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train_rnn, X_val_rnn, y_train_rnn, y_val_rnn = train_test_split(
    X_train_rnn,
    y_train_rnn,
    test_size=0.125,
    stratify=y_train_rnn,
    random_state=42,
)

# Fit the tokenizer on the training texts only, then turn every split into
# integer sequences.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train_rnn)

sequences_train, sequences_test, sequences_val = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train_rnn, X_test_rnn, X_val_rnn)
)

max_len = 100  # maximum sequence length
X_train_rnn, X_test_rnn, X_val_rnn = (
    pad_sequences(seqs, maxlen=max_len)
    for seqs in (sequences_train, sequences_test, sequences_val)
)

# Labels as numpy arrays
y_train_rnn, y_test_rnn, y_val_rnn = map(np.array, (y_train_rnn, y_test_rnn, y_val_rnn))

# Build and train the RNN: embedding -> LSTM -> single sigmoid unit.
vocab_size = len(tokenizer.word_index) + 1  # +1 because word indices start at 1
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len),
    LSTM(units=128),
    Dense(units=1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(
    X_train_rnn,
    y_train_rnn,
    validation_data=(X_val_rnn, y_val_rnn),
    epochs=5,
    batch_size=64,
)
print("Treinamento do modelo RNN concluído.")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Treinamento do modelo RNN concluído.


In [13]:
# Train the SVM with a grid search.
# `gamma` has no effect on the linear kernel, so a single flat grid would fit
# every linear candidate twice (once per gamma value). One sub-grid per kernel
# searches the same distinct models with fewer fits.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]
svm_classifier = SVC(probability=True)
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_svm_classifier = grid_search.best_estimator_
print("Treinamento do modelo SVM concluído.")

In [None]:
# Evaluate the models on the held-out test split.
svm_predictions = best_svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
nb_predictions = nb_classifier.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predictions)
# The RNN was trained on padded token sequences, so it must be evaluated on
# X_test_rnn / y_test_rnn; X_test is the TF-IDF matrix used by the SVM and
# Naive Bayes models and is not a valid input for the RNN.
rnn_predictions = model.predict(X_test_rnn)
# The sigmoid output is a probability; round it to a 0/1 label.
rnn_predictions_labels = np.round(rnn_predictions).flatten()
rnn_accuracy = accuracy_score(y_test_rnn, rnn_predictions_labels)
print("Avaliação dos modelos concluída.")

# Show the results
print('Acurácia SVM:', svm_accuracy)
print('Acurácia Naive Bayes:', nb_accuracy)
print('Acurácia RNN:', rnn_accuracy)