In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os

In [19]:
pasta_dados = 'dados'

dataframes = []

mapeamento_arquivos = {
    'aggression.csv': 1,
    'attack.csv': 2,
    'racism.csv': 3,
    'sexism.csv': 4,
    'toxicity.csv': 5
}


for arquivo in os.listdir(pasta_dados):
    if arquivo.endswith('.csv'):
        caminho_arquivo = os.path.join(pasta_dados, arquivo)
        df = pd.read_csv(caminho_arquivo)
        
        if arquivo in mapeamento_arquivos:
            tipo_texto = mapeamento_arquivos[arquivo]
            df['type_text'] = tipo_texto
        
        # Converter apenas as colunas numéricas para inteiros
        colunas_numericas = df.select_dtypes(include='number').columns
        df[colunas_numericas] = df[colunas_numericas].apply(pd.to_numeric, downcast='integer', errors='coerce')
        
        df['type_text_label'] = arquivo.replace(".csv", "")
        dataframes.append(df)

df_final = pd.concat(dataframes)

# Balancear as classes e limitar a quantidade total de itens

# Separa as classes majoritária e minoritária
df_majority = df_final[df_final.oh_label==0]
df_minority = df_final[df_final.oh_label==1]

# Calcula a quantidade de itens para igualar as classes
n_samples = 1000 // 2

# Faz a subamostragem na classe majoritária
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=n_samples,     # to match minority class
                                 random_state=123) # reproducible results

# Faz a superamostragem na classe minoritária
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=n_samples,    # to match majority class
                                 random_state=123) # reproducible results

# Combina a classe minoritária com a classe majoritária
df_final = pd.concat([df_majority_downsampled, df_minority_upsampled])

print(df_final.head())
# print(df_final.head())

                       index  \
81847                  81847   
66583                  66583   
42466                  42466   
71897                  71897   
209    5.76464759282741E+017   

                                                    Text  ed_label_0  \
81847  ` **I am not sure which popularity you are ref...    1.000000   
66583    == diablada = bolivia ==  look at maps. this...    0.777778   
42466    == Re: You ==  Your blocking me for no reaso...    0.700000   
71897  big risks, yes, he bribed many persons, but he...    1.000000   
209       @cperciva @femfreq ah. well. no one's perfect.         NaN   

       ed_label_1  oh_label  type_text type_text_label                     id  \
81847    0.000000       0.0          2          attack                    NaN   
66583    0.222222       0.0          1      aggression                    NaN   
42466    0.300000       0.0          5        toxicity                    NaN   
71897    0.000000       0.0          2          at

In [20]:
colunas_desejadas = ['Text', 'oh_label', 'type_text', 'type_text_label']
data_frame = df_final[colunas_desejadas]
print(data_frame.head())

                                                    Text  oh_label  type_text  \
81847  ` **I am not sure which popularity you are ref...       0.0          2   
66583    == diablada = bolivia ==  look at maps. this...       0.0          1   
42466    == Re: You ==  Your blocking me for no reaso...       0.0          5   
71897  big risks, yes, he bribed many persons, but he...       0.0          2   
209       @cperciva @femfreq ah. well. no one's perfect.       0.0          4   

      type_text_label  
81847          attack  
66583      aggression  
42466        toxicity  
71897          attack  
209            sexism  


In [21]:
# Remova valores nulos das colunas 'Text' e 'type_text'
data_frame = data_frame.dropna(subset=['Text', 'oh_label'])

# Verifique a quantidade de itens duplicados antes da remoção
qtd_duplicados = data_frame.duplicated(subset=['Text']).sum()
print("Quantidade de itens duplicados antes da remoção:", qtd_duplicados)

# Remova itens duplicados pela coluna 'Text'
data_frame = data_frame.drop_duplicates(subset=['Text'])

# Verifique a quantidade de itens duplicados após a remoção
qtd_duplicados = data_frame.duplicated(subset=['Text']).sum()
print("Quantidade de itens duplicados após a remoção:", qtd_duplicados)

# Exiba o dataframe resultante
print("DataFrame sem itens duplicados:")
print(data_frame.head())

Quantidade de itens duplicados antes da remoção: 7
Quantidade de itens duplicados após a remoção: 0
DataFrame sem itens duplicados:
                                                    Text  oh_label  type_text  \
81847  ` **I am not sure which popularity you are ref...       0.0          2   
66583    == diablada = bolivia ==  look at maps. this...       0.0          1   
42466    == Re: You ==  Your blocking me for no reaso...       0.0          5   
71897  big risks, yes, he bribed many persons, but he...       0.0          2   
209       @cperciva @femfreq ah. well. no one's perfect.       0.0          4   

      type_text_label  
81847          attack  
66583      aggression  
42466        toxicity  
71897          attack  
209            sexism  


In [22]:
# verifica se os pacotes do nltk já foram baixados
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/wordnet')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('wordnet')
    nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # Coloca as stopwords num conjunto para buscar mais rápido

def process_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)  # remove caracteres não-alfabéticos
    text = text.lower()  # transforma o texto para minúsculo
    words = nltk.word_tokenize(text)  # tokeniza o texto em palavras
    words = [word for word in words if word not in stop_words]  # remove stopwords
    words = [lemmatizer.lemmatize(word) for word in words]  # lematiza as palavras
    return ' '.join(words)

data_frame['Text'] = data_frame['Text'].apply(process_text)
data_frame.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gerson.prudencio\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gerson.prudencio\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gerson.prudencio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Text,oh_label,type_text,type_text_label
81847,sure popularity referring please clarify recen...,0.0,2,attack
66583,diablada bolivia look map show truth oruro sta...,0.0,1,aggression
42466,blocking reason fell unfail abuse continue fig...,0.0,5,toxicity
71897,big risk yes bribed many person also many good...,0.0,2,attack
209,cperciva femfreq ah well one perfect,0.0,4,sexism


In [23]:
# Separe os campos de entrada (X) e o campo de saída (y)
X = data_frame['Text']
y = data_frame['oh_label']

# Divisão dos dados
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.33, random_state=123)

# Combine-os novamente para realizar a amostragem
test_data = pd.concat([X_test, y_test], axis=1)

# Separe as classes minoritárias e majoritárias
minority_test = test_data[test_data.oh_label==1]
majority_test = test_data[test_data.oh_label==0]

# Subamostragem na classe majoritária do conjunto de teste
majority_downsampled_test = resample(majority_test, 
                                     replace=False, 
                                     n_samples=len(minority_test), 
                                     random_state=123)

# As amostras que não foram incluídas na subamostragem serão adicionadas ao conjunto de treinamento
majority_train_extra = majority_test.loc[~majority_test.index.isin(majority_downsampled_test.index)]

# Combine a classe minoritária com a classe majoritária subamostrada
balanced_test = pd.concat([minority_test, majority_downsampled_test])

# Combine o conjunto de treinamento existente com os exemplos extra da classe majoritária
train_data = pd.concat([X_train, y_train], axis=1)
balanced_train = pd.concat([train_data, majority_train_extra])

# Separe X_test, y_test, X_train e y_train
X_test = balanced_test['Text']
y_test = balanced_test['oh_label']
X_train = balanced_train['Text']
y_train = balanced_train['oh_label']

print('Training Data :', X_train.shape)
print('Testing Data : ', X_test.shape)

Training Data : (667,)
Testing Data :  (326,)


In [24]:
labelList = []
resultList = []

In [25]:
# Crie o vetorizador TF-IDF
vectorizer = TfidfVectorizer()

# Ajuste o vetorizador e transforme os dados de treinamento
X_train_vec = vectorizer.fit_transform(X_train)

# Transforme os dados de teste
X_test_vec = vectorizer.transform(X_test)

In [26]:
# Crie o classificador Naive Bayes
nb = MultinomialNB()

# Ajuste o modelo
nb.fit(X_train_vec, y_train)

# Calcule a acurácia no conjunto de teste
print("Acurácia do algoritmo Naive Bayes: ", nb.score(X_test_vec, y_test))

# adicionando resultado e label nas listas
labelList.append("Naive_Byes")
resultList.append(nb.score(X_test_vec, y_test))

Acurácia do algoritmo Naive Bayes:  0.803680981595092


In [27]:
svm = SVC(random_state=3)
svm.fit(X_train_vec, y_train)
print("print accuracy of svm algo: ",svm.score(X_test_vec, y_test))

# adding result and label to lists
labelList.append("SVM")
resultList.append(svm.score(X_test_vec, y_test))

print accuracy of svm algo:  0.7760736196319018


In [28]:
# Crie o classificador Random Forest
rf = RandomForestClassifier(n_jobs=-1)

# Ajuste o modelo
rf.fit(X_train_vec, y_train)

# Calcule a acurácia no conjunto de teste
print("Acurácia do algoritmo Random Forest: ", rf.score(X_test_vec, y_test))

# adicionando resultado e label nas listas
labelList.append("Random_Forest")
resultList.append(rf.score(X_test_vec, y_test))

Acurácia do algoritmo Random Forest:  0.7116564417177914


In [35]:
# Preparando a entrada para o modelo RNN
# O número máximo de palavras a serem usadas. (mais frequentes)
MAX_NB_WORDS = 50000

# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250

# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True)
tokenizer.fit_on_texts(df['Text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = tokenizer.texts_to_sequences(df['Text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Y = pd.get_dummies(df['oh_label']).values
print('Shape of label tensor:', Y.shape)

# Dividindo os dados em treino e teste
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

# Definindo o modelo RNN

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))

# Tentar usar diferentes otimizadores e diferentes configurações de otimizador
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Treinando o modelo RNN
model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.1)

# Avaliando o modelo
train_loss, train_acc = model.evaluate(X_train, Y_train, verbose=0)
print('Train Accuracy: %.3f' % train_acc)

test_loss, test_acc = model.evaluate(X_test, Y_test, verbose=0)
print('Test Accuracy: %.3f' % test_acc)

# Adicionando resultado e label nas listas
labelList.append("RNN")
resultList.append(test_acc)

Found 206803 unique tokens.
Shape of data tensor: (159686, 250)
Shape of label tensor: (159686, 2)
(143717, 250) (143717, 2)
(15969, 250) (15969, 2)
Epoch 1/5

KeyboardInterrupt: 