In [None]:
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import seaborn as sns
# Apply the 'fivethirtyeight' matplotlib style sheet to every figure drawn below.
plt.style.use('fivethirtyeight')
# Print the installed TensorFlow version so it is recorded in the notebook output.
print(tf.__version__)

import os
# Walk the AuxData directory tree and print the name of every file found.
for walk_entry in os.walk('AuxData'):
    # Each entry is (directory path, sub-directory names, file names).
    for data_file in walk_entry[2]:
        print(data_file)

In [None]:
# Load the training CSV into a DataFrame and preview its first rows.
train_csv_path = 'AuxData/toxic_train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# Show 10 randomly chosen rows; the fixed seed keeps the selection stable between runs.
train_df.sample(n=10, random_state=1)

In [None]:
# Model input is the raw comment text; the target is the `toxic` column.
x, y = train_df['comment_text'], train_df['toxic']

In [None]:
# View some toxic comments.
# The sample is seeded (like the sample shown earlier) so the same rows are
# displayed on every Restart & Run All.
train_df[train_df.toxic==1].sample(5, random_state=1)

In [None]:
# Plot how many comments carry each value of the `toxic` label.
# `countplot` draws one bar per distinct label value and replaces the deprecated
# `sns.distplot`, which was being used as a histogram without a KDE.
ax = sns.countplot(x=train_df['toxic'])
plt.xlabel("\nClasse")
plt.ylabel("Comentários\n")
plt.title("Distribuição dos comentários tóxicos\n")
# `plt.show()` accepts no figure/axes argument (only the keyword `block`);
# passing one positionally raises TypeError on current matplotlib releases.
plt.show()

In [None]:
# Number of rows for each distinct value of the `toxic` label.
train_df.toxic.value_counts()

In [None]:
# Vocabulary size: passed to the tokenizer, used as the row count of the
# embedding matrix and as the input dimension of the Embedding layer.
max_features = 20000
# Length (in tokens) every sequence is padded/truncated to via `pad_sequences`.
max_text_length = 400

In [None]:
# Build the vocabulary from the training comments; `num_words` caps it at `max_features`.
x_tokenizer = text.Tokenizer(num_words=max_features)
training_texts = list(x)
x_tokenizer.fit_on_texts(training_texts)

In [None]:
# Turn each comment into a list of word indices, then pad/truncate every
# sequence to `max_text_length` so the result is a rectangular array.
x_tokenized = x_tokenizer.texts_to_sequences(x)
x_train_val = sequence.pad_sequences(
    x_tokenized,
    maxlen=max_text_length,
)

In [None]:
# Size of each word vector; matches the 100d GloVe file read below.
embedding_dim = 100
# Maps a word (first token of each line) to its float32 coefficient vector.
embeddings_index = dict()
# The context manager guarantees the file is closed even if parsing a line raises.
with open('../Datasets/hate-speech-detection/glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(f'Found {len(embeddings_index)} word vectors')

In [None]:
# One row per vocabulary index; rows stay all-zero for words that have no
# entry in `embeddings_index`.
embedding_matrix = np.zeros((max_features, embedding_dim))
for token, row in x_tokenizer.word_index.items():
    # Stop at the first index that does not fit in the matrix.
    if row >= max_features:
        break
    vector = embeddings_index.get(token)
    if vector is not None:
        embedding_matrix[row] = vector

# Building the Model

In [None]:
# Number of output filters used by both Conv1D layers of the model.
filters     = 250
# Kernel size (window width) of the first Conv1D layer.
kernel_size = 3
# Number of units in the Dense layer that follows the global max pooling.
hidden_dims = 250

In [None]:
# CNN text classifier: frozen embeddings initialised from `embedding_matrix`,
# two Conv1D stages separated by max pooling, then a dense head ending in a
# single sigmoid unit.
cnn_layers = [
    Embedding(
        input_dim=max_features,
        output_dim=embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,  # keep the embedding weights fixed during training
    ),
    Dropout(rate=0.2),
    Conv1D(filters=filters, kernel_size=kernel_size, padding='valid', activation='relu'),
    MaxPooling1D(),
    Conv1D(filters=filters, kernel_size=5, padding='valid', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(units=hidden_dims, activation='relu'),
    Dropout(rate=0.2),
    Dense(units=1, activation='sigmoid'),
]
model = Sequential(cnn_layers)
model.summary()

# Compiling the Model

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked per epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Hold out 30% of the padded sequences for validation.
# A fixed `random_state` makes the split, and therefore every metric reported
# below, reproducible across Restart & Run All.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y, test_size=0.3, random_state=1)

In [None]:
# Train the model and keep the per-epoch history for the plots below.
batch_size = 32
epochs = 5
hist = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Training vs. validation loss for each epoch.
loss_fig, loss_ax = plt.subplots()
for history_key, curve_label in (('loss', 'Treino'), ('val_loss', 'Validação')):
    loss_ax.plot(hist.history[history_key], label=curve_label)
loss_ax.set_xlabel('\nÉpocas')
loss_ax.set_ylabel('Perda\n')
loss_ax.set_title('Perda por época (CNN)\n')
loss_ax.legend()
plt.show()

In [None]:
# Training vs. validation accuracy for each epoch.
acc_fig, acc_ax = plt.subplots()
for history_key, curve_label in (('accuracy', 'Treino'), ('val_accuracy', 'Validação')):
    acc_ax.plot(hist.history[history_key], label=curve_label)
acc_ax.set_xlabel('\nÉpocas')
acc_ax.set_ylabel('Acurácia\n')
acc_ax.set_title('Acurácia por época (CNN)\n')
acc_ax.legend()
plt.show()

In [None]:
# Load the test CSV into its own DataFrame.
test_csv_path = 'AuxData/toxic_test.csv'
test_df = pd.read_csv(test_csv_path)

In [None]:
# Display the column names of the test set; `comment_text` and `toxic` are read from it next.
test_df.columns

In [None]:
# Pull the test comments and their labels out of the DataFrame as NumPy arrays.
x_test = test_df['comment_text'].to_numpy()
y_test = test_df['toxic'].to_numpy()

In [None]:
# Encode the test comments with the tokenizer fitted on the training data and
# pad/truncate them to the same length as the training sequences.
x_test_tokenized = x_tokenizer.texts_to_sequences(x_test)
x_testing = sequence.pad_sequences(
    x_test_tokenized,
    maxlen=max_text_length,
)

In [None]:
# Run the trained model on the padded test sequences (one sigmoid output per row).
y_pred = model.predict(
    x_testing,
    batch_size=32,
    verbose=1,
)

In [None]:
# Convert the model's sigmoid outputs into hard 0/1 labels at a 0.5 threshold.
# The original list comprehension indexed `y[0]` on every element, so running
# the cell a second time (when `y_pred` already holds plain ints) raised
# TypeError. This vectorised form gives the same labels and is idempotent.
predicted_scores = np.asarray(y_pred)
if predicted_scores.ndim > 1:
    # Model output has one column per row; keep that first column only.
    predicted_scores = predicted_scores[:, 0]
# Strictly below 0.5 -> 0, everything else -> 1 (same rule as before).
y_pred = np.where(predicted_scores < 0.5, 0, 1)

In [None]:
# Share of test comments whose predicted label equals the true label, as a percentage.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(test_accuracy * 100))

In [None]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("F1 Score: {:.6f}".format(macro_f1))

In [None]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(10, 10))
# `fmt="d"` prints the cell counts as integers.
sns.heatmap(cm, annot=True, fmt="d", cmap='Blues')
plt.title("Matriz de Confusão")
plt.xlabel("\nRedes Neurais Convolucionais")
plt.ylabel("Valor real\n")
# Render the figure explicitly, as the other plotting cells do, so the cell
# does not echo the `Text(...)` repr returned by the last call.
plt.show()

In [None]:
# Store the predicted labels in a new `CNN_HS` column of the test frame,
# then count how many rows received each label.
test_df['CNN_HS'] = y_pred
test_df['CNN_HS'].value_counts()