In [None]:
import numpy as numpy
import pandas as pd
import os

# Walk the AuxData directory tree and print the path of every file found,
# as a quick check of which input files are available.
for folder, _subdirs, names in os.walk('AuxData'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

from sklearn import preprocessing

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, MaxPool1D, Dropout, Dense, GlobalMaxPooling1D, Embedding, Activation
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Download the NLTK 'stopwords' corpus; preprocess_text reads it through
# stopwords.words('english').
nltk.download('stopwords')

In [None]:
# Load the train and test CSV files (in that order) from the AuxData folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('AuxData/toxic_train.csv', 'AuxData/toxic_test.csv')
)

In [None]:
# Drop the 'Unnamed: 0' column (presumably a row index saved with the CSV --
# confirm against the file). errors='ignore' keeps this cell re-runnable:
# without it a second execution raises KeyError because the column is gone.
train_data = train_data.drop(columns=['Unnamed: 0'], errors='ignore')
train_data.head()

In [None]:
# Drop the 'Unnamed: 0' column (presumably a row index saved with the CSV --
# confirm against the file). errors='ignore' keeps this cell re-runnable:
# without it a second execution raises KeyError because the column is gone.
test_data = test_data.drop(columns=['Unnamed: 0'], errors='ignore')
test_data.head()

In [None]:
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def preprocess_text(sen, stops=None):
    """Normalise a raw comment into lower-case, space-separated word tokens.

    Steps:
      1. lower-case the text;
      2. replace every character that is not an ASCII letter with a space
         (removes punctuation and digits);
      3. split on whitespace and drop single-character tokens;
      4. drop stop words, compared as whole tokens.

    Parameters
    ----------
    sen : str
        Raw text. Any non-string value (e.g. None or a NaN from a missing
        CSV cell) is treated as an empty comment.
    stops : iterable of str, optional
        Stop words to remove. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(sen, str):
        return ''

    if stops is None:
        stops = _english_stopwords()
    else:
        stops = frozenset(stops)

    # Lower-case first, then strip non-letters, so the lower-casing is
    # actually kept (the regex must run on the lower-cased text).
    letters_only = re.sub('[^a-z]', ' ', sen.lower())

    # Filter whole tokens: a substring replace would also delete stop words
    # embedded inside other words (e.g. "is" inside "history").
    kept = [
        word
        for word in letters_only.split()
        if len(word) > 1 and word not in stops
    ]
    return ' '.join(kept)

In [None]:
# Clean the comment text of both data sets with preprocess_text,
# replacing the 'comment_text' column in place.
for frame in (train_data, test_data):
    frame['comment_text'] = frame['comment_text'].apply(preprocess_text)

In [None]:
# Fit the tokenizer on the training comments only, then convert them to
# integer sequences padded/truncated to a fixed length of 100.
token = Tokenizer(num_words=28164)
token.fit_on_texts(train_data['comment_text'])
text = pad_sequences(
    token.texts_to_sequences(train_data['comment_text']),
    maxlen=100,
)

In [None]:
# Target vector: the values of the 'toxic' column of the training data.
y = train_data['toxic'].values

In [None]:
# Hold out 20% of the training sequences, stratified on the label, with a
# fixed seed. This hold-out is what model.fit receives as validation_data.
X_train, X_test, y_train, y_test = train_test_split(
    text,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [None]:
# Build the model: Embedding -> LSTM -> global max pooling -> sigmoid unit.

max_features = 28164  # vocabulary size; same value given to the Tokenizer
embedding_dim = 32

model = Sequential()
model.add(Embedding(max_features, embedding_dim))
model.add(Dropout(0.2))
model.add(LSTM(32, return_sequences=True))
# return_sequences=True makes the LSTM emit one vector per timestep. Pool over
# the time axis so the classifier produces a single score per comment; without
# this the model outputs (batch, timesteps, 1) and downstream code ends up
# reading the score of the first (usually padding) timestep.
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.summary()

In [None]:
# Compile for binary classification and train for 20 epochs, evaluating on
# the hold-out split after every epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=1024,
    validation_data=(X_test, y_test),
)

In [None]:
plt.style.use('fivethirtyeight')

# Training vs. validation loss for each epoch.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Treino')
ax.plot(history.history['val_loss'], label='Validação')
ax.set_xlabel('\nÉpocas')
ax.set_ylabel('Perda\n')
ax.set_title('Perda por época (RNN)\n')
ax.legend()
plt.show()

In [None]:
# Training vs. validation accuracy for each epoch.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='Treino')
ax.plot(history.history['val_accuracy'], label='Validação')
ax.set_xlabel('\nÉpocas')
ax.set_ylabel('Acurácia\n')
ax.set_title('Acurácia por época (RNN)\n')
ax.legend()
plt.show()

In [None]:
# Texts and labels of the separate test file.
# NOTE(review): y_test here replaces the y_test produced by train_test_split;
# re-running the training cell after this one would pair X_test with the
# wrong labels. Consider distinct names for the two sets.
x_test, y_test = (
    test_data[column].values for column in ('comment_text', 'toxic')
)

In [None]:
from tensorflow.keras.preprocessing import sequence

# Encode the test comments with the tokenizer fitted on the training data and
# pad them with the same pad_sequences helper and maxlen used for training,
# so both sets go through an identical code path.
x_test_tokenized = token.texts_to_sequences(x_test)
x_testing = pad_sequences(x_test_tokenized, maxlen=100)

In [None]:
# Score the padded test sequences with the trained model.
y_pred = model.predict(
    x_testing,
    batch_size=32,
    verbose=1,
)

In [None]:
# Turn model scores into hard 0/1 labels with a 0.5 threshold.
# Written so the cell can be re-run: if y_pred already holds 0/1 labels they
# are simply reproduced instead of failing on y[0].
scores = numpy.asarray(y_pred, dtype=float)
if scores.size:
    # One row per sample, keeping the last score of each row. For a
    # (samples, 1) output that is the only score; if the model emits one score
    # per timestep it is the final timestep, whereas index 0 would normally be
    # a padding position because pad_sequences pads on the left by default.
    scores = scores.reshape(scores.shape[0], -1)[:, -1]
y_pred = [0 if score < 0.5 else 1 for score in scores.ravel()]

In [None]:
# Accuracy on the test file, shown as a percentage.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(test_accuracy * 100))

In [None]:
# Macro-averaged F1 on the test file.
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("F1 Score: {:.6f}".format(macro_f1))

In [None]:
# Attach the predicted labels to the test rows and show how many rows fall
# into each predicted class.
test_data['RNN_HS'] = y_pred
test_data['RNN_HS'].value_counts()