In [1]:
import os 
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras

from keras.layers import TextVectorization, Embedding, LSTM, Bidirectional, Dense, Dropout
from keras.models import Sequential
from keras.metrics import Precision, Recall




In [2]:
df = pd.read_csv(os.path.join('csvFiles','train.csv','train.csv'))
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,racist
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


Preprocessing Data

In [3]:
X = df['comment_text'] #all the act text
y = df[df.columns[2:]].values #the lavels of said text (toxic, super toxic, etc)

In [4]:
max_features = 200000  # number of words in the vacab,  
max_len = 1800  # Sequence length to pad the outputs to.

vectorizer = TextVectorization(max_tokens = max_features, output_sequence_length = max_len,output_mode = 'int')
# this means that it is going to map every single word to an integer value




In [5]:
vectorizer.adapt(X.values)
vectorized_text = vectorizer(X.values)
vectorized_text





<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [6]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)  # how large our buffer size needs to be
dataset = dataset.batch(16)  # each batch represented as a series of 16 samples
dataset = dataset.prefetch(8)  # helps prevent bottlenecks

Data Splitting

In [7]:
train = dataset.take(int(len(dataset)*.9))   # 90%
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.05))   # 5%
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.05))  # 5%

In [None]:
len(train), len(val), len(test)

(8976, 498, 498)

Model

In [9]:
model = Sequential()

# Create the embedding layer 
model.add(Embedding(max_features+1, 32))

# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))

# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(rate=0.3))
model.add(Dense(256, activation='relu'))
model.add(Dropout(rate=0.5))
model.add(Dense(128, activation='relu'))

# Final layer
model.add(Dense(6, activation='sigmoid'))

In [10]:
model.compile(optimizer='Adam', loss='BinaryCrossentropy', metrics=['accuracy'])

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                       

Training

In [11]:
hist = model.fit(train, epochs=10, validation_data=val, verbose=1)

Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
loss, Acc = model.evaluate(test)

print('Test Loss: ', loss)
print('Test Accuracy: ', Acc)

Test Loss:  0.017629003152251244
Test Accuracy:  0.9897088408470154


Make Predictions

In [13]:
input_text = vectorizer('you suck')

res = model.predict(np.expand_dims(input_text, axis=0))



In [15]:
pre = Precision()
re = Recall()

for batch in test.as_numpy_iterator():
    # unpack the batch
    X_test, y_test = batch
    # make a prediction
    y_pred = model.predict(X_test)
    
    y_test = y_test.flatten()
    y_pred = y_pred.flatten()
    
    pre.update_state(y_test, y_pred)
    re.update_state(y_test, y_pred)



In [16]:
model.save('toxic.keras')
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}')

Precision: 0.9178004264831543, Recall:0.9126268029212952


In [17]:
model = tf.keras.models.load_model('toxic.keras')
input_str = vectorizer('you suck') # input box

In [19]:
(res > 0.5).astype(int)
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'racist'], dtype='object')

In [18]:
res = model.predict(np.expand_dims(input_str, 0))
res



array([[0.99190134, 0.07255366, 0.9797904 , 0.0042801 , 0.8960959 ,
        0.00735599]], dtype=float32)