# Importing Depenedncies

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from matplotlib import pyplot as plt
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

# Importing Data

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/Toxic-Comment/train.csv')

In [3]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Preprocessing

Creating Dependent and Independent variable.

In [4]:
X = df['comment_text']
y = df[df.columns[2:]].values

## Vectorization

Turning sentences into numerical representation to feed it into neural network.

In [5]:
MAX_FEATURES = 200000 # number of words in the vocab

In [6]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800, #Capping the sentence into 1800 words.
                               output_mode='int')

Vectorizer model learning from the data.

In [7]:
vectorizer.adapt(X.values)

In [8]:
vectorizer('Hello, World!')[:3:]

<tf.Tensor: shape=(3,), dtype=int64, numpy=array([288, 263,   0])>

Taking one random sentence for example to see how the vectorizer model works. The word 'Hello' is represented with the number 288 while the 'World' is reprsented with the number 263. The vector contains 1800 columns means the max length of sentence is 1800. 

Create the dataset with all the training dataset comment vectorized.


In [9]:
vectorized_text = vectorizer(X.values)

Seeing how the vectorized training data looks.

In [10]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

If the particular sentence doesn't meet the max length of 1800, it will pad out the columns with zeroes.


Create dataset by slicing the data into baches and shuffling it.

In [11]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks

## Train, validation, Test split.

In [12]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

Using 70% of the data as training set, 20% of the data as validation set and 10% as testing set.

# Training the model

## Building Sequential model

In [13]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(MAX_FEATURES+1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer model.compile(loss='BinaryCrossentropy', optimizer='Adam')
model.add(Dense(6, activation='sigmoid'))

Using Bi-directional LSTM. So the data can be feed to the network from both side. 

Adavantage: 

Example : "I don't hate you."

The network could see the word hate and think of it as the hate comment but by feeding the data with both direction it could catch the word 'Don't'

In [14]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [16]:
history = model.fit(train, epochs=1, validation_data=val)



Testing on our text by getting the percentage of the toxicity from different classes.


In [36]:
text = vectorizer("Hey, bitch. You freaking suck!")

input_text = tf.convert_to_tensor(np.array([text]))

res = model.predict(input_text)[0]

for i, j in zip(df.columns[2:], res):

  print(i + ': ' + str(j))

toxic: 0.99936336
severe_toxic: 0.43180236
obscene: 0.98312193
threat: 0.047626756
insult: 0.910321
identity_hate: 0.29512954


# Evaluating the model

In [18]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [19]:
for batch in test.as_numpy_iterator(): 
  
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [20]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8320522904396057, Recall:0.6178653240203857, Accuracy:0.48445335030555725


# Deplying the model from gradio app

Gradio is the fastest way to demo your machine learning model with a friendly web interface so that anyone can use it, anywhere for temporary amount of time.

In [None]:
import gradio as gr

In [39]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, round(results[0][idx]*100, 2))
    
    return text

Example of how the text would look in web interface.

In [40]:
print(score_comment('I hate you so much!'))

toxic: 60.45
severe_toxic: 1.17
obscene: 21.9
threat: 2.32
insult: 32.41
identity_hate: 5.38



In [41]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to classify classes of toxicity.'),
                        outputs='text')



This is how we will get the result in the web interface.
The box in write take input comment and we will get the output in the right box with the percentage of the toxicity from a different class.

In [42]:
interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Running on public URL: https://58509.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


(<gradio.routes.App at 0x7faf598b6e10>,
 'http://127.0.0.1:7862/',
 'https://58509.gradio.app')