This dataset is a Kaggle competition dataset(https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge), but we are going to use it to make a Gradio app!
Dataset contains comments and if those comments are either toxic or not.
Labels are multi-binary.

# Import data 

In [1]:
# working with different file paths
import os 
# read datasets. helping tabular data
import pandas as pd
# deep learning libraries
import tensorflow as tf
# gonne use it just one place (np.expand)
import numpy as np 

In [2]:
# import data
df = pd.read_csv(
os.path.join('jigsaw-toxic-comment-classification-challenge','train.csv', 'train.csv')
)

In [3]:
# take a look at the data
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# grabbing first comment to examine it 
df.iloc[0]['comment_text']

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [5]:
# grabbing every column for the first row
df[df.columns[2:]].iloc[3]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 3, dtype: int64

# Preprocess

In [6]:
# for tokenization
from tensorflow.keras.layers import TextVectorization

In [7]:
# split data into comments and features
X = df['comment_text']
y = df[df.columns[2:]].values

In [8]:
# number of words in the vocabulary
MAX_WORDS = 250000

In [9]:
vectorizer = TextVectorization(max_tokens = MAX_WORDS,
                              output_sequence_length = 1800,
                              output_mode = 'int')

In [10]:
# teaching our vectorizer to df comments
vectorizer.adapt(X.values)

In [11]:
# numeric representation of a sentence
vectorizer('Hey there, General Kenobi')[:4]

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([   425,     41,    415, 178533], dtype=int64)>

In [12]:
# pass through all comments
vectorized_text = vectorizer(X.values)

In [13]:
# creating tf data pipeline
# map, chache, shuffle, batch, prefetch
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache() # caches data
dataset = dataset.shuffle(160000) # pass it how large our buffer size needs to be
dataset = dataset.batch(16) # each batch represented as series of 16 samples
dataset = dataset.prefetch(8) # helps bottlenecks

In [14]:
# create train test and validation
train = dataset.take(int(len(dataset)*.7)) # 70% of training
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2)) # 70% of validation 
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1)) # 90% of test

# Creating Seq Model

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [16]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(MAX_WORDS+1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(6, activation='sigmoid'))

In [17]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          8000032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [19]:
history = model.fit(train, epochs=1, validation_data=val)



In [23]:
# Make Predictions

In [24]:
input_text = vectorizer('You freaking suck! I am going to hit you.')

In [25]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [26]:
res = model.predict(np.array([input_text]))



In [27]:
res.shape

(1, 6)

In [None]:
# Evaluate Model

In [28]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [29]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [30]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)















In [31]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')


Precision: 0.8644134998321533, Recall:0.6223876476287842, Accuracy:0.46940821409225464


In [None]:
# Test and Gradio

In [33]:
import tensorflow as tf
import gradio as gr

In [34]:
model.save('toxicity.h5')

In [35]:
model = tf.keras.models.load_model('toxicity.h5')

In [36]:
input_str = vectorizer('hey i freaken hate you!')

In [37]:
res = model.predict(np.expand_dims(input_str,0))



In [38]:
res

array([[0.71999145, 0.03037961, 0.31933436, 0.03126848, 0.30310085,
        0.06649949]], dtype=float32)

In [39]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [40]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')



In [41]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://16af1e53e989e5cfa2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




