In [None]:
# 0. Install Dependencies and Bring in Data

In [None]:
!pip install tensorflow tensorflow-gpu pandas matplotlib sklearn

In [None]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [None]:
# Load the training CSV from the local "new" directory.
train_csv_path = os.path.join('new', 'train.csv')
df = pd.read_csv(train_csv_path)

In [None]:
# 1. Preprocess data

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
# Inputs are the raw comment strings; targets are every column from the third
# one onward, taken as a plain array.
label_columns = df.columns[2:]
X = df['comment_text']
y = df[label_columns].values

In [None]:
# Vocabulary size cap handed to the text vectorizer (number of words kept).
MAX_FEATURES = 200_000

In [None]:
# Text -> integer token ids; every comment is padded or truncated to 1800 ids.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [None]:
# Fit the vectorizer's vocabulary on the full set of comment strings.
vectorizer.adapt(X.values)

In [None]:
# Convert every comment to its fixed-length sequence of token ids
# (one row per comment, 1800 ids per row, zero-padded on the right).
vectorized_text = vectorizer(X.values)

In [None]:
# Inspect the resulting tensor (the repr is truncated, so this stays compact).
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [None]:
# tf.data pipeline in the usual order: map, cache, shuffle, batch, prefetch
# (from_tensor_slices / list_files are the usual ways to create the source).
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# The train/val/test partitions are carved out of this dataset with
# take()/skip(). Shuffle once and keep that order fixed: with the default
# reshuffle_each_iteration=True every new iterator draws a fresh order, so
# examples would migrate between partitions and the held-out batches would
# overlap with batches the model was trained on.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)   # helps prevent bottlenecks

In [None]:
# Pull a single batch out of the pipeline as NumPy arrays for inspection.
batch_X, batch_y = next(dataset.as_numpy_iterator())

In [None]:
# Partition the batched dataset by batch count: first 70% for training,
# the next 20% for validation, and the batches from the 90% mark for testing.
total_batches = len(dataset)
train_batches = int(total_batches*.7)
val_batches = int(total_batches*.2)
test_offset = int(total_batches*.9)
test_batches = int(total_batches*.1)

train = dataset.take(train_batches)
val = dataset.skip(train_batches).take(val_batches)
test = dataset.skip(test_offset).take(test_batches)

In [None]:
# Iterator over the training partition that yields (features, labels) NumPy batches.
train_generator = train.as_numpy_iterator()

In [None]:
# Show one training batch: a block of token ids and its 6-column label matrix.
next(train_generator)

(array([[  256,    36,    72, ...,     0,     0,     0],
        [ 3680,   995,  1288, ...,     0,     0,     0],
        [46959,  3513,   171, ...,     0,     0,     0],
        ...,
        [    8,    55,    72, ...,     0,     0,     0],
        [    8,    19,     6, ...,     0,     0,     0],
        [32445,  7392,   383, ...,     0,     0,     0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))

In [None]:
# 2. Create Sequential Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [None]:
# Toxicity classifier: embedding -> bidirectional LSTM -> dense stack ->
# six sigmoid outputs (one score per label column).
model = Sequential([
    # Token ids -> 32-dimensional vectors; the table has MAX_FEATURES + 1 rows
    Embedding(MAX_FEATURES + 1, 32),

    # Sequence encoder that reads the comment in both directions
    Bidirectional(LSTM(32, activation='tanh')),

    # Fully connected feature extractor
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),

    # Output layer
    Dense(6, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy scores each sigmoid output independently, which matches
# the multi-label setup; Adam is used with its default settings.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

In [None]:
# Print the layer-by-layer structure and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [None]:
# Single training pass over the training partition, validated on `val`;
# the returned History object is kept for later inspection.
history = model.fit(
    train,
    validation_data=val,
    epochs=1,
)



In [None]:
# 3. Make Predictions 

In [None]:
# Vectorize one hand-written sample comment to try the model on.
input_text = vectorizer("Fuck you")

In [None]:
# Grab one (features, labels) batch from the test partition.
batch = next(test.as_numpy_iterator())

In [None]:
# Same as above, but unpacked into features and labels.
batch_X, batch_y = next(test.as_numpy_iterator())

In [None]:
# Turn the sigmoid scores into 0/1 decisions using a 0.5 threshold.
(model.predict(batch_X) > 0.5).astype(int)



array([[1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [None]:
# Label names, in the same order as the columns of the prediction array.
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [None]:
# The model expects a batch, so add a leading axis to the single vectorized comment.
res = model.predict(np.expand_dims(input_text, 0))



In [None]:
# 4. Evaluate Model

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator():

  # Unpack the batch
  X_true, y_true = batch

  # Make a prediction
  yhat = model.predict(X_true)

  # Flatten the predictions
  y_true = y_true.flatten()
  yhat = yhat.flatten()

  pre.update_state(y_true, yhat)
  re.update_state(y_true, yhat)
  acc.update_state(y_true, yhat)

In [43]:
print(f"Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Categorical Accuracy: {acc.result().numpy()}")

Precision: 0.779793918132782, Recall: 0.7248427867889404, Categorical Accuracy: 0.4774323105812073


In [44]:
# 5. Test and Gradio

In [None]:
!pip install gradio jinja2

In [46]:
import gradio as gr

In [47]:
# Persist the trained network to an HDF5 file in the working directory.
# The TextVectorization layer is not part of `model`, so it is not stored here.
model.save('toxicity.h5')

In [48]:
# Reload the network from disk; `model` now refers to the restored copy.
model = tf.keras.models.load_model('toxicity.h5')

In [55]:
# Vectorize a sample comment to check the reloaded model.
input_str = vectorizer("Hey I freaking hate you!")

In [56]:
# Add a leading batch axis before predicting on the single comment.
res = model.predict(np.expand_dims(input_str, 0))



In [57]:
# Per-label decisions at the 0.5 threshold (column order follows df.columns[2:]).
res > 0.5

array([[ True, False,  True, False, False, False]])

In [59]:
def score_comment(comment):
  """Score one comment and report each label as a True/False line.

  Args:
    comment: Raw comment text.

  Returns:
    A string with one "<label>: <bool>" line per label column, where the
    boolean says whether the model's score for that label exceeds 0.5.
  """
  # Wrap the comment in a list so the vectorizer produces a batch of one.
  scores = model.predict(vectorizer([comment]))[0]
  label_names = df.columns[2:]

  return ''.join(
      '{}: {}\n'.format(label, scores[position] > 0.5)
      for position, label in enumerate(label_names)
  )

In [None]:
# Text-in / text-out demo around score_comment.
# The input component is created with gr.Textbox: the older gr.inputs.*
# namespace is no longer available in current Gradio releases (the install
# above is unpinned), where accessing it raises AttributeError.
interface = gr.Interface(
    fn=score_comment,
    inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
    outputs='text',
)

In [61]:
# Start the demo; share=True also requests a temporary public URL, so anyone
# with the link can reach the model while it is running.
interface.launch(share = True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://175b565a-c061-4c67.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


