<h3>Import Dependencies</h3>

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np

<h3>Read Data</h3>

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path (Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training CSV from the mounted Drive into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/train.csv')

<h3>Preprocess</h3>

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
# Features are the raw comment strings; targets are every column from the
# third one onward, taken positionally as a NumPy array.
X = df['comment_text']
y = df.iloc[:, 2:].values

In [None]:
# Upper bound on the number of distinct tokens kept in the vocabulary.
MAX_FEATURES = 200_000

In [None]:
# Text vectorizer that turns each comment into a fixed-length sequence of integer token ids.
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,  # vocabulary capped at MAX_FEATURES tokens
                               output_sequence_length=1800,  # every output sequence has length 1800
                               output_mode='int')  # emit integer token indices

In [None]:
# Learn the vocabulary from all comments; must run before the vectorizer is called on text.
vectorizer.adapt(X.values)

In [None]:
# Vectorize every comment in a single call; the result holds one id sequence per comment.
vectorized_text = vectorizer(X.values)

In [None]:
#Data Pipeline
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y)) #pair each vectorized comment with its labels
dataset = dataset.cache() #keep elements cached after the first pass
# The train/val/test partitions are carved out of this dataset with take()/skip(),
# so the element order must be the same on every iteration. By default shuffle()
# reshuffles on each iteration, which lets the same comment fall into different
# partitions on different passes (validation/test data leaking into training).
# A fixed seed with reshuffle_each_iteration=False shuffles once and keeps that order.
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(16) #16 comments per batch
dataset = dataset.prefetch(8) #prevent bottlenecks

In [None]:
# Pull one batch from the pipeline to inspect the (features, labels) pair it yields.
batch_X, batch_y = dataset.as_numpy_iterator().next()

In [None]:
# Partition the batched dataset by batch count: first 70% for training,
# the following 20% for validation, and the 10% after the 90% mark for testing.
n_batches = len(dataset)
n_train = int(n_batches*.7)
train = dataset.take(n_train)
val = dataset.skip(n_train).take(int(n_batches*.2))
test = dataset.skip(int(n_batches*.9)).take(int(n_batches*.1))

In [None]:
# Iterate the training partition as NumPy arrays and display its first batch.
train_generator = train.as_numpy_iterator()
next(train_generator)

<h3>Creating the Sequential Model</h3>

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from keras.api._v2.keras import activations

In [None]:
#Artificial Neural Network, declared as a single ordered stack of layers
model = Sequential([
    #Embedding/Input Layer: a 32-dimensional vector per token id
    Embedding(MAX_FEATURES+1, 32),

    #LSTM Layer: wrapped so the sequence is read forward and backward
    Bidirectional(LSTM(32, activation='tanh')),

    #Fully Connected Layers
    Dense(128,activation='relu'),
    Dense(256,activation='relu'),
    Dense(128,activation='relu'),

    #Final layer: six sigmoid units, each producing a value between 0 and 1
    Dense(6,activation='sigmoid'),
])

In [None]:
# Configure training: binary cross-entropy loss and the Adam optimizer; no extra metrics are requested.
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [None]:
#Train the model

# Single epoch over the training partition; the validation partition is supplied via validation_data.
history = model.fit(train, epochs=1, validation_data = val)



<h3>Make Predictions</h3>

In [None]:
# Vectorize one sample comment; the vectorizer must already have been adapted in this session.
input_text = vectorizer('You freaking suck!')

FailedPreconditionError: ignored

In [None]:
# Grab one batch from the test partition as a reference for the input shape.
batch = test.as_numpy_iterator().next() #Input Shape

In [None]:
# Add a leading axis so the single vectorized comment is passed to the model as a batch of one.
res = model.predict(np.expand_dims(input_text,0))

In [None]:
# Display the predicted scores, one value per label column.
res

array([[0.98750544, 0.19482045, 0.90841395, 0.02767715, 0.75693005,
        0.10827221]], dtype=float32)

In [None]:
# Label names, in the same order as the model's six outputs; a score above 0.5 is treated as positive.
df.columns[2:] #Labels (0.5 = threshold value)

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

<h3>Evaluating Model</h3>

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Streaming metrics accumulated batch by batch during evaluation.
pre = Precision()
re = Recall()
# The targets are independent 0/1 labels and the evaluation loop feeds them in
# flattened, so accuracy has to be measured per label against a 0.5 threshold.
# CategoricalAccuracy instead compares argmax positions, which is meaningless
# for flattened multi-label vectors; BinaryAccuracy is the matching metric.
acc = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Walk the test partition once, folding every batch into the running metrics.
for batch in test.as_numpy_iterator():
  X_true, y_true = batch

  # Collapse predictions and labels to 1-D so each label counts as one binary decision.
  yhat = model.predict(X_true).flatten()
  y_true = y_true.flatten()

  for metric in (pre, re, acc):
    metric.update_state(y_true, yhat)

In [None]:
# Report the metric values accumulated over the test partition.
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

Precision: 0.8478260636329651, Recall: 0.6419752836227417, Accuracy: 0.4653961956501007


<h3>Test</h3>

In [None]:
!pip install gradio jinja2

In [None]:
import gradio as gr

In [None]:
# Persist the trained model to an HDF5 file in the current working directory.
model.save('toxicity.h5')

In [None]:
# Replace the in-memory model with one loaded from Drive.
# NOTE(review): model.save('toxicity.h5') writes to the working directory, not to this
# Drive path — confirm a copy of the file exists at this location before running.
model = tf.keras.models.load_model('/content/drive/MyDrive/toxicity.h5')

In [None]:
# Vectorize a second sample comment; the vectorizer must already have been adapted in this session.
input_str = vectorizer('I forgive but never forget')

FailedPreconditionError: ignored

In [None]:
# Score the sample with the reloaded model, passing it as a batch of one.
res = model.predict(np.expand_dims(input_str, 0))

In [None]:
# Show the scores next to the label names they correspond to (same order).
print(res)
print(df.columns[2:])

[[0.84924734 0.03097013 0.48790324 0.02202311 0.444767   0.05522211]]
Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')


In [None]:
def score_comment(comment):
  """Score one comment and report each label as True/False.

  The comment is vectorized as a batch of one and passed to the model; a
  label is reported True when its predicted score is above 0.5.

  Args:
    comment: the text to evaluate.

  Returns:
    A string with one '<label>: <True|False>' line per label column,
    each terminated by a newline.
  """
  label_names = df.columns[2:]
  scores = model.predict(vectorizer([comment]))[0]
  lines = ['{}: {}\n'.format(name, scores[pos]>0.5)
           for pos, name in enumerate(label_names)]
  return ''.join(lines)

In [None]:
#Gradio Interface
# Single text box in, plain text out, backed by score_comment.
# The component is referenced as gr.Textbox: the legacy gr.inputs namespace was
# deprecated in Gradio 3 and removed in later releases, and the install above is
# unpinned, so gr.inputs.Textbox raises AttributeError on a current Gradio.
interface = gr.Interface(fn=score_comment,
                         inputs=gr.Textbox(lines=2, placeholder = 'Comment to evaluate'),
                         outputs='text')

In [None]:
# Start the web UI; share=True requests a publicly reachable link in addition to the local one.
interface.launch(share=True)