<a href="https://colab.research.google.com/github/dishabarmola/Toxicity_Classifier/blob/main/Comment_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import sklearn
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training data; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Preview the first rows of the loaded frame to check the column layout.
df.head()

VECTORISATION (PREPROCESSING DATA)

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
# Inspect the raw text column that is later used as the model input `x`.
df['comment_text']


In [None]:
# Preview the target matrix: every column from position 2 onward, as an array.
df.iloc[:, 2:].values

In [None]:
# Model input: the comment text column.
x = df.loc[:, 'comment_text']
# Targets: all columns from position 2 onward
# (presumably the per-label 0/1 flags -- confirm against train.csv).
y = df.iloc[:, 2:].values

In [None]:
# Upper bound on the vocabulary size handed to the text vectorizer.
MAX_FEATURES = 200_000

In [None]:
# Text -> integer-id vectorizer. Every comment is converted to a sequence of
# exactly 1800 ids (the input length later passed to the model).
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [None]:
# Fit the vectorizer's vocabulary on the comment texts.
vectorizer.adapt(x.values)

In [None]:
# Show only the start of the learned vocabulary; the full list can hold up to
# MAX_FEATURES entries and would flood the cell output.
vectorizer.get_vocabulary()[:20]

In [None]:
# Sanity check: vectorize a short sample string and show the first three ids.
vectorizer('i am good')[:3]

In [None]:
# Vectorize every comment in one call; the result feeds the tf.data pipeline.
vectorized_text = vectorizer(x.values)

In [None]:
# Display the vectorizer output computed in the previous cell.
vectorized_text

In [None]:
# Data pipeline: slices -> cache -> shuffle -> batch -> prefetch.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# The train/val/test subsets are carved out of this dataset with take()/skip(),
# so the element order must be identical on every pass. By default shuffle()
# draws a new permutation each time the dataset is iterated, which would let
# validation/test rows end up in the training subset on later epochs.
# reshuffle_each_iteration=False (with a fixed seed) pins one permutation.
# Trade-off: the training order is no longer re-randomised between epochs.
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)


In [None]:
# 70% / 20% / remainder split, measured in batches.
# Sizes are computed once and the subsets are contiguous, so no batch is left
# out of all three subsets because of independent int() truncation.
n_batches = len(dataset)
n_train = int(n_batches * .7)
n_val = int(n_batches * .2)

train = dataset.take(n_train)
val = dataset.skip(n_train).take(n_val)
# Everything after train + val (roughly the last 10%).
test = dataset.skip(n_train + n_val)

MODEL CREATION AND TRAINING

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [None]:
# Multi-label text classifier, declared as a single ordered list of layers.
model = Sequential([
    # Token ids -> 32-dimensional embeddings (vocabulary size + 1 rows).
    Embedding(MAX_FEATURES+1, 32),
    # Sequence encoder: LSTM run in both directions.
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature extractor with ReLU non-linearities.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Six independent sigmoid outputs, each in the range 0-1.
    Dense(6, activation='sigmoid'),
])


In [None]:
# Binary cross-entropy is applied to each sigmoid output; Adam is the optimizer.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Train for 10 epochs on the training subset, evaluating on `val` after each
# epoch; the returned History object is kept for plotting.
history = model.fit(
    x=train,
    validation_data=val,
    epochs=10,
)

PLOT TRAINING HISTORY

In [None]:
# DataFrame.plot() creates its own figure, so the size has to be passed to it
# directly; a preceding plt.figure(figsize=...) only produces an extra, empty
# figure and leaves the real plot at the default size.
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.show()

PREDICTIONS AND EVALUATION

In [None]:
# Vectorize a single sample comment to try the trained model on.
text = vectorizer('you bitch! how dare you speak to me like that')

In [None]:
# Wrap the single vectorized comment in a list so the array passed to predict()
# has a leading axis.
res = model.predict(np.array([text]))

In [None]:
# Columns from position 2 onward: the same slice that defines the targets `y`
# and that `commentor` uses to name the model outputs.
df.columns[2:]

In [None]:
# Display the prediction array for the sample comment.
res

In [None]:
# Same prediction again, this time adding the leading axis with np.expand_dims.
res  = model.predict(np.expand_dims(text, 0))

In [None]:
# Display the prediction array from the np.expand_dims variant.
res

In [None]:
from tensorflow.keras.metrics import Precision , Recall , CategoricalAccuracy

In [None]:
# Streaming metrics, updated batch by batch in the evaluation loop.
pre = Precision()
re = Recall()
# The evaluation loop feeds flattened 0/1 labels and sigmoid scores.
# CategoricalAccuracy compares argmax positions along the last axis (one-hot,
# single-label targets), which is not meaningful for such vectors.
# BinaryAccuracy thresholds each score and compares element-wise instead.
acc = tf.keras.metrics.BinaryAccuracy()


In [None]:
# Accumulate the metrics over every batch of the test subset.
for batch in test.as_numpy_iterator():
  # Each batch is an (inputs, labels) pair.
  x_true, y_true = batch
  # One forward pass for this batch. predict_on_batch is used because
  # model.predict() is meant for large inputs and is not intended to be
  # called inside a loop over small batches.
  yhat = model.predict_on_batch(x_true)
  # Flatten labels and scores so every (sample, label) pair is one element.
  y_true = y_true.flatten()
  yhat = yhat.flatten()
  # Update the running metric states.
  pre.update_state(y_true, yhat)
  re.update_state(y_true, yhat)
  acc.update_state(y_true, yhat)

In [None]:
# Report the metric values accumulated over the test subset.
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

GRADIO

In [None]:
!pip install gradio jinja2

In [None]:
import gradio as gr

SAVE MODEL

In [None]:
# Only `model` is written to disk. `vectorizer` is a separate object and is not
# part of this file, so anything that loads the model also needs the fitted
# vectorizer to turn text into token ids.
# NOTE(review): the .h5 suffix presumably selects Keras' legacy HDF5 format --
# confirm against the Keras version in use.
model.save('comment_toxicity.h5')

In [None]:
# NOTE(review): google.colab is presumably importable only inside a Colab
# runtime -- this cell is expected to fail elsewhere.
from google.colab import files
# Download the model file written by the previous cell.
files.download('comment_toxicity.h5')

In [None]:
def commentor(comment):
  """Score one comment and report each label as True/False.

  The comment is vectorized with the notebook-level `vectorizer`, scored by
  `model`, and each output is compared against a 0.5 threshold.

  Args:
    comment: the text to score.

  Returns:
    A string with one '<column>: <bool>' line per column in `df.columns[2:]`,
    each line terminated by a newline.
  """
  token_ids = vectorizer([comment])
  # predict() returns one row per input; a single comment means a single row.
  scores = model.predict(token_ids)[0]
  label_names = df.columns[2:]
  return ''.join(
      '{}: {}\n'.format(name, scores[pos]>0.5)
      for pos, name in enumerate(label_names)
  )

In [None]:
# Gradio UI: a two-line text box feeding `commentor`, with plain-text output.
interface = gr.Interface(
    fn=commentor,
    inputs=gr.Textbox(placeholder='Comment here...', lines=2),
    outputs='text',
)

In [None]:
# Start the demo. NOTE(review): share=True presumably requests a publicly
# reachable Gradio link -- confirm that exposing the demo is intended.
interface.launch(share=True)