# Comment Toxicity Detection Model

### 1) Import the Environment and Data

In [1]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv(os.path.join('data','train.csv','train.csv'))

In [3]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


### 2) Preprocessing

In [4]:
from tensorflow.keras.layers import TextVectorization

In [5]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [6]:
Max_Features = 200000 # number of words in vocab

In [7]:
vectorizer = TextVectorization(max_tokens=Max_Features, output_sequence_length=1800,output_mode='int')

In [8]:
vectorizer.adapt(X.values)
vectorized_text = vectorizer(X.values)

In [9]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [10]:
batch_X, batch_y = dataset.as_numpy_iterator().next()

In [11]:
train = dataset.take(int(len(dataset))*7)
val = dataset.skip((int(len(dataset))*7)).take(int(len(dataset))*2)
test = dataset.skip((int(len(dataset))*9)).take(int(len(dataset))*1)

In [12]:
train_generator = train.as_numpy_iterator()

In [13]:
train_generator.next()

(array([[    32,    198,      8, ...,      0,      0,      0],
        [    57,    101,     16, ...,      0,      0,      0],
        [  4369,     71,   7951, ...,      0,      0,      0],
        ...,
        [193258,   8834,     70, ...,      0,      0,      0],
        [ 78842,    589,      1, ...,      0,      0,      0],
        [   609,      8,    622, ...,      0,      0,      0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

### 3) Sequential Model

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [15]:
model = Sequential()

model.add(Embedding(Max_Features+1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(6, activation='sigmoid'))

In [16]:
model.compile(loss='BinaryCrossentropy', optimizer = 'Adam')

In [None]:
history = model.fit(train, epochs = 10, validation_data = val)

Epoch 1/10
 654/9974 [>.............................] - ETA: 42:25 - loss: 0.1103

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.figure(figsize = (8,5))
pd.DataFrame(history.history).plot()
plt.show()

### 4) Evaluation of Model 

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
presicion = Precision()
recall = Recall()
accuracy = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    predict = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    predict = predict.flatten()
    
    pre.update_state(y_true, predict)
    re.update_state(y_true, predict)
    acc.update_state(y_true, predict)

In [None]:
print(f'Precision: {presicion.result().numpy()}, Recall:{recall.result().numpy()}, Accuracy:{accuracy.result().numpy()}')

### 5) Test and App

In [None]:
!pip install gradio jinja2

In [None]:
import gradio as gr

In [None]:
model.save('toxicity.h5')

In [None]:
model = tf.keras.models.load_model('toxicity.h5')

In [None]:
input_str = vectorizer('hey i freaken hate you!')

In [None]:
res = model.predict(np.expand_dims(input_str,0))

In [None]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [None]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')

In [None]:
interface.launch(share=True)