In [1]:
import os 
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras

from keras.layers import TextVectorization, Embedding, LSTM, Bidirectional, Dense
from keras.models import Sequential
from keras.metrics import Precision, Recall

2024-01-13 13:44:21.161182: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv(os.path.join('csvFiles','train.csv','train.csv'))
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,racist
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


Preprocessing Data

In [5]:
X = df['comment_text'] #all the act text
y = df[df.columns[2:]].values #the lavels of said text (toxic, super toxic, etc)

In [3]:
max_features = 200000  # number of words in the vacab,  
max_len = 1800  # Sequence length to pad the outputs to.

vectorizer = TextVectorization(max_tokens = max_features, output_sequence_length = max_len,output_mode = 'int')
# this means that it is going to map every single word to an integer value

In [6]:
vectorizer.adapt(X.values)
vectorized_text = vectorizer(X.values)
vectorized_text


<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [6]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)  # how large our buffer size needs to be
dataset = dataset.batch(16)  # each batch represented as a series of 16 samples
dataset = dataset.prefetch(8)  # helps prevent bottlenecks

Data Splitting

In [7]:
train = dataset.take(int(len(dataset)*.9))   # 90%
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.05))   # 5%
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.05))  # 5%

In [8]:
len(train), len(val), len(test)

(8976, 498, 498)

Model

In [9]:
model = Sequential()

# Create the embedding layer 
model.add(Embedding(max_features+1, 32))

# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))

# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))

# Final layer
model.add(Dense(6, activation='sigmoid'))

In [10]:
model.compile(optimizer='Adam', loss='BinaryCrossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

Training

In [11]:
hist = model.fit(train, epochs=3, validation_data=val, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [12]:
loss, Acc = model.evaluate(test)

print('Test Loss: ', loss)
print('Test Accuracy: ', Acc)

Test Loss:  0.031640127301216125
Test Accuracy:  0.9929718971252441


Make Predictions

In [7]:
input_text = vectorizer('You freaking suck! I am going to hit you.')

res = model.predict(np.expand_dims(input_text, axis=0))

NameError: name 'model' is not defined

In [14]:
(res > 0.5).astype(int)
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [15]:
pre = Precision()
re = Recall()

for batch in test.as_numpy_iterator():
    # unpack the batch
    X_test, y_test = batch
    # make a prediction
    y_pred = model.predict(X_test)
    
    y_test = y_test.flatten()
    y_pred = y_pred.flatten()
    
    pre.update_state(y_test, y_pred)
    re.update_state(y_test, y_pred)



In [16]:
model.save('toxic.h5')
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}')

  saving_api.save_model(


Precision: 0.8278451561927795, Recall:0.8326554298400879


Gradio

In [8]:
import gradio as gr
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
model = tf.keras.models.load_model('toxic.h5')
input_str = vectorizer('YOU ARE A NIGGER') # input box

In [26]:
res = model.predict(np.expand_dims(input_str, 0))
res



array([[0.9805566 , 0.14262813, 0.6239584 , 0.12638627, 0.7979021 ,
        0.38881087]], dtype=float32)

In [33]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)

    text = ''
    for idx, col in enumerate(df.columns[[2,5,6,7]]): # show important categories
        text += '{}: {}\n'.format(col, results[0][idx] > 0.5)
    
    return text

In [34]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')

In [35]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7862
Running on public URL: https://fdd891c1b5768d56e5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




