In [49]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [50]:
df = pd.read_csv(
    os.path.join('train.csv')
)
#Insert the data i.e the CSV into the model

In [51]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and numbers
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

# Apply the preprocessing to the text data
df['comment_text'] = df['comment_text'].astype(str).apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
df.head()
#To see what is inside our csv file but only the first four items

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww matches background colour im seemingly st...,0,0,0,0,0,0
2,000113f07ec002fd,hey man im really trying edit war guy constant...,0,0,0,0,0,0
3,0001b41b1c6bb37e,cant make real suggestions improvement wondere...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page thats,0,0,0,0,0,0


In [53]:
from tensorflow.keras.layers import TextVectorization
# To make our input into tokens

In [54]:
X = df['comment_text']
Y = df[df.columns[2:]].values
#extract columns as in a matrix format

In [55]:
# MAX_FEATURES = 200000
#number of words in the vocab

In [56]:
# vectorizer = TextVectorization(max_tokens = MAX_FEATURES,
#                                output_sequence_length =  1800,  #Each sentence that we are taking from the csv, we are going to cap it onto 1800
#                                output_mode = 'int')
from tensorflow.keras.layers import TextVectorization

MAX_FEATURES = 50000  # Reduce vocabulary size to remove noise
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES, 
    output_sequence_length=512,  # Reduce sequence length
    output_mode='int',
    ngrams=2  # Capture bi-grams for better context
)


In [57]:
vectorizer.adapt(X.values)
#It is gonna learn all the things that are in our voacabulary

In [58]:
vectorized_text = vectorizer(X.values)

In [59]:
# Expand dimensions to match expected LSTM input
# vectorized_text = tf.expand_dims(vectorized_text, -1)

# Prepare the dataset
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, Y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(48)  # Updated batch size to 48
dataset = dataset.prefetch(tf.data.AUTOTUNE)


In [60]:
# Ensure input shape is (batch_size, sequence_length, 1)
# train = train.map(lambda x, y: (tf.expand_dims(x, axis=-1), y))
# val = val.map(lambda x, y: (tf.expand_dims(x, axis=-1), y))


In [61]:
for batch_x, batch_y in dataset.take(1):
    print(batch_x.shape, batch_y.shape)

(48, 512) (48, 6)


In [62]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,Y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(32)
dataset = dataset.prefetch(8)

In [63]:
batch_X, batch_Y = dataset.as_numpy_iterator().next()
 # This provides us with our texts and columns in a vectorized form
 # Vectorized form means that all the words that we have is in numerical form which lies between 1-1800 cause its the limit that is set
 # like for example happy is given the number as 165
 # the array @ the bottom is our toxicity levels, racism levels and whatever the tokens are meant to be as
 # like if the comment is toxic then the fist index of the matrix will turn into 1 if not then 0

In [64]:
train = dataset.take(int(len(dataset)*.7)) #We are going to use 70% of our dataset to train our model
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*2)) #We are going to use 20% of our dataset to validate our model
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*1)) #We are going to use 10% of our dataset to test our model

In [65]:
train_generator = train.as_numpy_iterator()
# What is gonna happen is that our model will go on to train all of theses as BATCHES
# Once it finishes one batch it will move on to another
# With the help of train_generator.next() it will move onto the next batch

In [66]:
train_generator.next()
# If you do refresh it a new batch will be generated

(array([[  154,     1,    57, ...,     0,     0,     0],
        [  727,   860,  2604, ...,     0,     0,     0],
        [ 2465,  1982,  1214, ...,     0,     0,     0],
        ...,
        [10008,    55,  1763, ...,     0,     0,     0],
        [ 7021,     1,   520, ...,     0,     0,     0],
        [   83,   404,   548, ...,     0,     0,     0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0,

In [67]:
# from tensorflow.keras.models import Sequential #Sequential is an API model
# from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [68]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

model = Sequential()
model.add(Embedding(input_dim=MAX_FEATURES+1, output_dim=128, input_length=512))  # Add input_length
model.add(Bidirectional(LSTM(64, return_sequences=True)))  # Stacked LSTM
model.add(Bidirectional(LSTM(32)))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))  # Prevent overfitting
model.add(Dense(128, activation='relu'))
model.add(Dense(6, activation='sigmoid'))  # Multi-label classification

model.compile(loss='binary_crossentropy', 
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), 
              metrics=['accuracy'])




In [69]:
model.compile(loss = 'BinaryCrossentropy', optimizer = 'Adam')

In [70]:
model.summary()

In [71]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(Y.flatten()), 
    y=Y.flatten()
)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Add early stopping to prevent overfitting
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=3,          # Stop if val_loss doesn't improve after 3 epochs
    restore_best_weights=True  # Restore model to best epoch weights
)

# Train the model with early stopping, class weights, and optimized batch size
history = model.fit(train, 
                    epochs=10,   # Lowered epochs to fit within 6-7 hours
                    batch_size=48,  # Balanced batch size for speed and accuracy
                    validation_data=val, 
                    class_weight=class_weights_dict, 
                    callbacks=[early_stopping])

# history = model.fit(train, epochs=5, validation_data=val)
# Training the data set


Epoch 1/10
[1m   1/3490[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m13:16:58[0m 14s/step - loss: 0.3589

InvalidArgumentError: Graph execution error:

Detected at node GatherV2 defined at (most recent call last):
<stack traces unavailable>
Error in user-defined function passed to MapDataset:70 transformation with iterator: Iterator::Root::Prefetch::ParallelMapV2: indices[16] = 2 is not in [0, 2)
	 [[{{node GatherV2}}]]
	 [[IteratorGetNext]] [Op:__inference_one_step_on_iterator_9362]

In [22]:
from matplotlib import pyplot as plt

In [23]:
input_text = vectorizer('Fuck Off!')

In [24]:
model.predict(np.expand_dims(input_text,0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 666ms/step


array([[0.9976609 , 0.31855732, 0.99362105, 0.00482566, 0.74220246,
        0.02615224]], dtype=float32)

In [25]:
batch = test.as_numpy_iterator().next()

In [26]:
df.columns[2:] #to check what is the subject of the sentence

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [27]:
batch = test.as_numpy_iterator().next()

In [28]:
batch_X, batch_Y = test.as_numpy_iterator().next()

In [29]:
(model.predict(batch_X) > 0.5).astype(int)
# If above threshold of 0.5 then commited i.e positive

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 749ms/step


array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [30]:
res = model.predict(np.expand_dims(input_text,0)) #make the model to predict

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step


In [31]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy #To evaluate the model

In [32]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()
# If you do tend to add more data into the CSV file, since the data has been put into batches, these tend to overhead them
# What I mean to explain is that on addition of new data, new batches will be created, the batches present beforehand will be pushed above so that new data can be filled into these batch
# Ofcourse the data given will be then converted into matrics for prediction

In [33]:
for batch in test.as_numpy_iterator():  #Go through each batch
    # Unpack the batch
    X_true, Y_true = batch
    # Make a prediction
    yhat = model.predict(X_true)

    # Flatten the predictions
    Y_true = Y_true.flatten() #make a huge array into a single line array
    yhat = yhat.flatten()

    pre.update_state(Y_true, yhat) #make update to the batches with new data
    re.update_state(Y_true, yhat)
    acc.update_state(Y_true, yhat)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 255ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 247ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 246ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 262ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 251ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 269ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 248ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 247ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 246ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [34]:
print(f'Precision: {pre.result().numpy()},Recall:{re.result().numpy()},Accuracy:{acc.result().numpy()}')

Precision: 0.8282856345176697,Recall:0.8526887893676758,Accuracy:0.36673346161842346


In [35]:
import tensorflow as tf
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [36]:
model.save('toxicity.h5')



In [37]:
model = tf.keras.models.load_model('toxicity.h5')



In [38]:
input_str = vectorizer('hey i freaken hate you!')

In [39]:
res = model.predict(np.expand_dims(input_str,0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 814ms/step


In [40]:
res > 0.5

array([[ True, False, False, False, False, False]])

In [41]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment]) #convert text into token
    results = model.predict(vectorized_comment)

    # Unpack all the results
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)

    return text

In [42]:
interface = gr.Interface(
    fn=score_comment,
    inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
    outputs=gr.Textbox()
)

In [44]:
interface.launch(share=True)

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
* Running on public URL: https://8c8f710d5ec9b0b69f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step
