In [None]:
!pip install tensorflow pandas matplotlib scikit-learn

In [2]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [3]:
 df = pd.read_csv('train.csv')

## Data Prepocessing

<p>
    <b>Text Vectorization:</b> The TextVectorization layer is initialized with a maximum vocabulary size of 200,000 tokens, an output sequence length of 1800, and integer output mode. This prepares the text data by converting words into integer sequences, allowing the model to efficiently process the textual input.
</p>

<p>
    <b>Data Preparation:</b> The adapt() method is called on the vectorizer with the input text data, which learns the vocabulary from the data. The vectorizer is then applied to the input text to produce vectorized representations, which are used to create a TensorFlow dataset.
</p>

<p>
    <b>Dataset Configuration:</b> The dataset is configured for optimal performance by caching it in memory, shuffling the elements to ensure randomness, batching the data into groups of 16 for efficient processing, and prefetching 8 batches to overlap data loading with model training.
</p>

<p>
    <b>Dataset Splits:</b> The dataset is split into training (70%), validation (20%), and testing (10%) sets using the take() and skip() methods. This allows for effective model training, hyperparameter tuning, and evaluation of model performance on unseen data.
</p>

In [4]:
from tensorflow.keras.layers import TextVectorization

In [5]:
X = df['comment_text']
Y = df[df.columns[2:]].values

In [6]:
MAX_FEATURES = 200000

In [7]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES, output_sequence_length=1800, output_mode='int')

# This variable sets the maximum number of unique tokens (words) that 
# the TextVectorization layer will consider from the input text data. 
# In this case, it is set to 200,000, meaning that the layer will only retain the 200,000 most frequent words 
# from the training dataset. Any words beyond this limit will be ignored.

In [8]:
vectorizer.adapt(X.values) 

In [9]:
vectorizer("Hi, i am Bishwa")

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([171,   8,  74, ...,   0,   0,   0], dtype=int64)>

In [11]:
vectorized_text = vectorizer(X.values)

vectorized_text = vectorizer(X.values)

In [12]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, Y))
dataset = dataset.cache()          # caches the dataset in memory, which can improve performance if the dataset fits in available memory.
dataset = dataset.shuffle(160000)  # shuffles the elements in the dataset using a buffer of size 160,000
dataset = dataset.batch(16)        # groups the dataset elements into batches of size 16
dataset = dataset.prefetch(8)      # sets up prefetching, which overlaps data preprocessing and model execution


# Note:
# 1. Shuffling the dataset helps break any unintended correlations in the data and 
# ensures that the model sees a random mix of examples during training.

# 2. Batching the data allows the model to process multiple examples at once, improving computational efficiency.

In [13]:
# Creating the training, validating and testing dataset

train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

## Creating the Sequential Model

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [15]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(MAX_FEATURES+1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(6, activation='sigmoid'))

In [16]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [17]:
model.summary()

In [18]:
history = model.fit(train, epochs=1, validation_data=val)

[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3878s[0m 555ms/step - loss: 0.0866 - val_loss: 0.0464


## Make Predictions

In [19]:
input_text = vectorizer('You are such an idiot! I am going to hit you.')

In [20]:
input_text = input_text.numpy().reshape(1, 1800)
res = model.predict(input_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 307ms/step


In [21]:
(res > 0.5).astype(int)

array([[1, 0, 1, 0, 1, 0]])

In [22]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [23]:
(model.predict(batch_X) > 0.5).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295ms/step


array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [24]:
res.shape

(1, 6)

## Evaluate the model

In [25]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [26]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [28]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8247238993644714, Recall:0.6677242517471313, Accuracy:0.4653961956501007


## Testing and Gradio App

In [None]:
!pip install gradio jinja2

In [30]:
!pip install markupsafe==2.0.1



In [31]:
import tensorflow as tf
import gradio as gr

In [None]:
model.save('toxicity.h5')

In [None]:
model = tf.keras.models.load_model('toxicity.h5')

In [34]:
input_str = vectorizer('hey i freaken hate you!')

In [35]:
res = model.predict(np.expand_dims(input_str,0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 264ms/step


In [36]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [38]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
                         outputs='text')

In [40]:
interface.launch()

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----

To create a public link, set `share=True` in `launch()`.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
