In [None]:
# Install the notebook's third-party dependencies; the %pip magic targets the
# environment of the running kernel.
%pip install pandas numpy tensorflow matplotlib

In [3]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

In [4]:
# Location of train.csv. Set the TOXICITY_TRAIN_CSV environment variable to read
# the file from somewhere else; without it the original local path is used.
train_csv_path = os.environ.get(
    "TOXICITY_TRAIN_CSV",
    "/Users/maruthi_chdl/Desktop/Comment Toxicity/comment_toxicity/data/train.csv",
)
df = pd.read_csv(train_csv_path)

In [None]:
# Display row 6 restricted to every column from the third one onward.
df.iloc[:, 2:].iloc[6]

## 1. Preprocessing

In this preprocessing step we tokenize the data: each word in a sentence is translated into an identifier by mapping it to a number. To complete this step I will be using the `TextVectorization` layer from Keras Layers.

In [6]:
from tensorflow.keras.layers import TextVectorization

In [7]:
# Model input: the comment text column.
x = df.loc[:, 'comment_text']
# Targets: every column from the third one onward, as a NumPy array.
y = df.iloc[:, 2:].to_numpy()

In [8]:
# Upper bound on the vocabulary size (number of words in the vocab).
max_words = 200_000
# Length of each vectorized comment sequence.
output_length = 1_800

In [9]:
# Text-to-integer layer: vocabulary capped at `max_words`, every output
# sequence has length `output_length`.
vectorization = TextVectorization(
    max_tokens=max_words,
    output_sequence_length=output_length,
    output_mode='int',
)


In [10]:
# Learn the vocabulary from the comments, then turn every comment into a
# fixed-length sequence of integer token ids.
# (The previous bare `vectorization.get_vocabulary()` call built the whole
# vocabulary list and threw it away; call it in its own cell when the
# vocabulary actually needs to be inspected.)
vectorization.adapt(x.values)
vectorized_text = vectorization(x.values)

### 1.1 Creating a Dataset for the generated vectorized values

In [11]:
# MCSHBAP - Map, Cache, Shuffle, Batch, Prefetch
#
# The dataset is later split with take()/skip(). By default shuffle() draws a
# new permutation every time the dataset is iterated, so each split would see
# a different ordering and the splits would overlap. Fixing the order with
# reshuffle_each_iteration=False keeps the take()/skip() slices disjoint.
vector_dataset = (
    tf.data.Dataset.from_tensor_slices((vectorized_text, y))
    .cache()
    .shuffle(160000, reshuffle_each_iteration=False)
    .batch(16)
    .prefetch(8)
)

In [12]:
# Grab a single (features, labels) batch as NumPy arrays.
batch_x, batch_y = next(vector_dataset.as_numpy_iterator())

### 1.2 Preparing Datasets for Validation, Testing and Training

In [13]:
# Split the batched dataset by position: first 70% of the batches for training,
# the next 20% for validation, and the last 10% for testing.
# Previously `test = val = ...` rebound `val` to the test slice, so validation
# ran on the test data and the 20% validation slice was never used.
num_batches = len(vector_dataset)
train = vector_dataset.take(int(num_batches * .7))
val = vector_dataset.skip(int(num_batches * .7)).take(int(num_batches * .2))
test = vector_dataset.skip(int(num_batches * .9)).take(int(num_batches * .1))

In [None]:
# Iterate the training split as NumPy arrays and show its first batch.
train_generator = train.as_numpy_iterator()
next(train_generator)

## 2. Creating LSTM Sequential Model

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional, Dropout

In [None]:
# Layer stack, in order:
#   1. embedding of the integer token ids
#   2. bidirectional LSTM over the embedded sequence
#   3. three fully connected feature-extraction layers
#   4. final layer with six sigmoid outputs
model = Sequential([
    Embedding(input_dim=max_words+1, output_dim=32, input_length=output_length),
    Bidirectional(LSTM(32, activation='tanh')),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(6, activation='sigmoid'),
])


In [None]:
# Compile the model.
# The bare 'accuracy' string is resolved by Keras from the output shape; with
# six output units it becomes categorical (argmax) accuracy, which does not
# describe independent per-label sigmoid outputs. BinaryAccuracy thresholds each
# output separately; naming it 'accuracy' keeps the history keys unchanged.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)
# Build with the sequence length configured for the vectorizer rather than a
# repeated literal, so the two cannot drift apart.
model.build((None, output_length))
# Print the model summary
model.summary()

In [None]:
# Train for a single epoch on the training split, evaluating on `val`.
history = model.fit(
    x=train,
    validation_data=val,
    epochs=1,
)

In [None]:
# Display the values recorded on the object returned by model.fit.
history.history

## 3. Making Predictions

In [19]:
# First batch of the test split, as NumPy arrays.
batch = next(test.as_numpy_iterator())

In [20]:
# Unpack the first test batch into features and labels.
batch_x, batch_y = next(test.as_numpy_iterator())

In [None]:
# Turn the predicted values into 0/1 flags using a 0.5 threshold.
np.greater(model.predict(batch_x), 0.5).astype(int)

In [None]:
# Keep the raw (un-thresholded) predictions for the batch.
res = model.predict(x=batch_x)

## 4. Saving the Model


In [None]:
# Where to write the trained model. Set the TOXICITY_MODEL_PATH environment
# variable to save elsewhere; without it the original local path is used.
model_path = os.environ.get(
    "TOXICITY_MODEL_PATH",
    '/Users/maruthi_chdl/Desktop/Comment Toxicity/comment_toxicity/models/toxicity.h5',
)
# Make sure the target directory exists so the save does not fail on a missing folder.
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
model.save(model_path)

## 5. Test and Gradio

In [None]:
# Install the packages needed for the demo UI section into the running
# kernel's environment.
%pip install gradio jinja2

In [None]:
import gradio as gr

In [28]:
def score_comment(comment):
    """Score one comment and report each label as True/False.

    The comment is vectorized with the notebook's `vectorization` layer and
    passed to `model.predict`. Label names are taken from `df.columns[2:]`;
    a label is reported as True when its predicted value is above 0.5.

    Args:
        comment: The comment text to score.

    Returns:
        A string with one "<label>: <True|False>" line per label, each line
        terminated by a newline.
    """
    results = model.predict(vectorization([comment]))
    scores = results[0]

    lines = [
        '{}: {}\n'.format(col, scores[idx]>0.5)
        for idx, col in enumerate(df.columns[2:])
    ]
    return ''.join(lines)

In [34]:
# Gradio UI: a two-line textbox feeds score_comment, and the returned string is
# shown as plain text.
interface = gr.Interface(
    fn=score_comment,
    inputs=gr.Textbox(lines=2, placeholder='Comments to Score'),
    outputs='text',
)

In [None]:
# Start serving the Gradio interface.
interface.launch()

In [None]:
# Shut the Gradio interface down once finished with it.
interface.close()