<a href="https://colab.research.google.com/github/florianreyes/toxicity_analysis/blob/main/toxicity_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [3]:
df = pd.read_csv('/content/train.csv')

In [4]:
df.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


### 1) Pre-processing the data

In [5]:
from tensorflow.keras.layers import TextVectorization

##### Separate X and y data

In [6]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [7]:
MAX_FEATURES = 20000

In [8]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,output_sequence_length = 1800, output_mode = "int")

##### Adapt the vectorizer to our data

In [9]:
vectorizer.adapt(X.values)

##### Create the matrix with the tokenized data
##### Shape (observations x output_sequence_length)

In [10]:
vectorized_text = vectorizer(X.values)

In [11]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[ 645,   76,    2, ...,    0,    0,    0],
       [   1,   54, 2489, ...,    0,    0,    0],
       [ 425,  441,   70, ...,    0,    0,    0],
       ...,
       [   1, 7392,  383, ...,    0,    0,    0],
       [   5,   12,  534, ...,    0,    0,    0],
       [   5,    8,  130, ...,    0,    0,    0]])>

#### Create data pipeline

In [12]:
# Map - Cache - Shuffle - Batch - Prefetch

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [13]:
batch_X, batch_y = dataset.as_numpy_iterator().next()

In [14]:
train = dataset.take(int(len(dataset)*0.7))
val = dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2))
test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))

# 2) Create the models and Layers

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout ,Dense, Embedding

In [16]:
model = Sequential()

#Embedding layer
model.add(Embedding(MAX_FEATURES + 1, 32 ))

#Bidirectional LSTM layer
model.add(Bidirectional(LSTM(32, activation = 'tanh')))

#Dense fully connected layers feature extractors
model.add(Dense(128,activation = 'relu'))
model.add(Dense(256,activation = 'relu'))
model.add(Dense(128,activation = 'relu'))

#Final layer
model.add(Dense(6, activation = 'sigmoid'))

In [17]:
model.compile(loss = 'BinaryCrossentropy', optimizer = 'Adam')

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          640032    
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [19]:
history = model.fit(x = train, epochs = 1, validation_data = val)



## 3 Lets try the model

In [30]:
txt_input = vectorizer('You suck!')

In [31]:
prediction1 = model.predict(np.expand_dims(txt_input,0))
prediction1



array([[0.972644  , 0.08907304, 0.7446986 , 0.02228453, 0.6200376 ,
        0.10477202]], dtype=float32)

## 4 Evaluating the model


In [32]:
from tensorflow.keras.metrics import CategoricalAccuracy, Recall, Precision

In [36]:
prec = Precision()
acc = CategoricalAccuracy()
rec = Recall()

In [37]:
for batch in test.as_numpy_iterator():

  #Unpack the batch comprized by text and labels
  batch_X, batch_y = batch

  y_hat = model.predict(batch_X)

  y_hat = y_hat.flatten()
  y_true = batch_y.flatten()

  prec.update_state(y_true, y_hat)
  acc.update_state(y_true, y_hat)
  rec.update_state(y_true, y_hat)




In [38]:
print(f'Precision: {prec.result().numpy()}, Accuracy: {acc.result().numpy()}, Recall: {rec.result().numpy()}')

Precision: 0.821092963218689, Accuracy: 0.5025075078010559, Recall: 0.6716970205307007
