In [8]:
!pip install tensorflow pandas matplotlib scikit-learn

In [12]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [None]:
# Load the training split; the CSV sits inside a directory that is itself
# named 'train.csv'.
train_csv_path = os.path.join(
    'jigsaw-toxic-comment-classification-challenge',
    'train.csv',
    'train.csv',
)
df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first few rows of the loaded frame as a sanity check.
df.head()

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
# Inputs: the raw comment strings.
X = df['comment_text']
# Targets: every column from the third one onward, as a NumPy array.
label_columns = df.columns[2:]
Y = df[label_columns].values

In [None]:
# Upper bound on the number of distinct tokens kept in the vocabulary.
MAX_FEATURES = 200_000

In [None]:
# Text -> integer-token layer: vocabulary capped at MAX_FEATURES, every
# comment mapped to a sequence of exactly 1800 token ids.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [None]:
# Fit the vectorizer's vocabulary on the raw comment strings.
vectorizer.adapt(X.values)

In [None]:
# Peek at the start of the learned vocabulary only; the full list can hold up
# to MAX_FEATURES entries, which is far too much to render as cell output.
vectorizer.get_vocabulary()[:20]

In [None]:
# Run every comment through the adapted vectorizer to get integer token ids.
# NOTE(review): this vectorizes the whole corpus in a single call, so the full
# result is held in memory at once — confirm that is acceptable for this dataset.
vectorized_text = vectorizer(X.values)

In [None]:
# Display the vectorizer output for a quick visual check.
vectorized_text # tokenized comments

In [None]:
# Build the tf.data input pipeline over (token ids, labels) pairs.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, Y))
dataset = dataset.cache()
# Shuffle into ONE fixed order. With the default reshuffle_each_iteration=True
# the data is re-permuted every time the dataset is iterated, so the
# take()/skip() train/val/test splits taken from this dataset would each see a
# different permutation on every pass and validation/test examples would leak
# into training. Trade-off: the order is no longer reshuffled between epochs.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
# Prepare upcoming batches in the background while the current one is consumed.
dataset = dataset.prefetch(8)

In [None]:
# Pull a single batch out of the pipeline as NumPy arrays for inspection.
first_batch = next(dataset.as_numpy_iterator())
batch_X, batch_Y = first_batch

In [None]:
# Shape of one input batch (expected: batch size x sequence length).
batch_X.shape

In [None]:
# Shape of one label batch (expected: batch size x number of label columns).
batch_Y.shape

In [None]:
# Partition the batched dataset by batch count: the first 70% of batches for
# training, the next 20% for validation, and the final 10% for testing.
n_batches = len(dataset)
n_train = int(n_batches*.7)
n_val = int(n_batches*.2)
n_test = int(n_batches*.1)
test_offset = int(n_batches*.9)

train = dataset.take(n_train)
val = dataset.skip(n_train).take(n_val)
test = dataset.skip(test_offset).take(n_test)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [None]:
# Multi-label classifier: token ids -> embeddings -> BiLSTM -> dense stack ->
# six independent sigmoid outputs.
model = Sequential([
    # Embedding table with MAX_FEATURES+1 rows, 32 dimensions per token
    Embedding(MAX_FEATURES+1, 32),
    # Bidirectional LSTM over the embedded sequence
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature-extractor layers
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Output layer: one sigmoid unit per label, each in [0, 1]
    Dense(6, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy pairs with the per-label sigmoid outputs; Adam optimizer.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

In [None]:
# Train on the `train` split and evaluate on `val` after each epoch; the
# returned History object is kept for later inspection.
history = model.fit(train, epochs=10, validation_data=val) # trains for 10 epochs

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Show the per-epoch values recorded by model.fit.
history.history

In [None]:
# Tokenize one hand-written comment for a quick smoke test of the model.
sample_comment = 'You freaking suck! Im going to kill you!'
input_text = vectorizer(sample_comment)

In [None]:
# Add a leading batch axis, since the model is called on batches, then predict.
batched_input = np.expand_dims(input_text, axis=0)
res = model.predict(batched_input)
res

In [167]:
from tensorflow.keras.metrics import AUC, Precision, Recall, CategoricalAccuracy

In [169]:
# Streaming metrics, meant to be updated batch-by-batch during evaluation.
pre = Precision()
re = Recall()  # NOTE: shadows the stdlib `re` module name if that is ever imported
# BinaryAccuracy, not CategoricalAccuracy: the targets are independent 0/1
# labels, so accuracy must threshold each prediction individually.
# CategoricalAccuracy compares argmax positions, which is meaningless here.
acc = tf.keras.metrics.BinaryAccuracy()
auc = AUC()

In [171]:
# Start from clean metric state so re-running this cell does not accumulate
# counts on top of a previous evaluation pass.
for metric in (pre, re, acc, auc):
    metric.reset_state()

for batch in test.as_numpy_iterator():
    # Unpack the batch
    X_true, y_true = batch
    # Predict on exactly this batch. predict_on_batch avoids the per-call
    # overhead and progress-bar output that model.predict emits on every
    # iteration of a loop over small batches.
    yhat = model.predict_on_batch(X_true)

    # Flatten labels and predictions to 1-D so every (sample, label) pair is
    # scored as one independent binary decision.
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)
    auc.update_state(y_true, yhat)

In [173]:
# Read the accumulated metric values out as NumPy scalars and report them.
precision_value = pre.result().numpy()
recall_value = re.result().numpy()
auc_value = auc.result().numpy()
accuracy_value = acc.result().numpy()
print(f'Precision: {precision_value}, Recall:{recall_value}, AUC:{auc_value}, Accuracy:{accuracy_value}')

In [None]:
# Persist the trained model to a single file in the working directory.
model.save('toxicity.keras')

In [9]:
# Reload the model from 'toxicity.keras'; this rebinds `model`, replacing
# whatever model object was previously held under that name.
model = tf.keras.models.load_model('toxicity.keras')

In [222]:
import pickle

# Serialize the fitted vectorizer so the same tokenization can be reused later.
# NOTE(review): pickling a Keras layer may not be portable across library
# versions — confirm the consumer loads it with a matching environment.
with open('vectorizer.pkl', mode='wb') as pickle_file:
    pickle.dump(vectorizer, pickle_file)