In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers

# Load Data

In [2]:
df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')

In [3]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
X = df.comment_text
y = np.array(df.drop(['id','comment_text'], axis=1))

# Tokenization

In [5]:
MAX_FEATURES = 300000

In [6]:
vectorizer = layers.TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=2000,
                               output_mode='int')

2022-10-02 20:14:29.623708: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-02 20:14:29.720645: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-02 20:14:29.721380: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-02 20:14:29.722990: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [7]:
vectorizer.adapt(X.values)

2022-10-02 20:14:32.496755: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [8]:
len(vectorizer.get_vocabulary())

257825

In [10]:
vectorized_text = vectorizer(X.values)

2022-10-02 20:16:10.104201: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 2553136000 exceeds 10% of free system memory.


# Model

In [11]:
model = models.Sequential([
    layers.Embedding(MAX_FEATURES+1, 32),
    layers.Bidirectional(layers.LSTM(32, activation='tanh')),
    layers.Dense(128, activation='relu'),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(6, activation='sigmoid')   
])

In [12]:
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy']
             )

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          9600032   
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                16640     
_________________________________________________________________
dense (Dense)                (None, 128)               8320      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 774       
Total params: 9,691,686
Trainable params: 9,691,686
Non-trainable params: 0
______________________________________________

In [15]:
result = model.fit(vectorized_text, y, epochs=5, batch_size=2000, validation_split=0.2)

2022-10-02 20:16:33.806317: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 2042496000 exceeds 10% of free system memory.


Epoch 1/5


2022-10-02 20:16:38.808663: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Predict

In [30]:
X_test = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
y_test = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip')

In [31]:
X_test = X_test.comment_text
y_test = np.array(y_test.drop('id', axis=1))

In [34]:
vectorized_text_test = vectorizer(X_test.values)

2022-10-02 20:42:50.632434: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 2450624000 exceeds 10% of free system memory.


In [37]:
preds = model.predict(vectorized_text_test)

In [43]:
y_pred = []
for pred in preds:
    labels = []
    for label in pred:
        if label < 0.5:
            labels.append(0)
        else:
            labels.append(1)
    y_pred.append(labels)

# Input Prediction

In [83]:
input_text = vectorizer('You suck go to hell')

In [84]:
res = model.predict(np.expand_dims(input_text, 0))
prediction = (res > 0.5).astype(int)

In [85]:
pd.DataFrame(prediction, columns=df.columns[2:])

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,1,0,1,0,1,0
