### Import libraries

In [1]:
import re

import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split

import numpy as np

np.random.seed(42) # NEVER change this line

tf.keras.mixed_precision.set_global_policy('mixed_float16')

2025-11-12 21:27:26.466266: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-12 21:27:26.529904: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-12 21:27:28.111525: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [None]:
print("TensorFlow built with CUDA:", tf.test.is_built_with_cuda())
print("GPUs detected:", tf.config.list_physical_devices('GPU'))

TensorFlow built with CUDA: True
GPUs detected: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### Load Data

In [None]:
# import data
train_df = pd.read_csv('train.csv', engine='python')

# create function to clearn the input text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Get the 20000 most common words to tokenize
MAX_WORDS = 20000
# For each comment, we want the length to be 200. If it is more, it will be cut short, if it is less, it will be padded
MAX_LEN = 200

# create Keras tokenizer and set num_words parameter
tokenizer = Tokenizer(num_words=MAX_WORDS)
# toeknize 20000 most common words (remove symbols and excess spaces)
tokenizer.fit_on_texts(train_df['comment_text'].apply(clean_text))

# convert each comment to a sequence of 200 numeric word ID's
X_data = pad_sequences(tokenizer.texts_to_sequences(train_df['comment_text']), maxlen=MAX_LEN)
# select the label columns for our y_train ds and convert to numpy matrix where each row corresponds to a single comments labels
y_data = train_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values



In [118]:
# using iterative stratification, which balances the label combinations across splits.
X_train, y_train, X_test, y_test = iterative_train_test_split(X_data, y_data, test_size=0.2)

print(len(X_train))
print(len(X_test))

178839
44710


In [119]:
# using iterative stratification, which balances the label combinations across splits.
X_train, y_train, X_val, y_val = iterative_train_test_split(X_train, y_train, test_size=0.25)

print(len(X_train))
print(len(X_val))

134129
44710


In [120]:
y_train = y_train.astype('float32')
y_val = y_val.astype('float32')
y_test = y_test.astype('float32')

In [121]:
labels = train_df.drop(columns=['id','comment_text']).columns.to_list()
print(f'labels: {labels}')

class_totals = np.sum(y_train, axis=0)
print(f'class totals: {class_totals}')

total_samples = y_train.shape[0]
print(f'total samples: {total_samples}')

class_weights = {i: total_samples / (len(labels) * class_totals[i]) for i in range(len(labels))}
print("Class weights:", class_weights)

labels: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
class totals: [12830.  1177.  7284.   413.  6782.  1259.]
total samples: 134129
Class weights: {0: 1.7423876331514678, 1: 18.993061455678276, 2: 3.069032582829947, 3: 54.127925746569815, 4: 3.2962007274157084, 5: 17.75602329891448}


In [122]:
sample_weights = np.ones_like(y_train, dtype='float32')
for i in range(len(labels)):
    sample_weights[:, i] = y_train[:, i] * class_weights[i]

# print(sample_weights)

# If sample_weights has shape (num_samples, 6)
sample_weights_flat = np.mean(sample_weights, axis=1).astype('float32')


In [129]:
def build_model():
    model = Sequential([
        Embedding(input_dim=MAX_WORDS + 1, output_dim=128, input_length=MAX_LEN),
        LSTM(128, recurrent_activation='sigmoid', use_bias=True),
        Dropout(0.2),
        Dense(len(labels), activation='sigmoid', dtype='float32')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']  # list or dict matching output names
    )
    
    return model

In [124]:
def build_callbacks():
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6)
    ]

    return callbacks

In [125]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train, sample_weights_flat)) \
    .shuffle(10000).batch(128).prefetch(tf.data.AUTOTUNE)
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)) \
    .batch(128).prefetch(tf.data.AUTOTUNE)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)) \
    .batch(128).prefetch(tf.data.AUTOTUNE)

In [126]:
print("y_train shape:", y_train.shape)
print("sample_weights shape:", sample_weights.shape)


y_train shape: (134129, 6)
sample_weights shape: (134129, 6)


In [128]:
for x, y, w in train_ds.take(1):
    print("Batch shapes:", x.shape, y.shape, w.shape)


Batch shapes: (128, 200) (128, 6) (128,)


In [130]:
model = build_model()
callbacks = build_callbacks()

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=callbacks
)

Epoch 1/10




[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 20ms/step - accuracy: 0.9828 - loss: 0.0736 - val_accuracy: 0.9782 - val_loss: 0.8054 - learning_rate: 0.0010
Epoch 2/10
[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 19ms/step - accuracy: 0.9753 - loss: 0.0550 - val_accuracy: 0.9890 - val_loss: 0.6592 - learning_rate: 0.0010
Epoch 3/10
[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 19ms/step - accuracy: 0.9776 - loss: 0.0462 - val_accuracy: 0.9743 - val_loss: 0.5324 - learning_rate: 0.0010
Epoch 4/10
[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 19ms/step - accuracy: 0.9584 - loss: 0.0399 - val_accuracy: 0.9839 - val_loss: 0.6455 - learning_rate: 0.0010
Epoch 5/10
[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 19ms/step - accuracy: 0.9499 - loss: 0.0340 - val_accuracy: 0.9709 - val_loss: 0.6343 - learning_rate: 0.0010
Epoch 6/10
[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [131]:
model.save("model.keras")

In [132]:
from tensorflow import keras
imported_model = keras.models.load_model("model.keras")

test_accuracy, test_loss = imported_model.evaluate(test_ds)

[1m350/350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9720 - loss: 0.5443


In [151]:
sentence = "You are so annoying. I hope I never see a comment from you again"
# Clean
cleaned = clean_text(sentence)

# Convert to sequence
seq = tokenizer.texts_to_sequences([cleaned])  # note the list

# Pad sequence
padded_seq = pad_sequences(seq, maxlen=MAX_LEN)

# Predict
prediction = imported_model.predict(padded_seq)
print(prediction)

threshold = 0.5
predicted_labels = [labels[i] for i, p in enumerate(prediction[0]) if p > threshold]
print(predicted_labels)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[[0.8790848  0.03531715 0.2806723  0.3012795  0.2905284  0.03107408]]
['toxic']


In [134]:
from sklearn.metrics import classification_report

# Predict on validation/test set
y_pred = (imported_model.predict(X_test) > 0.5).astype(int)

print(classification_report(y_test, y_pred, target_names=labels))

[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step
               precision    recall  f1-score   support

        toxic       0.10      1.00      0.17      4277
 severe_toxic       0.38      0.38      0.38       392
      obscene       0.27      0.87      0.41      2428
       threat       0.12      0.66      0.21       138
       insult       0.20      0.86      0.32      2261
identity_hate       0.09      0.66      0.16       438

    micro avg       0.13      0.89      0.23      9934
    macro avg       0.19      0.74      0.28      9934
 weighted avg       0.17      0.89      0.27      9934
  samples avg       0.08      0.09      0.08      9934



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
