In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [5]:
df = pd.read_csv('train.csv')

In [6]:
texts = df['comment_text'].values
labels = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

In [7]:
vocab_size = 20000 
max_length = 150
embedding_dim = 16
test_size = 0.2
batch_size = 32
epochs = 10

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=test_size, random_state=42)

In [9]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(24, activation='relu'),
    Dense(6, activation='sigmoid')  # Output layer has 6 units for 6 labels
])



In [10]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=1)

Epoch 1/10
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 38ms/step - accuracy: 0.9273 - loss: 0.1593 - val_accuracy: 0.9941 - val_loss: 0.1427
Epoch 2/10
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 39ms/step - accuracy: 0.9914 - loss: 0.1384 - val_accuracy: 0.9941 - val_loss: 0.0655
Epoch 3/10
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 39ms/step - accuracy: 0.9941 - loss: 0.0584 - val_accuracy: 0.9941 - val_loss: 0.0525
Epoch 4/10
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 39ms/step - accuracy: 0.9943 - loss: 0.0481 - val_accuracy: 0.9941 - val_loss: 0.0512
Epoch 5/10
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 39ms/step - accuracy: 0.9941 - loss: 0.0439 - val_accuracy: 0.9941 - val_loss: 0.0512
Epoch 6/10
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 39ms/step - accuracy: 0.9947 - loss: 0.0404 - val_accuracy: 0.9941 - val_loss: 0.051

In [15]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.9810 - loss: 0.0602
Test Accuracy: 0.98


In [21]:
model.save("model.keras")
# tf.keras.saving.save_model(model, 'my_model.keras')
# model.save('model.keras')

In [24]:
tf.keras.models.load_model("models/model.h5")



<Sequential name=sequential, built=True>