In [5]:
pip install tensorflow


Collecting tensorflow
  Downloading tensorflow-2.18.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (4.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloa

In [1]:

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report



# Load the dataset
train_data = pd.read_csv("/Users/shacheesb/Downloads/jigsaw-toxic-comment-train.csv")

# Preprocessing
train_data = train_data[['comment_text', 'toxic']]
train_data.dropna(inplace=True)

# Text Tokenization
max_words = 20000  # Vocabulary size
max_sequence_length = 200  # Maximum comment length
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['comment_text'])

# Convert texts to sequences
X = tokenizer.texts_to_sequences(train_data['comment_text'])
X = pad_sequences(X, maxlen=max_sequence_length, padding='post')

# Labels
y = train_data['toxic'].values

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model definition
embedding_dim = 128
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length),
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=False),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
epochs = 5
batch_size = 64

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1
)

# Evaluate the model
y_pred = (model.predict(X_val) > 0.5).astype(int)
print(classification_report(y_val, y_pred))

# Save the tokenizer and model for later use
model.save("lstm_toxicity_model.h5")
with open("tokenizer.json", "w") as f:
    f.write(tokenizer.to_json())

# Example Prediction
def predict_toxicity(comment):
    sequence = tokenizer.texts_to_sequences([comment])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length, padding='post')
    prediction = model.predict(padded_sequence)[0][0]
    return "Toxic" if prediction > 0.5 else "Non-Toxic"

# Example usage
example_comment = "I hate this so much, you are terrible!"
print(f"Comment: {example_comment}\nPrediction: {predict_toxicity(example_comment)}")


Epoch 1/5




[1m2795/2795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 148ms/step - accuracy: 0.9045 - loss: 0.3242 - val_accuracy: 0.9028 - val_loss: 0.2222
Epoch 2/5
[1m2795/2795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m426s[0m 152ms/step - accuracy: 0.9436 - loss: 0.1509 - val_accuracy: 0.9563 - val_loss: 0.1134
Epoch 3/5
[1m2795/2795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m418s[0m 149ms/step - accuracy: 0.9592 - loss: 0.1032 - val_accuracy: 0.9567 - val_loss: 0.1109
Epoch 4/5
[1m2795/2795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m409s[0m 146ms/step - accuracy: 0.9636 - loss: 0.0910 - val_accuracy: 0.9563 - val_loss: 0.1169
Epoch 5/5
[1m2795/2795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m418s[0m 149ms/step - accuracy: 0.9687 - loss: 0.0778 - val_accuracy: 0.9536 - val_loss: 0.1282
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 33ms/step




              precision    recall  f1-score   support

           0       0.97      0.98      0.97     40444
           1       0.77      0.74      0.75      4266

    accuracy                           0.95     44710
   macro avg       0.87      0.86      0.86     44710
weighted avg       0.95      0.95      0.95     44710

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Comment: I hate this so much, you are terrible!
Prediction: Toxic


ModuleNotFoundError: No module named 'tensorflow'