# Comment Toxicity Model Training
This notebook trains a model to classify toxic comments using the Jigsaw dataset.

In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Comment Toxicity Model Training\n",
    "This notebook trains a model to classify toxic comments using the Jigsaw dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.layers import TextVectorization, Embedding, LSTM, Dense, Dropout\n",
    "from tensorflow.keras.models import Sequential\n",
    "\n",
    "# Tunable settings, gathered in one place\n",
    "SEED = 42\n",
    "MAX_FEATURES = 200000        # cap on the vocabulary size learned by the vectorizer\n",
    "SEQUENCE_LENGTH = 1800       # number of token ids produced for every comment\n",
    "BATCH_SIZE = 32\n",
    "EPOCHS = 5\n",
    "VALIDATION_SPLIT = 0.2       # fraction of the samples model.fit holds out for validation\n",
    "DECISION_THRESHOLD = 0.5     # sigmoid score above which a label is reported as True\n",
    "\n",
    "# Set random seeds for reproducibility\n",
    "tf.random.set_seed(SEED)\n",
    "np.random.seed(SEED)\n",
    "\n",
    "# Define base directory (the parent of the directory holding this notebook).\n",
    "# __file__ is not defined inside a Jupyter kernel, so referencing it directly\n",
    "# raises NameError. Use it only when it exists (e.g. the notebook was exported\n",
    "# to a .py script) and otherwise fall back to the working directory.\n",
    "# NOTE: the fallback assumes the kernel was started in the notebook's directory.\n",
    "try:\n",
    "    NOTEBOOK_DIR = os.path.dirname(os.path.abspath(__file__))\n",
    "except NameError:\n",
    "    NOTEBOOK_DIR = os.getcwd()\n",
    "BASE_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, '..'))\n",
    "\n",
    "# Load dataset\n",
    "train_csv_path = os.path.join(BASE_DIR, 'jigsaw-toxic-comment-classification-challenge', 'train.csv')\n",
    "df = pd.read_csv(train_csv_path)\n",
    "print(df.head())\n",
    "\n",
    "# Label columns: the model emits one independent sigmoid score per category\n",
    "categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n",
    "\n",
    "# Prepare data: raw comment strings as inputs, the label columns as targets\n",
    "X = df['comment_text'].values\n",
    "y = df[categories].values\n",
    "\n",
    "# Text vectorization: learn the vocabulary from the comments, then map each\n",
    "# comment to a fixed-length sequence of integer token ids\n",
    "vectorizer = TextVectorization(\n",
    "    max_tokens=MAX_FEATURES,\n",
    "    output_sequence_length=SEQUENCE_LENGTH,\n",
    "    output_mode='int',\n",
    ")\n",
    "vectorizer.adapt(X)\n",
    "\n",
    "# Build model: embedding -> LSTM -> dense head with one sigmoid unit per category\n",
    "model = Sequential([\n",
    "    Embedding(MAX_FEATURES + 1, 32),\n",
    "    LSTM(32, return_sequences=False),\n",
    "    Dense(128, activation='relu'),\n",
    "    Dropout(0.2),\n",
    "    Dense(len(categories), activation='sigmoid')\n",
    "])\n",
    "\n",
    "# Compile model\n",
    "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "\n",
    "# Vectorize input data\n",
    "# NOTE: this materializes SEQUENCE_LENGTH token ids for every comment at once,\n",
    "# so memory use grows with len(X) * SEQUENCE_LENGTH.\n",
    "X_vectorized = vectorizer(X)\n",
    "\n",
    "# Train model\n",
    "model.fit(\n",
    "    X_vectorized,\n",
    "    y,\n",
    "    batch_size=BATCH_SIZE,\n",
    "    epochs=EPOCHS,\n",
    "    validation_split=VALIDATION_SPLIT,\n",
    ")\n",
    "\n",
    "# Save model\n",
    "# NOTE: only the network is written; the adapted vectorizer is not part of this file.\n",
    "model.save(os.path.join(BASE_DIR, 'toxicity.h5'))\n",
    "\n",
    "# Test a sample comment\n",
    "sample_comment = 'You’re an idiot who doesn’t know anything.'\n",
    "sample_vectorized = vectorizer([sample_comment])\n",
    "prediction = model.predict(sample_vectorized)\n",
    "print({category: bool(score > DECISION_THRESHOLD) for category, score in zip(categories, prediction[0])})\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}