In [1]:
# Ask PyTorch to release cached GPU memory it is not currently using, so that
# the cells below start with as much free device memory as possible.
# NOTE(review): the model trained in this notebook is built with
# TensorFlow/Keras, so this only helps if PyTorch allocated GPU memory earlier
# in the same kernel — confirm the cell is still needed.
import torch
torch.cuda.empty_cache()


In [3]:
# Fetch the NLTK resources this notebook was set up with. Each call reports
# "already up-to-date" when the package is present, and the value of the last
# call is what the cell displays.
# NOTE(review): the visible preprocessing code uses spaCy rather than NLTK —
# confirm these downloads are still required.
import nltk
nltk.download('punkt')      # 'punkt' package
nltk.download('stopwords')  # 'stopwords' package
nltk.download('wordnet')    # 'wordnet' package
nltk.download('omw-1.4')  # WordNet dependencies


[nltk_data] Downloading package punkt to C:\Users\Angelika
[nltk_data]     Vergara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Angelika
[nltk_data]     Vergara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Angelika
[nltk_data]     Vergara\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Angelika
[nltk_data]     Vergara\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
import os
from pathlib import Path

import nltk

# Resolve the per-user NLTK data directory instead of hard-coding one
# machine's absolute path: %APPDATA%\nltk_data on Windows (the same Roaming
# directory the hard-coded path pointed at), ~/nltk_data everywhere else.
NLTK_DATA_DIR = str(Path(os.environ.get("APPDATA", Path.home())) / "nltk_data")

# Register the directory only once; re-running the cell must not keep
# appending duplicate entries to NLTK's search path.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Last expression of the cell: its boolean result is what the cell displays.
nltk.download('punkt', download_dir=NLTK_DATA_DIR)


[nltk_data] Downloading package punkt to C:/Users/Angelika
[nltk_data]     Vergara/AppData/Roaming/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Diagnostic cell: list every directory NLTK searches when resolving a
# resource, to verify that the download directory is among them.
import nltk
print(nltk.data.path)  # Check where NLTK is looking for data


['C:\\Users\\Angelika Vergara/nltk_data', 'c:\\Users\\Angelika Vergara\\AppData\\Local\\Programs\\Python\\Python312\\nltk_data', 'c:\\Users\\Angelika Vergara\\AppData\\Local\\Programs\\Python\\Python312\\share\\nltk_data', 'c:\\Users\\Angelika Vergara\\AppData\\Local\\Programs\\Python\\Python312\\lib\\nltk_data', 'C:\\Users\\Angelika Vergara\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


In [4]:
import os
from pathlib import Path

import nltk

# Resolve the per-user NLTK data directory instead of hard-coding one
# machine's absolute path: %APPDATA%\nltk_data on Windows (the same Roaming
# directory the hard-coded path pointed at), ~/nltk_data everywhere else.
NLTK_DATA_DIR = str(Path(os.environ.get("APPDATA", Path.home())) / "nltk_data")

# Idempotent registration: running the cell again must not append the same
# directory to NLTK's search path a second time.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)


In [5]:
%pip install spacy
!python -m spacy download en_core_web_sm


Note: you may need to restart the kernel to use updated packages.
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - ------------------------------------- 0.5/12.8 MB 364.6 kB/s eta 0:00:34
     -- ----------------------

In [5]:
from tensorflow.keras.layers import Bidirectional


In [None]:
import numpy as np
import pandas as pd
import re
import spacy
import pickle
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.utils import class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Load SpaCy model (en_core_web_md has 300d vectors)
# The dependency parser and NER components are disabled: the code below only
# reads token lemmas, stop-word flags and word vectors from this pipeline.
nlp = spacy.load("en_core_web_md", disable=["parser", "ner"])

def load_and_balance_datasets():
    """Load both labelled corpora, merge them, and balance the two classes.

    Reads ``./data/train.csv`` (six label columns collapsed into one binary
    ``toxic`` flag via a row-wise max) and ``./data/labeled_data.csv`` (rows
    whose ``class`` is not 2 are flagged toxic; ``tweet`` is renamed to
    ``comment_text``). The non-toxic rows of the merged frame are then
    down-sampled without replacement (seed 42) to the number of toxic rows.

    Returns:
        pandas.DataFrame: columns ``comment_text`` and ``toxic``, with the
        sampled non-toxic rows first, the toxic rows after them, and a fresh
        0..n-1 index.
    """
    output_columns = ['comment_text', 'toxic']
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # First corpus: a comment is toxic when any of its label columns is set.
    kaggle_raw = pd.read_csv('./data/train.csv')
    kaggle = kaggle_raw.assign(toxic=kaggle_raw[label_columns].max(axis=1))[output_columns]

    # Second corpus: every class except 2 is treated as toxic.
    davidson_raw = pd.read_csv('./data/labeled_data.csv')
    davidson = (
        davidson_raw
        .assign(toxic=(davidson_raw['class'] != 2).astype(int))
        .rename(columns={'tweet': 'comment_text'})
    )[output_columns]

    merged = pd.concat([kaggle, davidson], ignore_index=True)

    non_toxic_rows = merged[merged.toxic == 0]
    toxic_rows = merged[merged.toxic == 1]

    # Shrink the non-toxic pool to the size of the toxic one.
    non_toxic_sample = resample(
        non_toxic_rows,
        replace=False,
        n_samples=len(toxic_rows),
        random_state=42,
    )
    return pd.concat([non_toxic_sample, toxic_rows]).reset_index(drop=True)

def preprocess_texts(texts):
    """Lemmatise each text and drop stop words.

    The texts are streamed through the module-level ``nlp`` pipeline in
    batches of 500. For every resulting document the lemmas of the tokens not
    flagged as stop words are joined with single spaces.

    Args:
        texts: iterable of raw strings.

    Returns:
        list[str]: one cleaned string per input text, in input order.
    """
    return [
        " ".join(token.lemma_ for token in parsed if not token.is_stop)
        for parsed in nlp.pipe(texts, batch_size=500)
    ]

def prepare_model_input(df, max_len=128, embed_dim=300):
    """Turn labelled comments into padded id sequences and an embedding matrix.

    Steps:
      1. Clean ``df['comment_text']`` with ``preprocess_texts`` and store the
         result in a new ``processed_text`` column (``df`` is modified in
         place).
      2. Build a word -> integer id mapping over the cleaned texts. Id 0 is
         reserved for padding.
      3. Encode every text as a list of ids, padded/truncated at the end to
         ``max_len``.
      4. Fill an embedding matrix from the spaCy vocabulary's vector table;
         words without a vector keep an all-zero row.
      5. Split 90/10 into train/test, stratified on ``df['toxic']`` (seed 42).

    Args:
        df: DataFrame with ``comment_text`` and ``toxic`` columns.
        max_len: length every id sequence is padded/truncated to.
        embed_dim: width of the embedding matrix; must match the width of the
            vectors provided by ``nlp``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, tokenizer,
        embedding_matrix, vocab_size)`` where ``tokenizer`` is the word -> id
        dict and ``vocab_size`` is ``len(tokenizer) + 1``.
    """
    df['processed_text'] = preprocess_texts(df['comment_text'].tolist())
    token_lists = [text.split() for text in df['processed_text']]

    # Sort the vocabulary before numbering it: iteration order of a set of
    # strings changes from one interpreter run to the next, which would make
    # the ids (and therefore the saved tokenizer/model pair) unreproducible.
    vocabulary = sorted({word for tokens in token_lists for word in tokens})
    tokenizer = {word: index for index, word in enumerate(vocabulary, start=1)}
    vocab_size = len(tokenizer) + 1  # +1 for the padding id 0

    sequences = [[tokenizer.get(word, 0) for word in tokens] for tokens in token_lists]
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

    # Look vectors up directly in the vocabulary's vector table. Running the
    # full pipeline on every single word (nlp(word)) is far slower, and
    # `word in nlp.vocab` only says whether a lexeme entry exists, not whether
    # a vector is available for the word.
    embedding_matrix = np.zeros((vocab_size, embed_dim))
    for word, index in tokenizer.items():
        if nlp.vocab.has_vector(word):
            embedding_matrix[index] = nlp.vocab.get_vector(word)

    labels = df['toxic']
    X_train, X_test, y_train, y_test = train_test_split(
        padded_sequences, labels, test_size=0.1, stratify=labels, random_state=42
    )
    return X_train, X_test, y_train, y_test, tokenizer, embedding_matrix, vocab_size

def build_model(vocab_size, embed_dim=300, embedding_matrix=None, max_len=128):
    """Build and compile the stacked bidirectional-LSTM binary classifier.

    Architecture: embedding -> spatial dropout (0.1) -> BiLSTM(256, returning
    sequences) -> BiLSTM(128) -> Dense(128, relu) -> Dropout(0.3) ->
    Dense(1, sigmoid). Compiled with Adam (learning rate 0.001), binary
    cross-entropy loss and an accuracy metric.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: width of each embedding vector.
        embedding_matrix: optional pretrained weights of shape
            ``(vocab_size, embed_dim)``. When given, the embedding layer is
            initialised from it and frozen. When omitted, the layer is left
            trainable — a frozen, randomly initialised embedding could never
            learn anything useful.
        max_len: length of the input id sequences.

    Returns:
        The compiled Keras ``Sequential`` model.

    Raises:
        ValueError: if ``embedding_matrix`` does not have shape
            ``(vocab_size, embed_dim)``.
    """
    use_pretrained = embedding_matrix is not None
    if use_pretrained and tuple(np.shape(embedding_matrix)) != (vocab_size, embed_dim):
        raise ValueError(
            f"embedding_matrix has shape {tuple(np.shape(embedding_matrix))}, "
            f"expected {(vocab_size, embed_dim)}"
        )

    # Freeze the embedding only when pretrained vectors are supplied.
    embedding = Embedding(vocab_size, embed_dim, trainable=not use_pretrained)

    model = Sequential([
        # An explicit Input fixes the sequence length and builds the model up
        # front; it replaces the deprecated `input_length` Embedding argument.
        tf.keras.Input(shape=(max_len,)),
        embedding,
        SpatialDropout1D(0.1),
        Bidirectional(LSTM(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
        Bidirectional(LSTM(128, return_sequences=False, dropout=0.2, recurrent_dropout=0.2)),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    if use_pretrained:
        # Load the pretrained vectors once the layer's weights exist, instead
        # of relying on the legacy `weights=` constructor argument.
        embedding.set_weights([embedding_matrix])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def main():
    """Train the toxicity classifier end to end and report held-out metrics.

    Loads and balances the data, prepares the model inputs, builds the model
    and fits it for up to 10 epochs (batch size 32, 10% of the training data
    held out for validation) with early stopping and learning-rate reduction
    on ``val_loss``. The trained model is written to ``improved_model.h5`` and
    the word -> id mapping to ``tokenizer.pkl``; a classification report plus
    accuracy, precision, recall and F1 on the test split are printed.

    Returns:
        tuple: ``(history, y_test, y_pred)`` — the Keras training history, the
        true test labels and the predicted 0/1 test labels as a 1-D array.
        These are what the plotting helpers (``plot_training_results`` and
        ``plot_confusion``) take as input.
    """
    df = load_and_balance_datasets()
    X_train, X_test, y_train, y_test, tokenizer, embedding_matrix, vocab_size = prepare_model_input(df)
    model = build_model(vocab_size, embed_dim=300, embedding_matrix=embedding_matrix)

    # Stop after 5 epochs without val_loss improvement (restoring the best
    # weights); multiply the learning rate by 0.3 after 2 such epochs.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=2)

    # Errors on the toxic class (1) are weighted 1.2x in the loss.
    class_weights = {0: 1.0, 1: 1.2}
    history = model.fit(
        X_train, y_train,
        validation_split=0.1,
        epochs=10,
        batch_size=32,
        verbose=1,
        callbacks=[early_stopping, lr_scheduler],
        class_weight=class_weights
    )

    # Persist the model together with the vocabulary it was trained with.
    model.save("improved_model.h5")
    with open("tokenizer.pkl", "wb") as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    y_pred_probs = model.predict(X_test)
    # The sigmoid output is an (n, 1) column; flatten it so the metric and
    # plotting functions receive the same 1-D layout as y_test.
    y_pred = (y_pred_probs > 0.5).astype("int32").ravel()

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Non-Toxic', 'Toxic']))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

    return history, y_test, y_pred

if __name__ == "__main__":
    # Keep the results at notebook scope so later cells can plot them without
    # re-training.
    history, y_test, y_pred = main()


No CUDA-enabled GPU available. Training will use CPU.


KeyboardInterrupt: 

In [1]:
def plot_training_results(history):
    """
    Draw train-vs-validation curves for accuracy and loss side by side.

    `history` must expose a `history` mapping with the keys 'accuracy',
    'val_accuracy', 'loss' and 'val_loss', each holding one value per epoch.
    The left panel shows accuracy, the right panel shows loss.
    """
    recorded = history.history

    # One entry per panel:
    # (subplot position, train key, validation key,
    #  train label, validation label, panel title, y-axis label)
    panels = (
        (1, 'accuracy', 'val_accuracy',
         'Train Accuracy', 'Validation Accuracy',
         'Training vs Validation Accuracy', 'Accuracy'),
        (2, 'loss', 'val_loss',
         'Train Loss', 'Validation Loss',
         'Training vs Validation Loss', 'Loss'),
    )

    plt.figure(figsize=(14, 5))
    for position, train_key, val_key, train_label, val_label, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(recorded[train_key], label=train_label)
        plt.plot(recorded[val_key], label=val_label)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()


def plot_confusion(y_true, y_pred):
    """
    Render the confusion matrix of `y_pred` against `y_true` as an annotated
    heatmap, with actual classes on the y-axis and predicted classes on the
    x-axis, both labelled 'Non-Toxic' / 'Toxic'.
    """
    class_names = ('Non-Toxic', 'Toxic')
    counts = confusion_matrix(y_true, y_pred)

    sns.heatmap(
        counts,
        annot=True,
        fmt='d',  # cell annotations are formatted as integers
        cmap='Blues',
        xticklabels=list(class_names),
        yticklabels=list(class_names),
    )
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()


In [4]:
# Diagnostic cell: print the compute devices TensorFlow can see, to check
# whether a GPU is available for training.
# NOTE(review): `tensorflow.python.client.device_lib` sits under TensorFlow's
# internal `tensorflow.python` namespace; `tf.config.list_physical_devices()`
# appears to be the public way to get this information — confirm before
# relying on it.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 4160827510970739632
xla_global_id: -1
]
