# Text Mining for Toxic Comments: Topic Modeling on Identity Hate comments

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.17.4-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.4-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.4


In [None]:
# General
import gdown
import zipfile
import os
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import html

# NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab')
nltk.download('wordnet')

from gensim.corpora import Dictionary
from gensim.models import LdaModel, LdaMulticore
from gensim.models import CoherenceModel

from pprint import pprint
from sklearn.metrics.pairwise import cosine_similarity

# BERTopic
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


In [None]:
# Seed NumPy's global random generator so the LDA runs are repeatable
np.random.seed(seed=42)

In [None]:
# Google Drive location of the Jigsaw toxic-comment archive
url = 'https://drive.google.com/uc?id=1GV_kAxXAd2lQgwUUQwuRf4CL0MO1_hfR'
output = 'jigsaw-toxic-comment-classification-challenge.zip'

# One relative data directory is used for creation, extraction and the later
# read_csv calls. In Colab the working directory is /content, so this still
# resolves to /content/data, but the cell no longer breaks elsewhere.
data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)

gdown.download(url, output, quiet=False)

# Unpack the downloaded archive, then discard it; the context manager closes
# the handle even if extraction fails.
with zipfile.ZipFile(output) as zip_file:
    zip_file.extractall(data_dir)
os.remove(output)

# Unpack any .zip files now sitting in the data directory and delete them
for file in os.listdir(data_dir):
    if file.endswith('.zip'):
        inner_path = os.path.join(data_dir, file)
        with zipfile.ZipFile(inner_path) as zip_file:
            zip_file.extractall(data_dir)
        os.remove(inner_path)

Downloading...
From (original): https://drive.google.com/uc?id=1GV_kAxXAd2lQgwUUQwuRf4CL0MO1_hfR
From (redirected): https://drive.google.com/uc?id=1GV_kAxXAd2lQgwUUQwuRf4CL0MO1_hfR&confirm=t&uuid=8360612a-b24d-47b2-adeb-521ed2215978
To: /content/jigsaw-toxic-comment-classification-challenge.zip
100%|██████████| 55.2M/55.2M [00:00<00:00, 67.4MB/s]


# Preprocessing

In [None]:
# Training frame: the comment text together with its label columns
train = pd.read_csv('data/train.csv')
# Test labels ship in their own file; the id column is not needed here
y_test = pd.read_csv('data/test_labels.csv').drop('id', axis=1)
# Only the text column of the test set is kept
test = pd.read_csv('data/test.csv').loc[:, 'comment_text']

In [None]:
# Keep only the text of training comments labelled as identity hate.
# A boolean mask with .loc replaces the previous pattern of passing index
# labels to the positional .iloc, which was correct only for a default index.
train = train.loc[train["identity_hate"] == 1, "comment_text"]

# The test labels live in a separate frame, so the mask is applied by position.
# Assumes test.csv and test_labels.csv list the same rows in the same order
# (the original code relied on this too) — TODO confirm.
is_identity_hate = (y_test["identity_hate"] == 1).to_numpy()
test = test[is_identity_hate].reset_index(drop = True)

In [None]:
# Stack the train and test comments into one Series with a fresh 0..n-1 index
full = pd.concat((train, test), ignore_index=True)

In [None]:
# Report how many comments the combined corpus holds
print(f'Number of "Identity Hate" comments: {len(full)}')

Number of "Identity Hate" comments: 2117


# BERTopic

### Preprocessor

In [None]:
# Patterns are compiled once here instead of being rebuilt on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'http\S+|www\S+')
_IP_ADDRESS_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
_MENTION_RE = re.compile(r'@\w+')
# An ASCII letter followed by two or more copies of itself (3+ in a row)
_REPEATED_LETTER_RE = re.compile(r"([A-Za-z])\1{2,}")
_WHITESPACE_RE = re.compile(r'\s+')


def bertopic_preprocessor(text):
    """Lightly normalise one raw comment before embedding and topic modelling.

    Steps, in order: decode HTML entities, strip HTML tags, replace URLs with
    the token 'http', delete IPv4-looking sequences, replace @mentions with
    the token 'user', shorten runs of three or more identical letters to two,
    and collapse all whitespace (including newlines and tabs) to single spaces.

    Parameters
    ----------
    text : str
        The raw comment. Any non-string value (e.g. None or NaN from a
        missing cell) is treated as an empty comment.

    Returns
    -------
    str
        The cleaned comment; '' for non-string input.
    """
    # Missing values would otherwise crash html.unescape
    if not isinstance(text, str):
        return ''

    # Decode HTML entities ("&amp;" -> "&", "&quot;" -> '"')
    text = html.unescape(text)
    # Remove HTML tags
    text = _HTML_TAG_RE.sub('', text)

    # Mask URLs with 'http'
    text = _URL_RE.sub('http', text)

    # Remove IP addresses
    text = _IP_ADDRESS_RE.sub('', text)

    # Mask tag/mention with 'user'
    text = _MENTION_RE.sub('user', text)

    # Shorten character repetitions ("sooooo" -> "soo")
    text = _REPEATED_LETTER_RE.sub(r"\1\1", text)

    # Final cleanup: collapse double whitespace, newlines and tabs
    return _WHITESPACE_RE.sub(' ', text).strip()

In [None]:
# Clean every comment; this Series is what the embedding and topic models consume
full_bt = full.map(bertopic_preprocessor)

## Quantitative Evaluation
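We define a helper that scores a fitted BERTopic model on two metrics: topic coherence (C_v) and semantic diversity (one minus the mean cosine similarity between topic embeddings).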

In [None]:
# Whitespace tokenizer (its output feeds the Gensim Dictionary)
def simple_tokenize(sentences):
    """Split each sentence on whitespace and return the token lists in input order."""
    token_lists = []
    for sentence in sentences:
        token_lists.append(sentence.split())
    return token_lists

# Tokenise exactly the documents that BERTopic is fitted on
tokenized_docs = simple_tokenize(full_bt)

# Gensim vocabulary built from those token lists
id2word = Dictionary(documents=tokenized_docs)

# Evaluation Function
def evaluate_bertopic(topic_model, tokenized_docs, dictionary):
    """Print C_v coherence and semantic diversity for a fitted topic model.

    Parameters
    ----------
    topic_model
        Fitted model exposing ``get_topics()``, ``get_topic_info()`` and
        ``topic_embeddings_``.
    tokenized_docs
        Token lists handed to gensim's ``CoherenceModel`` as ``texts``.
    dictionary
        Gensim dictionary handed to ``CoherenceModel``.

    Returns
    -------
    None
        Results are printed only. If every topic is the outlier topic (-1),
        an error message is printed and the function returns early.

    Notes
    -----
    Semantic diversity is ``1 - mean pairwise cosine similarity`` of the
    non-outlier topic embeddings, with self-similarities excluded.
    """
    # Word lists of all real topics; topic -1 (outliers) is left out
    topic_words = [
        [term for term, _score in scored_terms]
        for topic_id, scored_terms in topic_model.get_topics().items()
        if topic_id != -1
    ]

    if len(topic_words) == 0:
        print("Error: No topics found (all data might be outliers).")
        return

    # Coherence (C_v)
    print("Calculating Coherence...")
    coherence_cv = CoherenceModel(
        topics=topic_words,
        texts=tokenized_docs,
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()

    # Semantic diversity (distance between topic embeddings)
    print("Calculating Semantic Diversity...")

    topic_table = topic_model.get_topic_info()
    # Row labels of the non-outlier topics. They are used as positions into
    # topic_embeddings_ below — assumes both are ordered identically; TODO confirm.
    kept_rows = topic_table.loc[topic_table['Topic'] != -1].index

    if len(kept_rows) <= 1:
        semantic_diversity = 0.0
        print("Warning: Not enough topics to calculate semantic diversity.")
    else:
        topic_vectors = np.array(topic_model.topic_embeddings_)[kept_rows]
        pairwise_sim = cosine_similarity(topic_vectors)

        # NaN on the diagonal keeps self-similarity out of the mean
        np.fill_diagonal(pairwise_sim, np.nan)
        semantic_diversity = 1 - np.nanmean(pairwise_sim)

    # Report
    print("\n--- BERTopic Evaluation ---")
    print(f"Coherence (C_v):       {coherence_cv:.4f}  (Higher is better)")
    print(f"Semantic Diversity:    {semantic_diversity:.4f}  (Higher is better)")

In [None]:
# Topic representations used for fine-tuning: KeyBERT-inspired and MMR views.
# The "Main" entry presumably sets the primary representation — confirm against the BERTopic docs.
representation_model = dict(
    KeyBERT=KeyBERTInspired(),
    MMR=MaximalMarginalRelevance(diversity=0.3),
    Main=KeyBERTInspired(),
)

## Clustering with HDBSCAN

In [None]:
# Sentence-embedding model, reused by every BERTopic configuration in this notebook
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# encode() is documented for a string or a list of strings; a pandas Series only
# works while its index is exactly 0..n-1, so pass a plain list instead.
embeddings = sentence_model.encode(full_bt.tolist(), show_progress_bar=True)

Batches:   0%|          | 0/67 [00:00<?, ?it/s]

In [None]:
# Stop words: scikit-learn's English list plus a few filler words
custom_stopwords = list(CountVectorizer(stop_words="english").get_stop_words())
custom_stopwords.extend(["don", "just", "like", "know", "people", "think", "did", "going"])

# Vectorizer for the topic keywords: unigrams and bigrams, original casing kept.
# (A first vectorizer with min_df=10 used to be built here and was overwritten
# before use; only the effective configuration remains.)
vectorizer_model = CountVectorizer(
    stop_words=custom_stopwords,
    min_df=1,
    ngram_range=(1, 2),
    lowercase=False
)

# UMAP for dimensionality reduction
umap_model = UMAP(n_neighbors=30, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

# HDBSCAN for clustering
hdbscan_model = HDBSCAN(min_cluster_size=100, min_samples=10, metric='euclidean', cluster_selection_method='eom')

# Model initialization
topic_model = BERTopic(
    embedding_model=sentence_model,
    nr_topics=None,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model
)

# Reuse the precomputed embeddings so the documents are not encoded again
topics, probs = topic_model.fit_transform(full_bt, embeddings=embeddings)

In [None]:
# update_topics falls back to a default CountVectorizer when none is passed,
# which would drop the custom stop words, bigrams and casing configured for
# this model. Hand the vectorizer over explicitly.
topic_model.update_topics(
    full_bt,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

In [None]:
# Score the current topic model against the whitespace-tokenised corpus
evaluate_bertopic(
    topic_model=topic_model,
    tokenized_docs=tokenized_docs,
    dictionary=id2word,
)

Calculating Coherence...
Calculating Semantic Diversity...

--- BERTopic Evaluation ---
Coherence (C_v):       0.3399  (Higher is better)
Semantic Diversity:    0.4352  (Higher is better)


In [None]:
# Per-topic summary table: topic id, document count, name and keyword representations
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,715,-1_faggotgay_faggot_ancestryfuck_stop,"[faggotgay, faggot, ancestryfuck, stop, lololo...","[faggotgay, faggot, ancestryfuck, stop, lololo...","[fat, jew, faggot, fuck, gay, mexicans, fags, ...","[GAY FAGS SPREAD AIDS AND FUCK LITTLE KIDS, ST..."
1,0,377,0_jews_jewish_jew_nazis,"[jews, jewish, jew, nazis, nazi, muslims, isra...","[jews, jewish, jew, nazis, nazi, muslims, isra...","[the, to, jews, be, your, we, on, who, will, i...",[FAT INDIAN MAN! RAPPING FOR YOU POO IN YUR MO...
2,1,360,1_nigger_niggerjew_niggers_nigga,"[nigger, niggerjew, niggers, nigga, niggas, li...","[nigger, niggerjew, niggers, nigga, niggas, li...","[nigger, nigga, cunt, niggerjew, niggas, tommy...",[NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIG...
3,2,281,2_gay_homosexual_gaythis_homo,"[gay, homosexual, gaythis, homo, queer, gays, ...","[gay, homosexual, gaythis, homo, queer, gays, ...","[gay, bunksteve, france, cline, dick, john, gi...","[you are gay gay gay gay gay gay, gay gay gay ..."
4,3,165,3_fag_fagget_faggot_faggots,"[fag, fagget, faggot, faggots, fagot, fuck, bi...","[fag, fagget, faggot, faggots, fagot, fuck, bi...","[fag, faggot, him, oscarthecat, kill, faggots,...",[Oscarthecat is a fucking faggot who will DIE....
5,4,113,4_gayest_homosexual_gay_homosexuality,"[gayest, homosexual, gay, homosexuality, sexua...","[gayest, homosexual, gay, homosexuality, sexua...","[gay, homosexual, his, sex, brian, likes, ryan...",[So Kashten is a homo and he likes men... Ther...
6,5,106,5_blocked_block_blocking_stop,"[blocked, block, blocking, stop, shithead, fag...","[blocked, block, blocking, stop, shithead, fag...","[to, your, block, wikipedia, it, here, informa...","[""HEY VACUOS, EMPTY- MINDED COLA! GET A LIFE! ..."


In [None]:
# Plot the documents by topic, reusing the precomputed sentence embeddings
topic_model.visualize_documents(
    full_bt,
    embeddings=embeddings,
)

In [None]:
# Bar charts of the top words per topic (up to 10 topics, 10 words each)
topic_model.visualize_barchart(
    top_n_topics=10,
    n_words=10,
    width=300,
    height=300,
)

## Clustering with KMeans

### 10 Clusters

In [None]:
# Stop words: scikit-learn's English list plus a few filler words
custom_stopwords = list(CountVectorizer(stop_words="english").get_stop_words())
custom_stopwords.extend(["don", "just", "like", "know", "people", "think", "did", "going"])

# Vectorizer for the topic keywords: unigrams and bigrams, original casing kept
vectorizer_model = CountVectorizer(
    stop_words=custom_stopwords,
    ngram_range=(1, 2),
    lowercase=False
)

# UMAP for dimensionality reduction
umap_model = UMAP(n_neighbors=50, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

# KMeans for clustering (passed through BERTopic's `hdbscan_model` argument)
hdbscan_model = KMeans(n_clusters=10, random_state=42)

# Model initialization
topic_model = BERTopic(
    embedding_model=sentence_model,
    nr_topics='auto',
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    # Fixed: a stray character after the name used to raise NameError here
    vectorizer_model=vectorizer_model
)

# Reuse the precomputed embeddings so the documents are not encoded again
topics, probs = topic_model.fit_transform(full_bt, embeddings=embeddings)

In [None]:
# update_topics falls back to a default CountVectorizer when none is passed,
# which would drop the custom stop words, bigrams and casing configured for
# this model. Hand the vectorizer over explicitly.
topic_model.update_topics(
    full_bt,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

In [None]:
# Score the current topic model against the whitespace-tokenised corpus
evaluate_bertopic(
    topic_model=topic_model,
    tokenized_docs=tokenized_docs,
    dictionary=id2word,
)

Calculating Coherence...
Calculating Semantic Diversity...

--- BERTopic Evaluation ---
Coherence (C_v):       0.3778  (Higher is better)
Semantic Diversity:    0.3913  (Higher is better)


In [None]:
# Plot the documents by topic, reusing the precomputed sentence embeddings
topic_model.visualize_documents(
    full_bt,
    embeddings=embeddings,
)

In [None]:
# Bar charts of the top words per topic (up to 10 topics, 10 words each)
topic_model.visualize_barchart(
    top_n_topics=10,
    n_words=10,
    width=300,
    height=300,
)

### 5 Clusters

In [None]:
# Stop words: scikit-learn's English list followed by a few filler words
custom_stopwords = [
    *CountVectorizer(stop_words="english").get_stop_words(),
    "don", "just", "like", "know", "people", "think", "did", "going",
]

# Keyword vectorizer: unigrams and bigrams, original casing kept
vectorizer_model = CountVectorizer(
    lowercase=False,
    ngram_range=(1, 2),
    stop_words=custom_stopwords,
)

# Dimensionality reduction ahead of clustering
umap_model = UMAP(
    n_components=5,
    n_neighbors=50,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# KMeans does the clustering here; it is passed through BERTopic's `hdbscan_model` argument
hdbscan_model = KMeans(random_state=42, n_clusters=5)

# Assemble the topic model from the pieces above
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    nr_topics='auto',
    embedding_model=sentence_model,
)

# Fit on the cleaned comments, reusing the precomputed embeddings
topics, probs = topic_model.fit_transform(full_bt, embeddings=embeddings)

In [None]:
# update_topics falls back to a default CountVectorizer when none is passed,
# which would drop the custom stop words, bigrams and casing configured for
# this model. Hand the vectorizer over explicitly.
topic_model.update_topics(
    full_bt,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

In [None]:
# Score the current topic model against the whitespace-tokenised corpus
evaluate_bertopic(
    topic_model=topic_model,
    tokenized_docs=tokenized_docs,
    dictionary=id2word,
)

Calculating Coherence...
Calculating Semantic Diversity...

--- BERTopic Evaluation ---
Coherence (C_v):       0.3670  (Higher is better)
Semantic Diversity:    0.3341  (Higher is better)


In [None]:
# Plot the documents by topic, reusing the precomputed sentence embeddings
topic_model.visualize_documents(
    full_bt,
    embeddings=embeddings,
)

In [None]:
# Bar charts of the top words per topic (up to 10 topics, 10 words each)
topic_model.visualize_barchart(
    top_n_topics=10,
    n_words=10,
    width=300,
    height=300,
)