Installations

In [None]:
!pip install pandas scikit-learn gensim




Packages

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora
from gensim.models import LdaModel

Loading CSV files and filtering columns

In [None]:
import pandas as pd

# Paths of the two CSV exports that make up the dataset.
file_path_2021 = '/content/hatetoxic_2021.csv'
file_path_2223 = '/content/hatetoxic_2223.csv'

# Only these columns are needed downstream; they are also the column order of
# the combined DataFrame.
columns_to_keep = [
    'text without punctuation and stopword', 'subreddit_id', 'moderation', 'year', 'month',
    'subreddit_name', 'hate_label', 'toxic_label'
]

# `usecols` makes the parser skip every other column instead of loading the
# whole file and discarding most of it afterwards. `read_csv` ignores the
# order of `usecols`, so the frame is re-indexed with `columns_to_keep` to
# guarantee both files end up with the same column order.
df_2021 = pd.read_csv(file_path_2021, usecols=columns_to_keep)[columns_to_keep]
df_2223 = pd.read_csv(file_path_2223, usecols=columns_to_keep)[columns_to_keep]

# Stack both files into one frame with a fresh 0..n-1 index.
df = pd.concat([df_2021, df_2223], ignore_index=True)

# Display the first few rows to verify
print(df.head())


               text without punctuation and stopword subreddit_id  \
0                                 manually suck dust     t5_2qh8c   
1  moral high ground easy realise shame entail le...     t5_2qh8c   
2    assume area botanical garden dinosaur shit gone     t5_2qh8c   
3  want nanny state everything admit mistake shit...     t5_2qh8c   
4  making fun short people exactly singaporean gl...     t5_2qh8c   

                                          moderation  year  month  \
0  {'collapsed_reason_code': None, 'collapsed_rea...  2021      7   
1  {'removal_reason': None, 'collapsed': False, '...  2020      3   
2  {'removal_reason': None, 'collapsed': False, '...  2020     10   
3  {'collapsed_reason': None, 'author_is_blocked'...  2021      5   
4  {'removal_reason': None, 'collapsed': False, '...  2020      1   

  subreddit_name hate_label toxic_label  
0    r/Singapore   NOT-HATE       toxic  
1    r/Singapore       HATE     neutral  
2    r/Singapore   NOT-HATE       toxic  
3 

Filtering by year

In [None]:
# Get the unique years present in the combined DataFrame
years = df['year'].unique()

# Preferred access path: one DataFrame per year, keyed by the year value.
# A dict can be iterated and inspected, unlike dynamically named variables.
df_by_year = {year: df[df['year'] == year] for year in years}

# Backward compatibility: also publish each frame as a module-level variable
# named `df_<year>` (e.g. `df_2020`), because the rest of the notebook refers
# to those names.
# NOTE: if the data contains the year 2021, this rebinds `df_2021`, which the
# loading cell used for the raw contents of the first CSV file.
for year, year_df in df_by_year.items():
    globals()[f'df_{year}'] = year_df

print(df_2020.head())

               text without punctuation and stopword subreddit_id  \
1  moral high ground easy realise shame entail le...     t5_2qh8c   
2    assume area botanical garden dinosaur shit gone     t5_2qh8c   
4  making fun short people exactly singaporean gl...     t5_2qh8c   
6  want n scraped study pretty think saf contagio...     t5_2qh8c   
7  hougang likely alr counted result close need r...     t5_2qh8c   

                                          moderation  year  month  \
1  {'removal_reason': None, 'collapsed': False, '...  2020      3   
2  {'removal_reason': None, 'collapsed': False, '...  2020     10   
4  {'removal_reason': None, 'collapsed': False, '...  2020      1   
6  {'removal_reason': None, 'collapsed': False, '...  2020      3   
7  {'removal_reason': None, 'collapsed': False, '...  2020      7   

  subreddit_name hate_label toxic_label  
1    r/Singapore       HATE     neutral  
2    r/Singapore   NOT-HATE       toxic  
4    r/Singapore   NOT-HATE       toxic  
6 

Filtering by year and month

In [None]:
# Create a DataFrame for each month of each year.
#
# `df_by_year_month[(year, month)]` is the preferred access path. Each frame is
# also published as a module-level variable `df_<year>_<month>` (e.g.
# `df_2021_5`) because the rest of the notebook refers to those names.
# Every (year, month) pair drawn from the two columns gets an entry, so a
# combination that never occurs in the data maps to an empty DataFrame.
month_values = df['month'].unique()  # loop-invariant, computed once
df_by_year_month = {}

for year in df['year'].unique():
    # Filter by year once, then slice that smaller frame per month instead of
    # re-scanning the full DataFrame for every (year, month) pair.
    year_rows = df[df['year'] == year]
    for month in month_values:
        month_rows = year_rows[year_rows['month'] == month]
        df_by_year_month[(year, month)] = month_rows
        globals()[f'df_{year}_{month}'] = month_rows

print(df_2021_5.head())


                text without punctuation and stopword subreddit_id  \
3   want nanny state everything admit mistake shit...     t5_2qh8c   
9   dickbags everywhere saf allows dickbags rise s...     t5_2qh8c   
14                wound side civet poisoned mutilated     t5_2qh8c   
15                          looking police state nerd     t5_2qh8c   
32  dun believe come help legitimise second statem...     t5_2qh8c   

                                           moderation  year  month  \
3   {'collapsed_reason': None, 'author_is_blocked'...  2021      5   
9   {'collapsed_reason': None, 'author_is_blocked'...  2021      5   
14  {'collapsed_reason': None, 'author_is_blocked'...  2021      5   
15  {'collapsed_reason': None, 'author_is_blocked'...  2021      5   
32  {'collapsed_reason': None, 'author_is_blocked'...  2021      5   

   subreddit_name hate_label toxic_label  
3     r/Singapore   NOT-HATE       toxic  
9     r/Singapore       HATE       toxic  
14    r/Singapore       HATE 

BERTopic

In [None]:
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import re

# Initialize embedding model and set HDBSCAN parameters
sentence_model = SentenceTransformer("all-mpnet-base-v2")
hdbscan_model = HDBSCAN(min_cluster_size=2, cluster_selection_epsilon=0.01)  # Fine-tuned for flexibility

# Initialize BERTopic with `nr_topics` set to "auto" for natural clustering
# NOTE(review): no random seed is fixed anywhere in this cell, so topics may
# differ between runs if any stage of the pipeline is stochastic — confirm.
topic_model = BERTopic(embedding_model=sentence_model, hdbscan_model=hdbscan_model, nr_topics="auto")

# Custom stopwords, removed on top of whatever cleaning produced the text
# column. Entries must be lower-case: words are lower-cased before the lookup.
stopwords = {"shit", "fuck", "fucking", "good", "time", "dumb", "singapore", "dumbass", "gong", "people"}

# Maps subreddit name -> list of keyword lists (one keyword list per topic)
subreddit_topics = {}

# List of subreddits to process
subreddits = df['subreddit_name'].unique()


def _strip_stopwords(doc, stopword_set):
    """Return `doc` without the whitespace-separated words found in `stopword_set`.

    Each word is lower-cased before the membership test. Surviving words are
    re-joined with single spaces; the result is an empty string when no word
    survives.
    """
    return " ".join(word for word in doc.split() if word.lower() not in stopword_set)


# Loop through each subreddit
for subreddit in subreddits:
    # Filter data for the current subreddit
    df_subreddit = df[df['subreddit_name'] == subreddit]

    # Missing text is read from CSV as NaN (a float), which has no `.split()`;
    # drop those rows and make sure every remaining document is a string.
    docs = (
        df_subreddit['text without punctuation and stopword']
        .dropna()
        .astype(str)
        .tolist()
    )

    # Remove custom stopwords and discard documents that end up empty: an
    # empty document contributes no vocabulary, so a cluster made of such
    # documents can only yield blank keywords.
    processed_docs = [
        cleaned
        for cleaned in (_strip_stopwords(doc, stopwords) for doc in docs)
        if cleaned
    ]

    print(f"\nTop Topics and Keywords for {subreddit}:")

    # Nothing to model (e.g. a missing subreddit name or only stopword text)
    if not processed_docs:
        print("No documents left after stopword removal")
        subreddit_topics[subreddit] = []
        continue

    # Fit the model on the processed documents for the current subreddit
    topics, probabilities = topic_model.fit_transform(processed_docs)

    # Retrieve topic information; topic -1 is excluded as the outlier bucket
    topic_info = topic_model.get_topic_info()
    top_topics = topic_info[topic_info["Topic"] != -1].head(10)

    # One keyword list per displayed topic, in display order
    keyword_list = []

    for i, topic in enumerate(top_topics['Topic'].tolist(), start=1):
        # `get_topic` may return a falsy value when the topic is unknown
        topic_words = topic_model.get_topic(topic) or []
        # Each entry is indexed as (word, ...); blank words are dropped so
        # they are neither printed nor stored as keywords.
        top_words = [entry[0] for entry in topic_words[:10] if entry[0]]
        keyword_list.append(top_words)
        if top_words:
            print(f"Topic {i}: {' | '.join(top_words)}")
        else:
            print(f"Topic {i}: No clear keywords identified")

    # Store the list of keywords for each topic in the dictionary for each subreddit
    subreddit_topics[subreddit] = keyword_list



Top Topics and Keywords for r/Singapore:
Topic 1: china | gt | need | stupid | year | day | work | back | country | way
Topic 2:  |  |  |  |  |  |  |  |  | 
Topic 3:  |  |  |  |  |  |  |  |  | 
Topic 4:  |  |  |  |  |  |  |  |  | 
Topic 5:  |  |  |  |  |  |  |  |  | 
Topic 6: well | dl | mark | hit |  |  |  |  |  | 
Topic 7: ahh | ohhh | ahhh | ahhhhh | stone | set | well |  |  | 
Topic 8:  |  |  |  |  |  |  |  |  | 
Topic 9: amp | page | canonical | summon | web | shared | privacy | uamputatorbot | bot | load
Topic 10: riamatotalpieceofshit | riamapieceofshit | worthy |  |  |  |  |  |  | 

Top Topics and Keywords for r/SingaporeRaw:
Topic 1: china | stupid | need | country | chinese | woman | sg | life | look | guy
Topic 2:  |  |  |  |  |  |  |  |  | 
Topic 3: hdb | ssd | dweller | probaby | peks | stair | hdd | staircase | hdbs | lately
Topic 4: difference | physiological | spino | notable | rchina | implying | shape | tail | three | indeed
Topic 5: backfired | clownery | discipline

OpenAI API

In [None]:
!pip install python-dotenv

In [None]:
import os
from dotenv import load_dotenv

# File holding KEY=value pairs (expected to define OPENAI_API_KEY)
env_path = '/content/API.env'

# `load_dotenv` reports via its return value whether any variable was set;
# surface that instead of failing later with an unrelated-looking error.
# Only a warning is printed, because the key may already be present in the
# environment from another source.
if not load_dotenv(env_path):
    print(f"Warning: no environment variables were loaded from {env_path}")


In [None]:
import openai

# The API key is read from the environment; it is never written in the notebook.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Fail here with a clear message rather than at the first API request.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; load it into the environment before running this cell."
    )

# Maps subreddit name -> list of theme labels (one per topic that had keywords)
representative_labels = {}

# Subreddits to label; names without an entry in `subreddit_topics` yield an empty list
subreddits = ["r/Singapore", "r/SingaporeRaw", "r/SingaporeHappenings"]


def _build_theme_prompt(keywords_str):
    """Return the user prompt asking for one short label for the given keywords.

    `keywords_str` is the comma-separated keyword list of a single topic.
    """
    return (
        f"Given these keywords related to a topic in a Singaporean context: {keywords_str}, "
        "suggest a single concise word or phrase that best captures the main idea of this topic. The keywords are extracted from a dataset of toxic and harmful "
        "comments, so the topics generated should likely be about topics that may cause resentment and irritation."
    )


def _request_theme(keywords_str):
    """Ask the chat model for a theme label and return it as cleaned text.

    Returns the reply with surrounding whitespace and double quotes removed,
    or an empty string when the reply carries no text content.
    """
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": _build_theme_prompt(keywords_str)}
        ],
        max_tokens=10,  # caps the reply length, so long phrases may be cut short
        temperature=0.7
    )
    # `content` can be None; treat that as an empty reply instead of crashing
    content = response.choices[0].message.content or ""
    return content.strip().strip('"')


# Iterate over each subreddit
for subreddit in subreddits:
    # Themes collected for the current subreddit
    subreddit_themes = []

    # List of keyword lists, one per topic of the current subreddit
    keyword_groups = subreddit_topics.get(subreddit, [])

    for keywords in keyword_groups:
        # Join the non-empty keywords for the prompt
        keywords_str = ', '.join(kw for kw in keywords if kw)

        # Topics without any usable keyword are skipped, so the themes list
        # can be shorter than the topic list and is not index-aligned with it.
        if not keywords_str.strip():
            continue

        theme = _request_theme(keywords_str)
        if theme:
            subreddit_themes.append(theme)

    # Store the themes for the current subreddit in `representative_labels`
    representative_labels[subreddit] = subreddit_themes

# Simplified and readable output
for subreddit, themes in representative_labels.items():
    print(f"{subreddit}:")
    for theme in themes:
        print(f" - {theme}")
    print()  # Blank line between subreddits for readability


r/Singapore:
 - Foreign worker discrimination
 - Cyberbullying
 - Public behavior or etiquette
 - Web Privacy
 - Toxic Online Behavior

r/SingaporeRaw:
 - Xenophobic attitudes towards Chinese people in Singapore
 - HDB staircase disputes
 - Racial differences in Singapore
 - Inappropriate Messaging
 - Lost Directions
 - Rebellious
 - Facial Paralysis
 - Border Control
 - Online scams

r/SingaporeHappenings:
 - Reckless driving
 - religious insensitivity
 - Poor quality entertainment
 - Public hygiene and sanitation
 - Smartphone rivalry in Singapore.
 - Government incompetence
 - Inauthenticity
 - Singapore Marathon Chaos
 - HDB Scam
 - Housing Market Woes

