In [None]:
# Install the third-party packages imported further down (flair, contractions, simplemma).
# NOTE(review): versions are not pinned, so a later run may resolve different releases.
!pip install flair
!pip install contractions
!pip install simplemma

In [1]:
# Mount Google Drive at /content/drive; the CSV paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import flair
from tqdm import tqdm
import re
import contractions
from simplemma import lemmatize

In [17]:
# Load the two raw thread dumps (2020-2021 and 2022-2023) with pandas' Python parsing engine.
path_2021 = '/content/drive/MyDrive/original/Reddit-Threads_2020-2021.csv'
path_2223 = '/content/drive/MyDrive/original/Reddit-Threads_2022-2023.csv'
reddit_2021, reddit_2223 = (
    pd.read_csv(csv_path, engine="python") for csv_path in (path_2021, path_2223)
)

In [None]:
# Drop rows whose text is exactly '[deleted]' or '[removed]'; every other row (including missing text) is kept.
placeholder_texts = ['[deleted]', '[removed]']
reddit_2021 = reddit_2021[~reddit_2021['text'].isin(placeholder_texts)]
reddit_2223 = reddit_2223[~reddit_2223['text'].isin(placeholder_texts)]

In [None]:
# Remove every character outside the allowed set (ASCII letters, digits, whitespace and
# the punctuation . , ! ? ' " ( ) - _ $ + =); this strips emoji and other special symbols.
# Despite its name, the pattern matches the characters to DELETE (negated character class).
# The hyphen is escaped so it is a literal '-': the earlier `\\-_` was parsed as the
# range '\' .. '_', which deleted hyphens and accidentally kept ']' and '^'.
valid_characters_pattern = r'[^a-zA-Z0-9\s.,!?\'"()\-_$+=]'

reddit_2021.loc[:, 'text'] = reddit_2021['text'].str.replace(valid_characters_pattern, '', regex=True)
reddit_2223.loc[:, 'text'] = reddit_2223['text'].str.replace(valid_characters_pattern, '', regex=True)


In [None]:
# Keep only rows whose text is present and is not blank once surrounding whitespace is stripped.
has_text_2021 = reddit_2021['text'].notna() & reddit_2021['text'].str.strip().ne('')
has_text_2223 = reddit_2223['text'].notna() & reddit_2223['text'].str.strip().ne('')
reddit_2021 = reddit_2021[has_text_2021]
reddit_2223 = reddit_2223[has_text_2223]

In [None]:
# Expand contractions in every text by passing each value through contractions.fix.
reddit_2021['text'] = reddit_2021['text'].map(contractions.fix)
reddit_2223['text'] = reddit_2223['text'].map(contractions.fix)

In [None]:
# Normalisation step: lower-case the text column of both frames.
for frame in (reddit_2021, reddit_2223):
    frame['text'] = frame['text'].str.lower()

In [None]:
# Helper that strips one-character words from a piece of text
def remove_single_letter_words(text):
    """Remove standalone single-character words and tidy the whitespace.

    A "word" here is a single `\\w` character (letter, digit or underscore)
    bounded by word boundaries on both sides. After removal, every run of
    whitespace is collapsed to one space and the ends are trimmed.

    Non-string input is returned unchanged.
    """
    # Anything that is not a string is handed back untouched
    if not isinstance(text, str):
        return text
    without_singles = re.sub(r'\b\w{1}\b', '', text)
    # Collapse the gaps left behind by the removed words
    return re.sub(r'\s+', ' ', without_singles).strip()

In [None]:
# Run the single-letter-word cleanup over every text value in both frames.
reddit_2021['text'] = reddit_2021['text'].map(remove_single_letter_words)
reddit_2223['text'] = reddit_2223['text'].map(remove_single_letter_words)

In [None]:
# The cleanup steps above can leave a text blank, so repeat the present-and-not-blank filter.
still_has_text_2021 = reddit_2021['text'].notna() & reddit_2021['text'].str.strip().ne('')
still_has_text_2223 = reddit_2223['text'].notna() & reddit_2223['text'].str.strip().ne('')
reddit_2021 = reddit_2021[still_has_text_2021]
reddit_2223 = reddit_2223[still_has_text_2223]

In [23]:
# Creating 2 new columns: year and month (both stored as int), derived from `timestamp`.
# Rows with a missing timestamp are dropped from BOTH frames first: a missing value
# parses to NaT, and `.dt.year.astype(int)` cannot convert the resulting NaN to int.
# `dropna` is reassigned rather than run with inplace=True so the cell does not
# mutate a frame that was produced by an earlier boolean filter.
reddit_2021 = reddit_2021.dropna(subset=['timestamp'])
reddit_2021['timestamp'] = pd.to_datetime(reddit_2021['timestamp'])
reddit_2021['year'] = reddit_2021['timestamp'].dt.year.astype(int)
reddit_2021['month'] = reddit_2021['timestamp'].dt.month.astype(int)

reddit_2223 = reddit_2223.dropna(subset=['timestamp'])
reddit_2223['timestamp'] = pd.to_datetime(reddit_2223['timestamp'])
reddit_2223['year'] = reddit_2223['timestamp'].dt.year.astype(int)
reddit_2223['month'] = reddit_2223['timestamp'].dt.month.astype(int)


In [19]:
# Translate subreddit ids into readable names; ids missing from a mapping are left as they are.
names_2021 = {"t5_2qh8c": "r/Singapore", "t5_xnx04": "r/SingaporeRaw"}
# The 2022-2023 mapping has one additional subreddit on top of the 2020-2021 one.
names_2223 = {**names_2021, "t5_70s6ew": "r/SingaporeHappenings"}

reddit_2021['subreddit_name'] = reddit_2021['subreddit_id'].replace(names_2021)
reddit_2223['subreddit_name'] = reddit_2223['subreddit_id'].replace(names_2223)


In [None]:
# Draw a reproducible 20% sample of each frame, stratified on subreddit id.
# The 80% remainder is bound to `train`; it is overwritten by the second split.
from sklearn.model_selection import train_test_split

split_options = {'test_size': 0.2, 'random_state': 42}

train, sample_2021 = train_test_split(
    reddit_2021, stratify=reddit_2021['subreddit_id'], **split_options
)
train, sample_2223 = train_test_split(
    reddit_2223, stratify=reddit_2223['subreddit_id'], **split_options
)

In [None]:
# Write both samples to CSV in the current working directory, without the index column.
for frame, filename in ((sample_2021, 'working_2021.csv'), (sample_2223, 'working_2223.csv')):
    frame.to_csv(filename, index=False)

**Checking spread of data**

In [24]:
# Split the full (pre-sampling) frames into one frame per calendar year.
reddit2020 = reddit_2021[reddit_2021['year'].eq(2020)]
reddit2021 = reddit_2021[reddit_2021['year'].eq(2021)]
reddit2022 = reddit_2223[reddit_2223['year'].eq(2022)]
reddit2023 = reddit_2223[reddit_2223['year'].eq(2023)]

In [25]:
# Row count per year in the full data, in order 2020, 2021, 2022, 2023.
tuple(map(len, (reddit2020, reddit2021, reddit2022, reddit2023)))

(1435876, 1513543, 1141853, 841456)

In [None]:
# Total number of rows across the four per-year frames of the full data.
totalfull = sum(map(len, (reddit2020, reddit2021, reddit2022, reddit2023)))

In [None]:
# Share of each year in the full data, as a percentage of `totalfull`.
tuple(len(frame) / totalfull * 100 for frame in (reddit2020, reddit2021, reddit2022, reddit2023))

(29.10916636798137, 30.68369064744701, 23.148509303574006, 17.058633680997616)

In [None]:
# Split the 20% samples into one frame per calendar year.
sample2020 = sample_2021[sample_2021['year'].eq(2020)]
sample2021 = sample_2021[sample_2021['year'].eq(2021)]
sample2022 = sample_2223[sample_2223['year'].eq(2022)]
sample2023 = sample_2223[sample_2223['year'].eq(2023)]

In [None]:
# Row count per year in the samples, in order 2020, 2021, 2022, 2023.
tuple(map(len, (sample2020, sample2021, sample2022, sample2023)))

(259737, 270937, 207619, 159056)

In [None]:
# Total number of rows across the four per-year sample frames.
total = sum(map(len, (sample2020, sample2021, sample2022, sample2023)))

In [None]:
# Share of each year in the samples, as a percentage of `total` (compare with the full-data shares).
tuple(len(frame) / total * 100 for frame in (sample2020, sample2021, sample2022, sample2023))

(28.944925552934254, 30.19304640669349, 23.136928887199964, 17.725099153172287)

In [27]:
# Split the full frames by subreddit name.
sg2021 = reddit_2021[reddit_2021['subreddit_name'].eq('r/Singapore')]
sgraw2021 = reddit_2021[reddit_2021['subreddit_name'].eq('r/SingaporeRaw')]

sg2223 = reddit_2223[reddit_2223['subreddit_name'].eq('r/Singapore')]
sgraw2223 = reddit_2223[reddit_2223['subreddit_name'].eq('r/SingaporeRaw')]
sghpn2223 = reddit_2223[reddit_2223['subreddit_name'].eq('r/SingaporeHappenings')]

In [28]:
# Row count per subreddit in the full data (2020-2021 frames first, then 2022-2023).
tuple(map(len, (sg2021, sgraw2021, sg2223, sgraw2223, sghpn2223)))

(2867222, 82197, 1577054, 358234, 48021)

In [31]:
# Split the sampled frames by subreddit name.
samplesg2021 = sample_2021[sample_2021['subreddit_name'].eq('r/Singapore')]
samplesgraw2021 = sample_2021[sample_2021['subreddit_name'].eq('r/SingaporeRaw')]

samplesg2223 = sample_2223[sample_2223['subreddit_name'].eq('r/Singapore')]
samplesgraw2223 = sample_2223[sample_2223['subreddit_name'].eq('r/SingaporeRaw')]
samplesghpn2223 = sample_2223[sample_2223['subreddit_name'].eq('r/SingaporeHappenings')]

In [32]:
# Row count per subreddit in the samples (2020-2021 frames first, then 2022-2023).
tuple(map(len, (samplesg2021, samplesgraw2021, samplesg2223, samplesgraw2223, samplesghpn2223)))

(516086, 14588, 290475, 66853, 9347)

**Reading in the sampled data**

In [None]:
# Reload the sampled frames from the CSV files written earlier in the notebook.
sample_2021, sample_2223 = (
    pd.read_csv(filename) for filename in ('working_2021.csv', 'working_2223.csv')
)

**SubWord Tokenization with BERT**

In [None]:
from transformers import BertTokenizer, BertModel

# Tokenizer and encoder are both loaded from the same pretrained checkpoint.
singbert_checkpoint = 'zanelim/singbert'
tokenizer = BertTokenizer.from_pretrained(singbert_checkpoint)
model = BertModel.from_pretrained(singbert_checkpoint)

In [None]:
# Subword tokenization of every 2020-2021 sample text with the BERT tokenizer (one token list per row).
subtokenization_2021 = [tokenizer.tokenize(text) for text in tqdm(sample_2021['text'])]

100%|██████████| 530674/530674 [06:30<00:00, 1359.55it/s]


In [None]:
# Subword tokenization of every 2022-2023 sample text with the BERT tokenizer (one token list per row).
subtokenization_2223 = [tokenizer.tokenize(text) for text in tqdm(sample_2223['text'])]

100%|██████████| 366675/366675 [04:12<00:00, 1450.76it/s]


**Lemmatization**

In [None]:
# Lemmatize each subword token (English) while keeping the per-row list structure.
sublemmatization_2021 = [
    [lemmatize(token, lang='en') for token in tokens]
    for tokens in tqdm(subtokenization_2021)
]

100%|██████████| 530674/530674 [00:11<00:00, 45060.23it/s]


In [None]:
# Lemmatize each subword token (English) while keeping the per-row list structure.
sublemmatization_2223 = [
    [lemmatize(token, lang='en') for token in tokens]
    for tokens in tqdm(subtokenization_2223)
]

100%|██████████| 366675/366675 [00:09<00:00, 37848.35it/s]


**Token input ids**

In [None]:
# Encode each 2020-2021 sample text with the tokenizer and keep only its 'input_ids'.
input_ids_2021 = [tokenizer(text)['input_ids'] for text in tqdm(sample_2021['text'])]

100%|██████████| 530674/530674 [07:20<00:00, 1205.00it/s]


In [None]:
# Encode each 2022-2023 sample text with the tokenizer and keep only its 'input_ids'.
input_ids_2223 = [tokenizer(text)['input_ids'] for text in tqdm(sample_2223['text'])]

100%|██████████| 366675/366675 [04:47<00:00, 1274.44it/s]


**Adding new columns**

In [None]:
# Attach the lemmatized subword tokens to their frames as the 'Tokenization' column.
for frame, token_lists in ((sample_2021, sublemmatization_2021), (sample_2223, sublemmatization_2223)):
    frame['Tokenization'] = token_lists

In [None]:
# Attach the token id lists to their frames as the 'Input IDs' column.
for frame, id_lists in ((sample_2021, input_ids_2021), (sample_2223, input_ids_2223)):
    frame['Input IDs'] = id_lists

In [None]:
# Save the enriched samples to CSV without the index column.
for frame, filename in ((sample_2021, 'sample_2021.csv'), (sample_2223, 'sample_2223.csv')):
    frame.to_csv(filename, index=False)

In [None]:
# Save the same frames again under the '...wcleantext.csv' file names, without the index column.
for frame, filename in ((sample_2021, 'sample_2021wcleantext.csv'), (sample_2223, 'sample_2223wcleantext.csv')):
    frame.to_csv(filename, index=False)

**Working with new data**

In [None]:
# Reload the enriched samples from CSV.
sample_2021, sample_2223 = (
    pd.read_csv(filename) for filename in ('sample_2021.csv', 'sample_2223.csv')
)

**Embedding - SingBERT**

In [None]:
import torch

In [None]:
# Function to get embeddings
def get_sentence_embedding(sentence):
    """Return the model's pooled output for `sentence` as a NumPy array.

    Uses the notebook-level `tokenizer` and `model`. The input is truncated
    to at most 512 tokens and the forward pass runs without gradient tracking.

    Parameters
    ----------
    sentence : text passed straight to `tokenizer`.

    Returns
    -------
    numpy.ndarray
        `pooler_output` with singleton dimensions squeezed out.
    """
    # Tokenize and convert to PyTorch tensors
    inputs = tokenizer(sentence, max_length=512, return_tensors="pt", padding=True, truncation=True)
    # Put the inputs on the same device as the model so the call also works
    # when the model has been moved to a GPU (a no-op when it is on the CPU)
    inputs = inputs.to(model.device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)

    # `.numpy()` only works on CPU tensors, so bring the result back first
    return outputs.pooler_output.squeeze().cpu().numpy()

In [None]:
# One embedding per 2020-2021 sample text, computed sentence by sentence.
singbert_2021 = [get_sentence_embedding(sentence) for sentence in tqdm(sample_2021['text'])]

100%|██████████| 530674/530674 [8:30:33<00:00, 17.32it/s]


In [None]:
# One embedding per 2022-2023 sample text, computed sentence by sentence.
singbert_2223 = [get_sentence_embedding(sentence) for sentence in tqdm(sample_2223['text'])]

100%|██████████| 366675/366675 [5:03:10<00:00, 20.16it/s]


**Stopwords removal for hateful/toxic text**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
# Download the NLTK resources used below: tokenizer data for `word_tokenize`,
# WordNet for `WordNetLemmatizer`, and the stopword lists.
# Newer NLTK releases load the tokenizer from 'punkt_tab' rather than 'punkt',
# so both are requested to keep `word_tokenize` working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

In [34]:
# Load the samples labelled by the roberta hate and roberta toxic models.
samplehatetoxic_2021, samplehatetoxic_2223 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Data Sample/samplehatetoxic_2021.csv',
        '/content/drive/MyDrive/Data Sample/samplehatetoxic_2223.csv',
    )
)

In [35]:
# Load the bertoxic results for the same two periods (`df` = 2020-2021, `df2` = 2022-2023).
df, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Data Sample/hatetoxic(lynn)_2021.csv',
        '/content/drive/MyDrive/Data Sample/hatetoxic(lynn)_2223.csv',
    )
)

In [49]:
# Copy the bertoxic columns onto the samplehatetoxic frames as second-opinion labels.
# The assignment aligns on the row index.
# NOTE(review): assumes both files list the same rows in the same order - confirm.
for target, scores in ((samplehatetoxic_2021, df), (samplehatetoxic_2223, df2)):
    target['hate_label2'] = scores['id_att']
    target['toxic_label2'] = scores['toxicity']

In [16]:
# Save the combined label frames to Google Drive without the index column.
combined_outputs = (
    (samplehatetoxic_2021, '/content/drive/My Drive/hateandtoxic/combined_2021.csv'),
    (samplehatetoxic_2223, '/content/drive/My Drive/hateandtoxic/combined_2223.csv'),
)
for frame, csv_path in combined_outputs:
    frame.to_csv(csv_path, index=False)

In [53]:
# Keep only the rows flagged by at least one model: hate_label is 'HATE',
# toxic_label is 'toxic', or either of the second labels equals 1.
flagged_2021 = (
    samplehatetoxic_2021['hate_label'].eq('HATE')
    | samplehatetoxic_2021['toxic_label'].eq('toxic')
    | samplehatetoxic_2021['hate_label2'].eq(1.)
    | samplehatetoxic_2021['toxic_label2'].eq(1.)
)
hatetoxic_2021 = samplehatetoxic_2021[flagged_2021]

flagged_2223 = (
    samplehatetoxic_2223['hate_label'].eq('HATE')
    | samplehatetoxic_2223['toxic_label'].eq('toxic')
    | samplehatetoxic_2223['hate_label2'].eq(1.)
    | samplehatetoxic_2223['toxic_label2'].eq(1.)
)
hatetoxic_2223 = samplehatetoxic_2223[flagged_2223]

In [39]:
# Define the file paths in Google Drive.
# They use the '/content/drive/MyDrive/...' spelling, matching the path the next
# cell reads these same files from (previously written via '/content/drive/My Drive/...').
hatetoxic_2021_path = '/content/drive/MyDrive/hateandtoxic/hatetoxic_2021.csv'
hatetoxic_2223_path = '/content/drive/MyDrive/hateandtoxic/hatetoxic_2223.csv'

# Save the DataFrames as CSV files (index column omitted)
hatetoxic_2021.to_csv(hatetoxic_2021_path, index=False)
hatetoxic_2223.to_csv(hatetoxic_2223_path, index=False)

In [54]:
# Reload the flagged rows from Google Drive.
hatetoxic_2021, hatetoxic_2223 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/hateandtoxic/hatetoxic_2021.csv',
        '/content/drive/MyDrive/hateandtoxic/hatetoxic_2223.csv',
    )
)

In [57]:
# Shared WordNet lemmatizer instance and a mutable set of NLTK's English stopwords;
# the set is extended with additional words in the following cell.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [58]:
# Extend the stopword set with words commonly used in Singapore plus generic filler words.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python, which previously registered the bogus entries
# 'wantsalready' and 'needsia' instead of 'wants', 'already' and 'need'.
# NOTE(review): the multi-word entries ('you know', 'i mean', ...) can only match if a
# single token contains spaces; `preprocess` compares word by word, so they look inert - confirm.
stop_words.update(['lah', 'lor', 'leh', 'liao', 'meh', 'mah', 'ah', 'hor', 'wah', 'lah','la', 'hor', 'sian', 'see', 'yeah', 'ya', 'yah', 'le','ba', 'bah', 'haha', 'bro','want','wants',
                    'already', 'also', 'one', 'can', 'cannot', 'got', 'like', 'really', 'lol', 'lmao', 'yes', 'no', 'eh' ,'ah', 'omg', 'go', 'get', 'must', 'man','one', 'know', 'need',
                    'sia', 'walao', 'siao', 'alamak', 'confirm', 'makan','aiyah', 'aiyo', 'aiyah','sure','even','probably','think', 'ok', 'okay', 'tbh','make', 'n',
                    'still', 'maybe','said','you know', 'i mean', 'like that', 'do not know', 'not sure', 'of course', 'how come','always','alway','say', 'damn','give', 'going', 'take', 'took',
                   'would','should','could','thing', 'right','oh','ah','as','sia'])

In [59]:
def preprocess(text):
    """Return `text` lower-cased, tokenized, stripped of stopwords and lemmatized.

    Steps: tokenize the lower-cased text with `word_tokenize`, keep only purely
    alphabetic tokens (this drops punctuation and numbers), remove tokens found
    in `stop_words`, lemmatize the rest, and join them with single spaces.

    Lemmas are checked against `stop_words` as well, so an inflected form whose
    lemma is a stopword (e.g. 'things' -> 'thing') is removed too.

    Non-string input (such as NaN from a CSV round trip) yields an empty string
    instead of raising on `.lower()`.
    """
    # Missing or non-text values carry no words
    if not isinstance(text, str):
        return ''
    # Tokenization
    words = word_tokenize(text.lower())
    # Remove punctuation and non-alphabetic tokens
    words = [word for word in words if word.isalpha()]
    # Stopword removal and lemmatization
    lemmas = (lemmatizer.lemmatize(word) for word in words if word not in stop_words)
    # Second stopword pass on the lemmatized forms
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

In [60]:
# Run `preprocess` over every flagged text (punctuation and stopword removal).
hatetoxic_preprocessed_2021 = list(map(preprocess, hatetoxic_2021['text']))
hatetoxic_preprocessed_2223 = list(map(preprocess, hatetoxic_2223['text']))

In [61]:
# Make sure every cleaned entry is a str.
hatetoxic_preprocessed_2021 = list(map(str, hatetoxic_preprocessed_2021))
hatetoxic_preprocessed_2223 = list(map(str, hatetoxic_preprocessed_2223))

In [62]:
# Store the cleaned texts next to the originals in a dedicated column.
cleaned_column = 'text without punctuation and stopword'
for frame, cleaned_texts in ((hatetoxic_2021, hatetoxic_preprocessed_2021), (hatetoxic_2223, hatetoxic_preprocessed_2223)):
    frame.loc[:, cleaned_column] = cleaned_texts

In [63]:
# Drop rows whose cleaned text is missing or blank after stripping whitespace.
cleaned_2021 = hatetoxic_2021['text without punctuation and stopword']
hatetoxic_2021 = hatetoxic_2021[cleaned_2021.str.strip().ne('') & cleaned_2021.notna()]

cleaned_2223 = hatetoxic_2223['text without punctuation and stopword']
hatetoxic_2223 = hatetoxic_2223[cleaned_2223.str.strip().ne('') & cleaned_2223.notna()]

In [74]:
# Write the 2020-2021 frame, now including the cleaned-text column, to Google Drive.
file_path = '/content/drive/MyDrive/Data Sample/hatetoxic_2021.csv'
hatetoxic_2021.to_csv(path_or_buf=file_path, index=False)

In [75]:
# Write the 2022-2023 frame, now including the cleaned-text column, to Google Drive.
file_path = '/content/drive/MyDrive/Data Sample/hatetoxic_2223.csv'
hatetoxic_2223.to_csv(path_or_buf=file_path, index=False)