In [None]:
!pip install flair
!pip install contractions
!pip install simplemma

Collecting simplemma
  Downloading simplemma-1.1.1-py3-none-any.whl.metadata (23 kB)
Downloading simplemma-1.1.1-py3-none-any.whl (67.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.2/67.2 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: simplemma
Successfully installed simplemma-1.1.1


In [None]:
import pandas as pd
import flair
from tqdm import tqdm
import re
import contractions
from simplemma import lemmatize

In [None]:
# Load the two raw Reddit thread exports (2020-2021 and 2022-2023).
# Both files are parsed with the pure-Python parser engine.
reddit_2021, reddit_2223 = (
    pd.read_csv(csv_path, engine="python")
    for csv_path in ("Reddit-Threads_2020-2021.csv", "Reddit-Threads_2022-2023.csv")
)

In [None]:
# Remove entries with text '[deleted]' or '[removed]'.
# Rows with missing text are kept here; they are filtered out in a later step.
# .copy() makes each result an independent frame, so the column assignments
# that follow do not raise pandas' SettingWithCopyWarning.
placeholder_texts = ['[deleted]', '[removed]']
reddit_2021 = reddit_2021[~reddit_2021['text'].isin(placeholder_texts)].copy()
reddit_2223 = reddit_2223[~reddit_2223['text'].isin(placeholder_texts)].copy()

In [None]:
# removing any special characters (eg. emoji)
# Every character NOT in this whitelist is deleted: ASCII letters, digits,
# whitespace and the punctuation . , ! ? ' " ( ) - _ $ + =
# The hyphen has to be escaped as `\-`. Written as `\\-_` it is parsed as the
# range backslash..underscore, which leaves '-' out of the whitelist and strips
# every hyphen from the text (e.g. "well-known" -> "wellknown").
valid_characters_pattern = r'[^a-zA-Z0-9\s.,!?\'"()\-_$+=]'

reddit_2021.loc[:, 'text'] = reddit_2021['text'].str.replace(valid_characters_pattern, '', regex=True)
reddit_2223.loc[:, 'text'] = reddit_2223['text'].str.replace(valid_characters_pattern, '', regex=True)


In [None]:
# removing empty text: keep only rows whose text is present and not blank.
# .copy() detaches each filtered frame so that the column assignments in the
# following cells do not raise pandas' SettingWithCopyWarning.
reddit_2021 = reddit_2021[reddit_2021['text'].notna() & (reddit_2021['text'].str.strip() != '')].copy()
reddit_2223 = reddit_2223[reddit_2223['text'].notna() & (reddit_2223['text'].str.strip() != '')].copy()

In [None]:
# Handle contractions: run contractions.fix over every text entry.
# The function is passed directly; no wrapper lambda is needed.
reddit_2021['text'] = reddit_2021['text'].apply(contractions.fix)
reddit_2223['text'] = reddit_2223['text'].apply(contractions.fix)

In [None]:
# Normalization: lower-case the text column of both frames.
for reddit_frame in (reddit_2021, reddit_2223):
    reddit_frame['text'] = reddit_frame['text'].str.lower()

In [None]:
# function to remove single-letter words
def remove_single_letter_words(text):
    """Strip one-character words from ``text`` and tidy up its whitespace.

    Any single word character (letter, digit or underscore) that stands alone
    between word boundaries is deleted. Runs of whitespace are then collapsed
    to one space and the ends are trimmed.

    Values that are not strings (e.g. ``None`` or NaN) are returned unchanged.
    """
    # Guard clause: leave non-string values untouched.
    if not isinstance(text, str):
        return text

    without_singles = re.sub(r'\b\w{1}\b', '', text)
    # Deleting words leaves gaps behind; squeeze them and trim the ends.
    return re.sub(r'\s+', ' ', without_singles).strip()

In [None]:
# Removing single letters from the text column of both frames.
for reddit_frame in (reddit_2021, reddit_2223):
    reddit_frame['text'] = reddit_frame['text'].apply(remove_single_letter_words)

In [None]:
# checking for empty text again: the single-letter removal can leave a text blank.
# .copy() detaches each filtered frame so that the column assignments in the
# following cells do not raise pandas' SettingWithCopyWarning.
reddit_2021 = reddit_2021[reddit_2021['text'].notna() & (reddit_2021['text'].str.strip() != '')].copy()
reddit_2223 = reddit_2223[reddit_2223['text'].notna() & (reddit_2223['text'].str.strip() != '')].copy()

In [None]:
# Creating 2 new columns: year and month
# Rows without a timestamp are dropped from BOTH frames first: a missing
# timestamp yields a missing year/month, which cannot be cast with .astype(int).
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
reddit_2021 = reddit_2021.dropna(subset=['timestamp']).copy()
reddit_2021['timestamp'] = pd.to_datetime(reddit_2021['timestamp'])
reddit_2021['year'] = reddit_2021['timestamp'].dt.year.astype(int)
reddit_2021['month'] = reddit_2021['timestamp'].dt.month.astype(int)

reddit_2223 = reddit_2223.dropna(subset=['timestamp']).copy()
reddit_2223['timestamp'] = pd.to_datetime(reddit_2223['timestamp'])
reddit_2223['year'] = reddit_2223['timestamp'].dt.year.astype(int)
reddit_2223['month'] = reddit_2223['timestamp'].dt.month.astype(int)


In [None]:
# Changing subreddit id to name.
# replace() swaps only the ids listed in the mapping; any other value in
# 'subreddit_id' is carried over to 'subreddit_name' unchanged.
subreddit_names_2021 = {
    "t5_2qh8c": "r/Singapore",
    "t5_xnx04": "r/SingaporeRaw",
}
reddit_2021['subreddit_name'] = reddit_2021['subreddit_id'].replace(subreddit_names_2021)

subreddit_names_2223 = {
    "t5_2qh8c": "r/Singapore",
    "t5_xnx04": "r/SingaporeRaw",
    "t5_70s6ew": "r/SingaporeHappenings",
}
reddit_2223['subreddit_name'] = reddit_2223['subreddit_id'].replace(subreddit_names_2223)


In [None]:
# Taking a random subset of data - 20% of original data.
from sklearn.model_selection import train_test_split

# Each period is split 80/20, stratified on 'subreddit_id', with a fixed seed.
# The 20% part is the working sample; `train` receives the remaining 80% and is
# overwritten by the second split.
train, sample_2021 = train_test_split(
    reddit_2021,
    stratify=reddit_2021['subreddit_id'],
    test_size=0.2,
    random_state=42,
)

train, sample_2223 = train_test_split(
    reddit_2223,
    stratify=reddit_2223['subreddit_id'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Persist both working samples to CSV, without the index column.
for sampled_frame, csv_path in (
    (sample_2021, 'working_2021.csv'),
    (sample_2223, 'working_2223.csv'),
):
    sampled_frame.to_csv(csv_path, index=False)

**Reading in the sampled data**

In [None]:
# Reload the two working samples that were written to disk.
sample_2021, sample_2223 = (
    pd.read_csv(csv_path) for csv_path in ('working_2021.csv', 'working_2223.csv')
)

**SubWord Tokenization with BERT**

In [None]:
from transformers import BertTokenizer, BertModel

# Tokenizer and model are both loaded from the same pretrained checkpoint.
singbert_checkpoint = 'zanelim/singbert'
tokenizer = BertTokenizer.from_pretrained(singbert_checkpoint)
model = BertModel.from_pretrained(singbert_checkpoint)

In [None]:
# Tokenization (subword with bert): one list of tokens per 2020-2021 text.
subtokenization_2021 = [
    tokenizer.tokenize(text) for text in tqdm(sample_2021['text'])
]

100%|██████████| 530674/530674 [06:30<00:00, 1359.55it/s]


In [None]:
# Tokenization (subword with bert): one list of tokens per 2022-2023 text.
subtokenization_2223 = [
    tokenizer.tokenize(text) for text in tqdm(sample_2223['text'])
]

100%|██████████| 366675/366675 [04:12<00:00, 1450.76it/s]


**Lemmatization**

In [None]:
# Lemmatization: every token of every 2020-2021 token list is lemmatized with
# simplemma, using lang='en'.
sublemmatization_2021 = [
    [lemmatize(token, lang='en') for token in tokens]
    for tokens in tqdm(subtokenization_2021)
]

100%|██████████| 530674/530674 [00:11<00:00, 45060.23it/s]


In [None]:
# Lemmatization: every token of every 2022-2023 token list is lemmatized with
# simplemma, using lang='en'.
sublemmatization_2223 = [
    [lemmatize(token, lang='en') for token in tokens]
    for tokens in tqdm(subtokenization_2223)
]

100%|██████████| 366675/366675 [00:09<00:00, 37848.35it/s]


**Token input ids**

In [None]:
# The tokenizer's 'input_ids' for each 2020-2021 text, in row order.
input_ids_2021 = [
    tokenizer(text)['input_ids'] for text in tqdm(sample_2021['text'])
]

100%|██████████| 530674/530674 [07:20<00:00, 1205.00it/s]


In [None]:
# The tokenizer's 'input_ids' for each 2022-2023 text, in row order.
input_ids_2223 = [
    tokenizer(text)['input_ids'] for text in tqdm(sample_2223['text'])
]

100%|██████████| 366675/366675 [04:47<00:00, 1274.44it/s]


**Removing stop words and punctuation (useful for topic modeling)**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
# Resources for word_tokenize, WordNetLemmatizer and the stop-word list.
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer data used by word_tokenize from
# the 'punkt_tab' package rather than 'punkt', so both are requested.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
lemmatizer = WordNetLemmatizer()
# English stop words, extended with titles, fillers and commonly used Singlish
# terms that do not hold sentiment value.
stop_words = set(stopwords.words('english'))
stop_words.update(['also', 'mr', 'mrs','miss','mdm','ya', 'yah', 'la', 'lah','lor','leh','liao','hor','mah','meh','sia','lol','lmao','like','yes','no'])

def preprocess(text):
    """Return ``text`` lower-cased, with punctuation and stop words removed.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    not purely alphabetic or that appear in ``stop_words`` are dropped, and the
    remaining tokens are lemmatized with ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN read from a CSV cell) is
        treated as having no content.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; ``''`` when nothing
        survives or when ``text`` is not a string.
    """
    # Texts reloaded from CSV can be NaN (float), which has no .lower().
    if not isinstance(text, str):
        return ''

    # Tokenization
    words = word_tokenize(text.lower())
    # Drop punctuation / non-alphabetic tokens and stop words, then lemmatize.
    # The stop-word check is made on the token before it is lemmatized.
    words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word.isalpha() and word not in stop_words
    ]

    return ' '.join(words)

In [None]:
# Removing stopwords and punctuation: run preprocess over every text, in row order.
texts_preprocessed_2021 = list(map(preprocess, sample_2021['text']))
text_preprocessed_2223 = list(map(preprocess, sample_2223['text']))

**Adding new columns**

In [None]:
# Attach the lemmatized subword tokens to each sample as column 'Tokenization'.
for sampled_frame, lemma_lists in (
    (sample_2021, sublemmatization_2021),
    (sample_2223, sublemmatization_2223),
):
    sampled_frame['Tokenization'] = lemma_lists

In [None]:
# Attach the tokenizer input ids to each sample as column 'Input IDs'.
for sampled_frame, id_lists in (
    (sample_2021, input_ids_2021),
    (sample_2223, input_ids_2223),
):
    sampled_frame['Input IDs'] = id_lists

In [None]:
# Save both samples, including the columns added so far, without the index.
for sampled_frame, csv_path in (
    (sample_2021, 'sample_2021.csv'),
    (sample_2223, 'sample_2223.csv'),
):
    sampled_frame.to_csv(csv_path, index=False)

In [None]:
# Attach the stop-word- and punctuation-free text to each sample.
for sampled_frame, cleaned_texts in (
    (sample_2021, texts_preprocessed_2021),
    (sample_2223, text_preprocessed_2223),
):
    sampled_frame['text without punctuation and stopword'] = cleaned_texts

In [None]:
# Save the samples that now also carry the cleaned-text column, without the index.
for sampled_frame, csv_path in (
    (sample_2021, 'sample_2021wcleantext.csv'),
    (sample_2223, 'sample_2223wcleantext.csv'),
):
    sampled_frame.to_csv(csv_path, index=False)

**Working with new data**

In [None]:
# Reload the saved samples from sample_2021.csv / sample_2223.csv.
sample_2021, sample_2223 = (
    pd.read_csv(csv_path) for csv_path in ('sample_2021.csv', 'sample_2223.csv')
)

**Embedding - SingBERT**

In [None]:
import torch

In [None]:
# Function to get embeddings
def get_sentence_embedding(sentence):
    """Return the model's pooled output for ``sentence`` as a NumPy array.

    The sentence is encoded with the module-level ``tokenizer`` (truncated to
    at most 512 tokens) and passed through the module-level ``model`` with
    gradient tracking disabled.

    The encoded inputs are moved to whichever device the model's parameters
    live on, and the result is copied back to the CPU before the NumPy
    conversion. The function therefore works whether ``model`` sits on the CPU
    or has been moved to a GPU.

    Parameters
    ----------
    sentence : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        ``pooler_output`` of the model with all size-1 dimensions squeezed out.
    """
    # Tokenize and convert to tensors
    inputs = tokenizer(sentence, max_length=512, return_tensors="pt", padding=True, truncation=True)

    # Keep the inputs on the same device as the model weights.
    device = next(model.parameters()).device
    inputs = inputs.to(device)

    # Forward pass through the model; no gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**inputs)

    # .cpu() is a no-op for CPU tensors and is required before .numpy() otherwise.
    return outputs.pooler_output.squeeze().cpu().numpy()

In [None]:
# One embedding per 2020-2021 text, in row order.
singbert_2021 = [
    get_sentence_embedding(sentence) for sentence in tqdm(sample_2021['text'])
]

100%|██████████| 530674/530674 [8:30:33<00:00, 17.32it/s]


In [None]:
# One embedding per 2022-2023 text, in row order.
singbert_2223 = [
    get_sentence_embedding(sentence) for sentence in tqdm(sample_2223['text'])
]

100%|██████████| 366675/366675 [5:03:10<00:00, 20.16it/s]


**Embedding - Singbert: for text without stopwords and punctuation (useful for topic modeling)**

In [None]:
# Reload the samples that include the cleaned-text column.
sample_2021wcleantext, sample_2223wcleantext = (
    pd.read_csv(csv_path)
    for csv_path in ('sample_2021wcleantext.csv', 'sample_2223wcleantext.csv')
)

In [None]:
# Checking and removing empty rows in the cleaned-text column after cleaning:
# a row is kept only when its cleaned text is present (not NaN) and is not
# blank once surrounding whitespace is stripped.
clean_text_col = 'text without punctuation and stopword'

sample_2021wcleantext = sample_2021wcleantext[
    sample_2021wcleantext[clean_text_col].notna()
    & sample_2021wcleantext[clean_text_col].str.strip().ne('')
]

sample_2223wcleantext = sample_2223wcleantext[
    sample_2223wcleantext[clean_text_col].notna()
    & sample_2223wcleantext[clean_text_col].str.strip().ne('')
]

In [None]:
# Save the filtered cleaned-text samples, without the index.
for cleaned_frame, csv_path in (
    (sample_2021wcleantext, 'sample(punc_stopwords_removed)_2021.csv'),
    (sample_2223wcleantext, 'sample(punc_stopwords_removed)_2223.csv'),
):
    cleaned_frame.to_csv(csv_path, index=False)

In [None]:
# Reload the filtered cleaned-text samples.
sample_cleaned_2021, sample_cleaned_2223 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'sample(punc_stopwords_removed)_2021.csv',
        'sample(punc_stopwords_removed)_2223.csv',
    )
)

In [None]:
# One embedding per cleaned 2020-2021 text (stop words and punctuation removed).
subsingbert_2021 = [
    get_sentence_embedding(sentence)
    for sentence in tqdm(sample_cleaned_2021['text without punctuation and stopword'])
]

100%|██████████| 524379/524379 [6:03:41<00:00, 24.03it/s]


In [None]:
# One embedding per cleaned 2022-2023 text (stop words and punctuation removed).
subsingbert_2223 = [
    get_sentence_embedding(sentence)
    for sentence in tqdm(sample_cleaned_2223['text without punctuation and stopword'])
]

100%|██████████| 362624/362624 [4:29:11<00:00, 22.45it/s]
