In [None]:
# VADER

In [16]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd

# Make sure the VADER lexicon is available locally before the analyzer is built.
nltk.download('vader_lexicon')

# Single shared analyzer instance, reused by the cells further down.
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [17]:
# Quick sanity check: score one hand-written sentence and show the result as the cell output.
sample_text = "Hais I really wish I was in NUS. So tired of literally no university life."
analyzer.polarity_scores(sample_text)

{'neg': 0.293, 'neu': 0.544, 'pos': 0.163, 'compound': -0.34}

### Prepare the Data

In [13]:
# View raw html
# view-source:file:///Users/gin/Downloads/Telegram%20Lite/ChatExport_2025-04-27/messages.html

In [19]:
# prepare the data, convert to 'confess_data.csv'

from bs4 import BeautifulSoup
import pandas as pd
import glob
import re

data = []
x = 1  # running counter used to mint ids for polls, which carry no confession id

# Loop through all HTML files in the export folder
for filename in glob.glob('smu_confess_data/*.html'):
    with open(filename, 'r', encoding='utf-8') as file:
        # 'html' leaves the choice of HTML parser to BeautifulSoup (whichever is installed)
        soup = BeautifulSoup(file, 'html')

    # every <div class="body"> in the file; the first one is skipped
    # (per the original note it is the "text bold" item, not a message)
    message_bodies = soup.find_all('div', class_='body')[1:]

    # go through each message
    for text_chunk in message_bodies:

        # GET DATETIME
        # Reset per message so a chunk without a date never inherits the
        # previous message's date/time values.
        raw_timestamp = None
        msg_date = None
        msg_time = None
        day_of_week = None

        date_div = text_chunk.find('div', class_='pull_right date details')
        if date_div is not None:
            raw_timestamp = date_div.get('title')
            if raw_timestamp:
                # NOTE(review): the recorded run emitted a warning on this call and on
                # the column-level to_datetime below; consider passing an explicit
                # `format=` once the export's timestamp format is confirmed.
                parsed = pd.to_datetime(raw_timestamp)
                msg_date = parsed.date()
                msg_time = parsed.time()
                day_of_week = parsed.day_name()

        # GET CATEGORY, ID
        # a message can contain several "return ShowHashtag" links - collect them all
        hashtag_links = text_chunk.find_all('a', onclick=re.compile(r'ShowHashtag'))
        n_tags = len(hashtag_links)

        # 1. normal confession            -> 2 hashtags: [category, id]
        # 2. reply to another confession  -> 3 hashtags: [category, referenced id, id]
        if n_tags in (2, 3):
            category = hashtag_links[0].text
            post_id = hashtag_links[-1].text
            ref_id = hashtag_links[1].text if n_tags == 3 else None

            # GET CONFESSION
            # take all the text of the chunk, then strip out the hashtags and the
            # "SMU Confess" label that sometimes appears (same order as before:
            # category, id, referenced id, label)
            text = text_chunk.get_text(separator=' ', strip=True)
            strip_tokens = [category, post_id]
            if ref_id is not None:
                strip_tokens.append(ref_id)
            strip_tokens.append("SMU Confess")
            for token in strip_tokens:
                text = text.replace(token, "")

            # keep what precedes the first "|" and drop its first 9 characters
            # (presumably the leading message time and separators - TODO confirm)
            confession = text.split("|")[0][9:].strip()

            data.append([
                post_id[1:],                                   # id without the leading '#'
                ref_id[1:] if ref_id is not None else 'null',  # referenced id without '#'
                category[1:],                                  # category without '#'
                confession,
                'null',                                        # total_votes only applies to polls
                raw_timestamp,
                msg_date,
                day_of_week,
                msg_time,
                'confession',
            ])
            continue

        # 3. others - no hashtags, or more than 3 (people copy-pasting whole confessions).
        #    Only polls are kept from this group; everything else is skipped.
        poll_div = text_chunk.find('div', class_='media_poll')
        if poll_div is None:
            continue

        # polls have no official id, so mint one: 'P' followed by an 8-digit counter
        poll_id = f"P{x:08d}"
        x += 1

        # get the poll text
        confession = poll_div.get_text(separator=' ', strip=True)

        # get the total votes, and remove that label from the poll text
        total_votes = 'null'
        total_div = poll_div.select_one('div.total.details')
        if total_div is not None:
            votes_text = total_div.get_text(strip=True)
            confession = confession.replace(votes_text, "")

            # expected label is "<n> votes"; the singular "vote" is accepted too
            # in case the export uses it
            votes_match = re.fullmatch(r'(\d+)\s*votes?', votes_text)
            if votes_match is None:
                raise ValueError(
                    f"Unrecognised poll vote count {votes_text!r} in {filename}"
                )
            total_votes = int(votes_match.group(1))

        data.append([
            poll_id,
            'null',   # polls reference no other confession
            'null',   # polls have no category hashtag
            confession,
            total_votes,
            raw_timestamp,
            msg_date,
            day_of_week,
            msg_time,
            'poll',
        ])


# collate into csv, sort by datetime
df = pd.DataFrame(data, columns=['id', 'ref_id', 'category', 'confession', 'total_votes', 'datetime', 'date', 'day_of_week', 'time', 'type'])
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.sort_values(by='datetime', ascending=True)
df.to_csv('confess_data.csv', index=False)

  dt = pd.to_datetime(datetime)
  df['datetime'] = pd.to_datetime(df['datetime'])


### Vader

In [None]:
# perform vader analysis on 'confess_data.csv' - not all rows were able to be analysed
# save it as 'confess_data_vader.csv'

# Read the csv
confess_df = pd.read_csv('confess_data.csv')
display(confess_df)

# Apply vader to every row whose confession is actual text.
# Missing confessions are read back from the CSV as NaN (a float) and cannot be
# scored, so they are skipped. isinstance() is used so the check does not depend
# on the builtin name `type`, which is easy to shadow in a shared notebook namespace.
vader_results = {}
for post_id, confession in zip(confess_df['id'], confess_df['confession']):
    if isinstance(confession, str):
        vader_results[post_id] = analyzer.polarity_scores(confession)

# create df to store the analysis results: one row per id, one column per score
vader_df = pd.DataFrame(vader_results).T
display(vader_df)

# combine the dfs; a left merge keeps the rows that could not be scored (scores stay NaN)
confess_df_vader = pd.merge(confess_df, vader_df, left_on='id', right_index=True, how='left')

# TODO: wrangle datetime


# create a new csv to store the vader df
confess_df_vader.to_csv('confess_data_vader.csv', index=False)
display(confess_df_vader)

In [None]:
# perform visualisation on the data
!pip install plotly.express

In [None]:
import plotly.express as px

# Histogram of the VADER 'pos' score over every row, grouped into 20 bins.
hist = px.histogram(data_frame=confess_df_vader, x='pos', nbins=20)
hist.show()


In [None]:
import seaborn as sns

# Histogram of the VADER 'neg' score over every row, grouped into 20 bins.
bar = sns.histplot(confess_df_vader, x='neg', bins=20)

### ML - Multilingual Sentiment Analysis (doesn't work, max tokens exceeded, can only take in 512)
https://huggingface.co/tabularisai/multilingual-sentiment-analysis

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# High-level helper: a text-classification pipeline backed by the multilingual sentiment model.
pipe = pipeline(
    task="text-classification",
    model="tabularisai/multilingual-sentiment-analysis",
)

# Run the pipeline on one sample sentence and show what it returns.
sentence = "I love this product! It's amazing and works perfectly."
result = pipe(sentence)
print(result)

In [None]:
# cannot use the pipeline as it has restriction on max tokens

# # Read the csv
# confess_df = pd.read_csv('confess_data.csv')
# display(confess_df)
# ml_results = {}

# # Apply the ML pipeline
# for i, row in confess_df.iterrows():
#     confession = row['confession']
#     id = row['id']
#     if type(confession) != float: # some confessions cannot be analysed as they are NaN, which is float data type
#         ml_results[id] = pipe(confession) # perform ML sentiment analysis on confession
    
# # create df to store the analysis results
# ml_df = pd.DataFrame(ml_results).T
# display(ml_df)

# # combine the dfs
# # confess_df = confess_df.join(vader_df)
# confess_df_ml = pd.merge(confess_df, ml_df, left_on='id', right_index=True, how='left')

# # create a new csv to store the ml df
# confess_df_ml.to_csv('confess_data_ml.csv', index=False)
# display(confess_df_ml)

In [None]:
# need to use without pipeline to change the max_length of tokens
# tried to tweak the max_length of tokens, but it cannot be extended, max can only be 512

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Hugging Face checkpoint that provides both the tokenizer and the classifier.
model_name = "tabularisai/multilingual-sentiment-analysis"

# Load the classifier and its matching tokenizer from that checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# id -> per-label score dict, filled in by the scoring loop.
ml_results = dict()


def predict_sentiment(texts, max_length=512):
    """Return the per-label sentiment probabilities for the first input text.

    Args:
        texts: A single string (or a list of strings) handed to the module-level
            `tokenizer`. Only the scores of the first text are returned.
        max_length: Token limit applied when truncating. Defaults to 512, which
            this notebook records as the most the model can take in; a larger
            value would let over-long texts through untruncated.

    Returns:
        dict mapping each of the five sentiment labels ("Very Negative" ...
        "Very Positive") to its softmax probability as a Python float.
    """
    sentiment_map = {0: "Very Negative", 1: "Negative", 2: "Neutral", 3: "Positive", 4: "Very Positive"}

    # truncation=True cuts anything longer than max_length instead of failing on it
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=max_length)

    # inference only - no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)

    # logits -> probabilities across the label dimension
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # callers only ever received the first text's scores, so only those are built
    first_probs = probabilities[0]
    return {sentiment_map[i]: float(prob) for i, prob in enumerate(first_probs)}

# print(predict_sentiment('helloooo'))



# Read the csv
confess_df = pd.read_csv('confess_data.csv')
display(confess_df)

# Apply the ML model to every row whose confession is actual text.
# Missing confessions are read back from the CSV as NaN (a float) and cannot be
# scored, so they are skipped. isinstance() is used so the check does not depend
# on the builtin name `type`, which is easy to shadow in a shared notebook namespace.
ml_results = {}
for post_id, confession in zip(confess_df['id'], confess_df['confession']):
    if isinstance(confession, str):
        ml_results[post_id] = predict_sentiment(confession)


# create df to store the analysis results: one row per id, one column per label
ml_df = pd.DataFrame(ml_results).T
display(ml_df)

# combine the dfs; a left merge keeps the rows that could not be scored (scores stay NaN)
confess_df_ml = pd.merge(confess_df, ml_df, left_on='id', right_index=True, how='left')

# create a new csv to store the ml df
confess_df_ml.to_csv('confess_data_ml.csv', index=False)
display(confess_df_ml)

