In [26]:
import json
import pandas as pd
from pathlib import Path
from textblob import TextBlob
import re
import emoji

In [27]:
data = []

# Read the JSON Lines export one record per line. Blank lines are ignored and
# malformed rows are reported (with their 1-based line number) but do not
# abort the load.
with open("../json_files/posts.jsonl", "r", encoding="utf-8") as fh:
    for line_no, raw_line in enumerate(fh, start=1):
        record_text = raw_line.strip()
        if record_text:
            try:
                record = json.loads(record_text)
            except json.JSONDecodeError as err:
                print(f"Skipping bad JSON on line {line_no}: {err}")
            else:
                data.append(record)


# Cleaning data
LABEL_COL = "id"
TEXT_COL = "text"

# Build the working frame from the records parsed out of the JSONL file.
# Without this line `df` only exists if it leaked in from an earlier kernel
# session, and the cell raises NameError under Restart & Run All.
df = pd.DataFrame(data)

# Keep only rows that have both an id and a text value. The explicit .copy()
# makes the result an independent frame, so the column assignments that
# follow do not write into a slice of the unfiltered frame.
df = df[df[LABEL_COL].notna() & df[TEXT_COL].notna()].copy()


def preprocess_text(t: str) -> str:
    """Normalize a raw post for text analysis.

    Steps, in order: emojis are replaced by their text names (delimited by
    spaces rather than colons), the text is lowercased, URLs and user
    mentions are removed, and runs of whitespace collapse to a single space.

    Args:
        t: Raw post text.

    Returns:
        The cleaned, lowercased text with no leading or trailing whitespace.
    """
    t = emoji.demojize(t, delimiters=(" ", " "))  # convert emojis to text
    t = t.lower()

    # Remove URLs. The match must start at a word boundary and begin with
    # "http://", "https://" or "www."; an unanchored "www\S+" would also eat
    # ordinary words such as "awwww".
    t = re.sub(r"\b(?:https?://|www\.)\S+", "", t)

    # Remove user mentions (Reddit "u/name" / Twitter "@name" style). The
    # lookbehinds require the mention to start a token, so "you/me" and
    # e-mail addresses like "foo@bar.com" are left intact.
    t = re.sub(r"(?<!\w)u/\w+|(?<!\w)@\w+", "", t)

    # Collapse whitespace left behind by the removals above.
    t = re.sub(r"\s+", " ", t).strip()

    return t


# Run every post through preprocess_text and store the result next to the raw text.
cleaned_text = df[TEXT_COL].apply(preprocess_text)
df["text_clean"] = cleaned_text




Unnamed: 0,id,text,created_at,vote_total,comment_count,alias,group_id,index_code,text_length,comment_ratio,high_engagement,created_hour,created_day,text_clean
0,cc3048dc-8259-43db-9f90-19e7cc53ea15,it’s happening!,2025-12-03 15:27:21.559000+00:00,92,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,XWllrpGP,15,0.0,0,15,2025-12-03,it’s happening!
1,f293eaa5-c405-405d-9b07-bfe84a7626de,I feel a sense of accomplishment when I have more minutes listened than ppl ...,2025-12-03 14:49:19.375000+00:00,86,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,0VHjUus2,121,0.0,0,14,2025-12-03,i feel a sense of accomplishment when i have more minutes listened than ppl ...
2,ac5887f9-1ed2-4610-a138-31eb6535065d,Life would be so much better if they had diet coke on campus,2025-12-03 13:53:26.506000+00:00,58,5,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,oNVugCvN,60,0.084746,1,13,2025-12-03,life would be so much better if they had diet coke on campus
3,fcc8b55f-319b-4748-af40-ec39ca98c8eb,They won’t bat an eye at the sex toy you ordered dw,2025-12-03 02:33:16.329000+00:00,149,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,JQe0aPio,51,0.0,0,2,2025-12-03,they won’t bat an eye at the sex toy you ordered dw
4,7926355f-f26c-4531-a026-a4251d5a39fe,Can raise tuition but can’t even put those damn lights on the Chambers Trees,2025-12-03 03:07:56.006000+00:00,133,1,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,rIGwEFCb,76,0.007463,0,3,2025-12-03,can raise tuition but can’t even put those damn lights on the chambers trees


In [28]:
# --- Engagement features ---------------------------------------------------

# Character count of the raw post text (same column the cleaning step reads).
df["text_length"] = df[TEXT_COL].str.len()

# Comments per vote. The +1 keeps posts with zero votes from dividing by zero.
# Votes are clipped at 0 first: if vote_total is a net score it can be
# negative, and a value of -1 would otherwise make the denominator 0
# (inf/NaN ratios), while lower values would yield negative ratios.
df["comment_ratio"] = df["comment_count"] / (df["vote_total"].clip(lower=0) + 1)

# Label the top quartile by comment_ratio as high engagement. Ties at the
# threshold are included, so if the 75th percentile is 0 every post is
# labelled 1.
threshold = df["comment_ratio"].quantile(0.75)
df["high_engagement"] = (df["comment_ratio"] >= threshold).astype(int)

# Time features, taken from the timestamp as parsed (no timezone conversion
# is applied here).
df["created_at"] = pd.to_datetime(df["created_at"])
df["created_hour"] = df["created_at"].dt.hour
df["created_day"] = df["created_at"].dt.date

# Persist the enriched frame; create the output directory on first run so
# to_csv does not fail on a fresh checkout.
output_path = Path("../csv_files/yikyak_engagement.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# TODO: day-of-week / is_weekend features were sketched here but are not
# computed yet.
df.head()

Unnamed: 0,id,text,created_at,vote_total,comment_count,alias,group_id,index_code,text_length,comment_ratio,high_engagement,created_hour,created_day,text_clean
0,cc3048dc-8259-43db-9f90-19e7cc53ea15,it’s happening!,2025-12-03 15:27:21.559000+00:00,92,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,XWllrpGP,15,0.0,0,15,2025-12-03,it’s happening!
1,f293eaa5-c405-405d-9b07-bfe84a7626de,I feel a sense of accomplishment when I have more minutes listened than ppl ...,2025-12-03 14:49:19.375000+00:00,86,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,0VHjUus2,121,0.0,0,14,2025-12-03,i feel a sense of accomplishment when i have more minutes listened than ppl ...
2,ac5887f9-1ed2-4610-a138-31eb6535065d,Life would be so much better if they had diet coke on campus,2025-12-03 13:53:26.506000+00:00,58,5,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,oNVugCvN,60,0.084746,1,13,2025-12-03,life would be so much better if they had diet coke on campus
3,fcc8b55f-319b-4748-af40-ec39ca98c8eb,They won’t bat an eye at the sex toy you ordered dw,2025-12-03 02:33:16.329000+00:00,149,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,JQe0aPio,51,0.0,0,2,2025-12-03,they won’t bat an eye at the sex toy you ordered dw
4,7926355f-f26c-4531-a026-a4251d5a39fe,Can raise tuition but can’t even put those damn lights on the Chambers Trees,2025-12-03 03:07:56.006000+00:00,133,1,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,rIGwEFCb,76,0.007463,0,3,2025-12-03,can raise tuition but can’t even put those damn lights on the chambers trees
