In [1]:
import json
import pandas as pd
from pathlib import Path
from textblob import TextBlob
import re
import emoji

In [4]:
# Load the JSON Lines export: one JSON document per non-blank line.
# Malformed lines are reported (with their 1-based line number) and skipped
# so that one bad record does not abort the whole load.
data = []

with open("../json_files/posts.jsonl", "r", encoding="utf-8") as f:
    for i, raw_line in enumerate(f, start=1):
        payload = raw_line.strip()
        if payload:  # blank lines carry no record
            try:
                record = json.loads(payload)
            except json.JSONDecodeError as e:
                print(f"Skipping bad JSON on line {i}: {e}")
            else:
                data.append(record)


# Cleaning data

# Column names used throughout the rest of the notebook.
TEXT_COL = "text"
LABEL_COL = "id"
REPLY_COL = "comment_count"
UPVOTE_COL = "vote_total"

# One row per successfully parsed JSON record.
df = pd.DataFrame(data)

# Keep only rows where both the id and the text are present.
df = df.dropna(subset=[LABEL_COL, TEXT_COL]).copy()

# Compiled once instead of on every call. Each range below is a block of
# symbols/pictographs; no range spans alphabetic or ideographic scripts.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols
    "\U0001FA70-\U0001FAFF"  # extended symbols
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U000024C2"             # circled M
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002900-\U0000297F"  # supplemental arrows-B
    "\U00002B00-\U00002BFF"  # miscellaneous symbols & arrows (stars, circles)
    "\U00003030\U0000303D\U00003297\U00003299"  # CJK-block symbol emoji
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F1FF"  # enclosed alphanumeric supplement, incl. flags
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "]+"
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbol characters removed.

    The previous pattern contained the single catch-all range
    ``U+24C2``-``U+1F251``. That range also covers CJK ideographs, Kana,
    Hangul, fullwidth Latin letters, Braille and many other non-emoji
    characters, so ordinary text in those scripts was deleted and any count
    derived from ``len(text) - len(remove_emojis(text))`` was inflated.
    The catch-all is replaced by explicit symbol ranges, all of which lie
    inside the set the old pattern removed, so nothing that used to be kept
    is now stripped.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` without the matched characters. All other characters,
        including whitespace around a removed emoji, are left untouched.
    """
    return _EMOJI_PATTERN.sub("", text)



def preprocess_text(t: str) -> str:
    """Normalise a post's text to lowercase ASCII letters and single spaces.

    Steps, in order:
      1. lowercase;
      2. strip emoji via ``remove_emojis``;
      3. drop URLs (tokens starting with ``http`` or ``www``);
      4. drop ``u/name`` and ``@name`` mentions;
      5. replace every character that is not ``a``-``z`` or whitespace with a
         space (so digits and punctuation, including apostrophes, become
         separators: "it's" -> "it s");
      6. collapse whitespace runs and trim the ends.

    Parameters
    ----------
    t : str
        Raw post text.

    Returns
    -------
    str
        Cleaned text; may be the empty string if nothing but symbols,
        URLs or mentions was present.
    """
    t = t.lower()
    t = remove_emojis(t)
    t = re.sub(r"http\S+|www\S+", "", t)
    # (?<!\w) requires "u/" to start a token. Without it the pattern also
    # matched inside ordinary words, e.g. "you/me" lost "u/me" and became "yo".
    t = re.sub(r"(?<!\w)u/\w+|@\w+", "", t)
    t = re.sub(r"[^a-z\s]", " ", t)  # keep letters only
    t = re.sub(r"\s+", " ", t).strip()
    return t

# --- Feature engineering ---
df["text_clean"] = df[TEXT_COL].apply(preprocess_text)

# Parse timestamps up front; values that cannot be parsed become NaT.
df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")

# Drop unusable rows (no timestamp, or nothing left after cleaning) BEFORE
# deriving dataset-relative features. Previously the 75th-percentile
# threshold below was computed while these rows were still present, so
# `high_engagement` was not relative to the rows that are actually exported.
# .copy() gives an independent frame for the column assignments that follow.
df = df[df["created_at"].notna() & (df["text_clean"].str.strip() != "")].copy()

df["text_length"] = df["text_clean"].str.len()
df["sentiment"] = df["text_clean"].apply(lambda x: TextBlob(x).sentiment.polarity)

# Comments per (votes + 1); the +1 keeps the denominator non-zero when
# vote_total is 0.
# NOTE(review): a vote_total of exactly -1 would still divide by zero and
# negative totals flip the sign - confirm whether vote_total can be negative.
df["comment_ratio"] = df[REPLY_COL] / (df[UPVOTE_COL] + 1)
threshold = df["comment_ratio"].quantile(0.75)
# 1 for rows at or above the 75th percentile of comment_ratio, else 0.
df["high_engagement"] = (df["comment_ratio"] >= threshold).astype(int)

df["created_hour"] = df["created_at"].dt.hour
df["created_day"] = df["created_at"].dt.date
df["day_of_week"] = df["created_at"].dt.dayofweek
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)  # dayofweek: Monday=0, so 5/6 = Sat/Sun

# Number of characters (code points) that remove_emojis strips from the raw
# text - not the number of visible emoji glyphs.
df["emoji_count"] = df[TEXT_COL].apply(lambda x: len(x) - len(remove_emojis(x)))



In [5]:

# Write the engineered frame to CSV; index=False leaves the row index out of
# the file.
df.to_csv("../csv_files/emoji_test.csv", index=False)

# Last expression of the cell: renders the first rows as the cell output.
df.head()
# The created_hour / day_of_week / is_weekend columns are already derived in
# the feature-engineering cell, so nothing is recomputed here.

Unnamed: 0,id,text,created_at,vote_total,comment_count,alias,group_id,index_code,text_clean,text_length,sentiment,comment_ratio,high_engagement,created_hour,created_day,day_of_week,is_weekend,emoji_count
0,cc3048dc-8259-43db-9f90-19e7cc53ea15,it’s happening!,2025-12-03 15:27:21.559000+00:00,92,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,XWllrpGP,it s happening,14,0.0,0.0,0,15,2025-12-03,2,0,0
1,f293eaa5-c405-405d-9b07-bfe84a7626de,I feel a sense of accomplishment when I have m...,2025-12-03 14:49:19.375000+00:00,86,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,0VHjUus2,i feel a sense of accomplishment when i have m...,121,0.5,0.0,0,14,2025-12-03,2,0,0
2,ac5887f9-1ed2-4610-a138-31eb6535065d,Life would be so much better if they had diet ...,2025-12-03 13:53:26.506000+00:00,58,5,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,oNVugCvN,life would be so much better if they had diet ...,60,0.5,0.084746,1,13,2025-12-03,2,0,0
3,fcc8b55f-319b-4748-af40-ec39ca98c8eb,They won’t bat an eye at the sex toy you order...,2025-12-03 02:33:16.329000+00:00,149,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,JQe0aPio,they won t bat an eye at the sex toy you order...,51,0.0,0.0,0,2,2025-12-03,2,0,0
4,7926355f-f26c-4531-a026-a4251d5a39fe,Can raise tuition but can’t even put those dam...,2025-12-03 03:07:56.006000+00:00,133,1,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,rIGwEFCb,can raise tuition but can t even put those dam...,76,0.0,0.007463,0,3,2025-12-03,2,0,0
