In [2]:
import os
import re
import json
import pandas as pd
import numpy as np

In [5]:

# Widen pandas' display limits up front so previews of the wide frame stay readable
for _option_name, _option_value in (
    ("display.max_columns", None),
    ("display.width", 200),
    ("display.max_colwidth", 80),  # use None instead to see full text
):
    pd.set_option(_option_name, _option_value)

# Load the merged JSON dump into a DataFrame
df = pd.read_json('../json_files/merged_file.json')

In [6]:
# Cleaning data: column names used by the cleaning steps
TEXT_COL = "text"
LABEL_COL = "id"

# Keep only the rows in which both the label column and the text column are present
has_label = df[LABEL_COL].notna()
has_text = df[TEXT_COL].notna()
df = df.loc[has_label & has_text]


# Patterns are compiled once and assume the text has already been lowercased.
# URLs: require a real scheme ("http://", "https://") or a "www." prefix at a word
# boundary, so ordinary words that merely contain "http" or "www" are left alone.
_URL_RE = re.compile(r"\bhttps?://\S+|\bwww\.\S+")
# Mentions: "u/name" or "/u/name" (hyphens allowed in the name) and "@name".
# The lookbehinds prevent matches inside words ("menu/item") and inside e-mail
# addresses ("john@example.com").
_MENTION_RE = re.compile(r"(?<![\w/])/?u/[\w-]+|(?<!\w)@\w+")
_WHITESPACE_RE = re.compile(r"\s+")


def preprocess_text(t: str) -> str:
    """Normalise a piece of post text for downstream modelling.

    Steps, in order: lowercase, strip URLs, strip user mentions
    (Reddit ``u/name`` / ``/u/name`` and Twitter-style ``@name``), then collapse
    every run of whitespace to a single space and trim both ends.

    Args:
        t: Raw text. Must be a ``str``.

    Returns:
        The cleaned, lowercased text; an empty string if nothing remains.
    """
    t = t.lower()

    # remove URLs
    t = _URL_RE.sub("", t)

    # remove user mentions (Reddit / Twitter style)
    t = _MENTION_RE.sub("", t)

    # remove extra whitespace left behind by the deletions above
    return _WHITESPACE_RE.sub(" ", t).strip()


# Run the text normaliser over every row and keep the result next to the raw column
cleaned_text = df[TEXT_COL].apply(preprocess_text)
df["text_clean"] = cleaned_text

# Preview the first rows to sanity-check the cleaning
df.head()



Unnamed: 0,id,text,created_at,vote_total,comment_count,alias,group_id,index_code,controversial_flag,text_clean
0,1pl1fvp,Hiker rescued after being stuck in quicksand for hours amid freezing temps i...,2025-12-12 19:54:28+00:00,152,35,Anonymous,0dab5baa-0931-4d9c-bbdf-0de758fca36f,HPqSy4as,0,hiker rescued after being stuck in quicksand for hours amid freezing temps i...
1,1pkzill,"EU agrees to indefinitely freeze Russian assets, removing obstacle to Ukrain...",2025-12-12 18:36:28+00:00,1550,43,Anonymous,0dab5baa-0931-4d9c-bbdf-0de758fca36f,ivJCOQ3f,0,"eu agrees to indefinitely freeze russian assets, removing obstacle to ukrain..."
2,1pkyuq1,Health Experts Slam Possible FDA ‘Black Box’ Warning for COVID Vaccines,2025-12-12 18:10:19+00:00,510,62,Anonymous,0dab5baa-0931-4d9c-bbdf-0de758fca36f,PdAmBfSd,0,health experts slam possible fda ‘black box’ warning for covid vaccines
3,1pkypnv,"Iran arrests Nobel Peace Prize laureate Narges Mohammadi, supporters say",2025-12-12 18:04:43+00:00,217,9,Anonymous,0dab5baa-0931-4d9c-bbdf-0de758fca36f,xImO7S8o,0,"iran arrests nobel peace prize laureate narges mohammadi, supporters say"
4,1pkyj4z,Fired Michigan football coach Sherrone Moore is charged with three crimes,2025-12-12 17:57:57+00:00,350,38,Anonymous,0dab5baa-0931-4d9c-bbdf-0de758fca36f,lGXpOMcU,0,fired michigan football coach sherrone moore is charged with three crimes


In [7]:
# Engagement features: a comments-per-upvote ratio, a top-quartile flag derived
# from it, and posting-time features; the result is exported to CSV.
REPLY_COL = "comment_count"
UPVOTE_COL = "vote_total"

# Comments per upvote. The +1 keeps zero-vote posts from dividing by zero; clipping
# the vote total at 0 first extends that guarantee to negative totals (a total of -1
# would otherwise give a zero denominator and an inf/NaN ratio that distorts the
# quantile below). Non-negative totals are unaffected.
df["comment_upvote_ratio"] = df[REPLY_COL] / (df[UPVOTE_COL].clip(lower=0) + 1)

# Flag rows whose ratio is at or above the 75th percentile of the ratio column.
threshold = df["comment_upvote_ratio"].quantile(0.75)
df["high_engagement"] = (df["comment_upvote_ratio"] >= threshold).astype(int)

# Drop the controversial_flag column; errors="ignore" makes this a no-op when the
# column is already absent, so the cell can be re-run safely.
df = df.drop(columns=["controversial_flag"], errors="ignore")

# Posting-time features. The .dt accessor requires created_at to be a datetime column.
df["hour"] = df["created_at"].dt.hour
df["day_of_week"] = df["created_at"].dt.dayofweek  # pandas: Monday=0 ... Sunday=6
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)  # 5=Saturday, 6=Sunday

# Pass in as CSV file. to_csv does not create missing directories, so make sure
# the target folder exists first.
output_csv = "../csv_files/engagement_reddit.csv"
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df.to_csv(output_csv, index=False)

# Last expression of the cell so the preview is actually rendered.
df.head()

