In [1]:
import os
import re
import json
import pandas as pd
import numpy as np

In [2]:
# Notebook display settings: never truncate the column list, allow wide
# output, and cap each cell's rendered text at 80 characters.
pd.set_option("display.max_colwidth", 80)  # set to None to see full text
pd.set_option("display.width", 200)
pd.set_option("display.max_columns", None)

# Load the merged JSON export into the working DataFrame.
df = pd.read_json('../json_files/merged_file.json')

In [3]:
# Cleaning data

LABEL_COL = "id"
TEXT_COL = "text"

# Keep only rows that have both a label and a text value.
# The explicit .copy() makes `df` an independent frame rather than a
# boolean-mask selection of the loaded data, so adding columns to it later
# does not raise SettingWithCopyWarning (or write to a temporary).
has_label_and_text = df[LABEL_COL].notna() & df[TEXT_COL].notna()
df = df[has_label_and_text].copy()


def preprocess_text(t: str) -> str:
    """Normalize a raw text string.

    Steps, in order:
      1. lowercase the text;
      2. remove URLs (``http://…``, ``https://…``, ``www.…``);
      3. remove user mentions (Reddit ``u/name`` or ``/u/name``,
         Twitter-style ``@name``);
      4. collapse every run of whitespace to one space and trim both ends.

    Args:
        t: Raw text to clean.

    Returns:
        The cleaned, lowercased text (possibly an empty string).
    """
    t = t.lower()

    # Remove URLs. The patterns are anchored on a word boundary and require
    # "://" or "www." so ordinary words that merely contain "http" or "www"
    # (e.g. "httpd", "awwww") are left intact.
    t = re.sub(r"\bhttps?://\S+|\bwww\.\S+", "", t)

    # Remove user mentions. The (?<!\w) guard stops the pattern from firing
    # inside words ("you/me") or e-mail addresses ("foo@bar.com"); the optional
    # leading slash and the hyphen in the character class cover "/u/name" and
    # hyphenated Reddit usernames without leaving fragments behind.
    t = re.sub(r"(?<!\w)/?u/[\w-]+|(?<!\w)@\w+", "", t)

    # Collapse whitespace left behind by the removals above.
    t = re.sub(r"\s+", " ", t).strip()

    return t


# Run the text normalizer over every row and store the result alongside the
# original text as a new "text_clean" column.
cleaned_text = df[TEXT_COL].apply(preprocess_text)
df["text_clean"] = cleaned_text


In [4]:
REPLY_COL = "comment_count"
UPVOTE_COL = "vote_total"

# Comments per vote. Vote totals are clipped at zero before the +1 smoothing so
# the denominator is always >= 1: a total of -1 would otherwise divide by zero
# (inf/NaN) and any lower total would flip the sign of the ratio.
df["comment_upvote_ratio"] = df[REPLY_COL] / (df[UPVOTE_COL].clip(lower=0) + 1)

# Binary target: 1 for rows at or above the 75th percentile of the ratio.
# Rows whose ratio is NaN compare False and are therefore labelled 0.
threshold = df["comment_upvote_ratio"].quantile(0.75)
df["high_engagement"] = (df["comment_upvote_ratio"] >= threshold).astype(int)

# Drop the flag column if it is present; errors="ignore" makes this a no-op otherwise.
df = df.drop(columns=["controversial_flag"], errors="ignore")

# Calendar features derived from the creation timestamp.
# NOTE(review): the .dt accessor requires "created_at" to already be a datetime
# column — presumably pd.read_json parsed it on load; confirm for this dataset.
df["hour"] = df["created_at"].dt.hour
df["day_of_week"] = df["created_at"].dt.dayofweek  # Monday=0 ... Sunday=6
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)


In [5]:
# Export the feature table as CSV (row index omitted).
# Create the output folder first: to_csv does not create missing directories,
# so without this the cell fails on a checkout where ../csv_files is absent.
os.makedirs("../csv_files", exist_ok=True)
df.to_csv("../csv_files/engagement_reddit.csv", index=False)
print("CSV created, locate in csv_files folder")