In [None]:
#Import & Load Raw Reddit Data
import json
import pandas as pd
import re
from pathlib import Path
from textblob import TextBlob

# Read the scraped Reddit posts from disk and build a DataFrame from them
raw_file = Path("../data/reddit_raw/reddit_posts.json")

# Parse the whole file in one go; the path object handles open/close itself
data = json.loads(raw_file.read_text(encoding="utf-8"))

df = pd.DataFrame(data)
df.shape, df.columns


((100, 8),
 Index(['subreddit', 'title', 'score', 'id', 'url', 'num_comments', 'created',
        'text'],
       dtype='object'))

In [None]:
#Text Cleaning Function
def clean_text(text):
    """Normalize a raw post string for sentiment analysis.

    Strips URLs, drops every character that is not an ASCII letter, digit,
    or whitespace, and collapses runs of whitespace into single spaces.

    Args:
        text: The raw text. Non-string values (``None``, ``NaN`` from a
            pandas column, numbers) are treated as missing.

    Returns:
        The cleaned string, or ``""`` when the input is missing or empty.
    """
    # NaN is a truthy float, so a plain `if not text` lets it through to
    # re.sub, which then raises TypeError; check the type instead.
    if not isinstance(text, str) or not text:
        return ""
    # Require the scheme separator so ordinary words that merely start with
    # "http" (e.g. "httpd") are not deleted along with real links.
    text = re.sub(r"https?://\S+", "", text)            # Remove URLs
    text = re.sub(r"[^A-Za-z0-9\s]+", "", text)         # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()            # Normalize whitespace
    return text


In [3]:
#Apply Cleaning and Drop Empty Posts
# Merge title and body so the sentiment step sees the whole post
combined = df["title"].fillna("") + " " + df["text"].fillna("")
df["full_text"] = combined.map(clean_text)

# Keep only posts that still have text after cleaning, renumbering rows from 0
has_text = df["full_text"].str.strip().ne("")
df = df.loc[has_text].reset_index(drop=True)
df[["subreddit", "full_text"]].head()


Unnamed: 0,subreddit,full_text
0,apple,Daily Advice Thread April 13 2025 Welcome to t...
1,apple,Gurman iPadOS 19 to be more like macOS in majo...
2,apple,US Commerce Secretary says exempted electronic...
3,apple,My app became App of the Day this week heres t...
4,apple,Apple Readies Pair of Headsets While Still Loo...


In [4]:
# Sentiment Analysis with TextBlob
def get_sentiment(text, threshold=0.1):
    """Label text as "positive", "negative", or "neutral" from TextBlob polarity.

    Args:
        text: The text to score; passed straight to ``TextBlob``.
        threshold: Half-width of the neutral band around zero. Polarity
            strictly above ``threshold`` is "positive", strictly below
            ``-threshold`` is "negative", and anything else is "neutral".
            Defaults to 0.1, the cut-off previously hard-coded here.

    Returns:
        One of the strings "positive", "negative", or "neutral".
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > threshold:
        return "positive"
    if polarity < -threshold:
        return "negative"
    return "neutral"

# Tag every cleaned post with its sentiment label, then preview the result
df["sentiment"] = df["full_text"].map(get_sentiment)
df[["full_text", "sentiment"]].head()

Unnamed: 0,full_text,sentiment
0,Daily Advice Thread April 13 2025 Welcome to t...,positive
1,Gurman iPadOS 19 to be more like macOS in majo...,positive
2,US Commerce Secretary says exempted electronic...,neutral
3,My app became App of the Day this week heres t...,positive
4,Apple Readies Pair of Headsets While Still Loo...,neutral


In [5]:
#Save Sentiment Results
output_dir = Path("../data/reddit_cleaned/")
# Create the folder (and any missing parents) so a fresh checkout can run this cell
output_dir.mkdir(parents=True, exist_ok=True)

output_path = output_dir / "reddit_sentiment.csv"
df.to_csv(output_path, index=False)
print("✅ Sentiment-tagged data saved to reddit_sentiment.csv")


✅ Sentiment-tagged data saved to reddit_sentiment.csv
