In [12]:
import os
import re
import json
import pandas as pd
import numpy as np
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

In [13]:

# Widen pandas' display limits so wide frames and long text stay readable
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)
pd.set_option("display.max_colwidth", 80)  # or None to see full text

# Load the merged JSON export
df = pd.read_json('../json_files/merged_file.json')

# Keep only the rows that have both an id and a text body
LABEL_COL = "id"
TEXT_COL = "text"
df = df.dropna(subset=[LABEL_COL, TEXT_COL])

# NOTE: not the last expression of the cell, so Jupyter does not render this preview
df.head()

def preprocess_text(t: str) -> str:
    """Normalize a post/comment body for token-based feature extraction.

    Steps, in order: lowercase, remove URLs, remove user mentions
    (``u/name``, ``/u/name``, ``@name``), collapse every whitespace run to a
    single space and trim both ends.

    Args:
        t: Raw text. Must be a ``str``.

    Returns:
        The cleaned, lowercased text (may be empty).
    """
    t = t.lower()

    # remove URLs; requiring "://" or "www." keeps words such as "awww" intact
    t = re.sub(r"\bhttps?://\S+|\bwww\.\S+", "", t)

    # remove user mentions (Reddit / Twitter style); the lookbehinds stop the
    # pattern from eating "u/me" out of "you/me" or the domain out of an email
    t = re.sub(r"(?<!\w)/?u/[\w-]+|(?<!\w)@\w+", "", t)

    # remove extra whitespace
    t = re.sub(r"\s+", " ", t).strip()

    return t

# Cleaned text used by the token-based features below (TEXT_COL is defined with the cleaning constants above)
df["text_clean"] = df[TEXT_COL].apply(preprocess_text)

REPLY_COL = "comment_count"
UPVOTE_COL = "vote_total"
# The +1 keeps the ratio defined for posts whose vote total is 0. A vote total
# of exactly -1 would still make the denominator zero and yield +/-inf, so
# those results are mapped to NaN instead of leaking infinities into the output.
df["comment_upvote_ratio"] = (
    df[REPLY_COL] / (df[UPVOTE_COL] + 1)
).replace([np.inf, -np.inf], np.nan)


# Adding sentiment analysis
# TextBlob polarity is computed on the raw text, not on text_clean
df["sentiment"] = df[TEXT_COL].apply(lambda x: TextBlob(x).sentiment.polarity)
nltk.download("vader_lexicon")
vader = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/DAVIDSON/ridoctor/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [14]:
def vader_features(text):
    """Score ``text`` with the module-level ``vader`` analyzer.

    Returns a ``pd.Series`` holding the "neg", "neu", "pos" and "compound"
    entries of ``vader.polarity_scores(text)``, each relabelled with a
    ``vader_`` prefix.
    """
    polarity = vader.polarity_scores(text)
    labels = ["vader_neg", "vader_neu", "vader_pos", "vader_compound"]
    values = [
        polarity["neg"],
        polarity["neu"],
        polarity["pos"],
        polarity["compound"],
    ]
    return pd.Series(values, index=labels)

# One row of VADER scores per post, computed on the cleaned text
vader_df = df["text_clean"].apply(vader_features)
# Drop any vader_* columns left over from a previous run of this cell so that
# re-running it does not produce duplicate column labels.
df = pd.concat(
    [df.drop(columns=vader_df.columns, errors="ignore"), vader_df],
    axis=1,
)



In [15]:
# Adding conflict word count
CONFLICT_WORDS = [
    "but", "however", "actually", "wrong", "disagree",
    "no", "not", "never", "false"
]

def conflict_features(text):
    """Compute simple lexical "conflict" signals for one piece of text.

    Tokens are the whitespace-separated pieces of ``text``. For the word match,
    leading/trailing punctuation is stripped and case is folded, so "wrong,",
    "No!" and "BUT" all count.

    Args:
        text: The text to analyse (``str``).

    Returns:
        pd.Series with:
          - conflict_count: number of tokens that are in CONFLICT_WORDS
          - has_conflict: 1 if conflict_count > 0, else 0
          - exclamations: number of "!" characters
          - questions: number of "?" characters
          - all_caps_ratio: upper-case tokens / (token count + 1). This is
            always 0 when ``text`` has already been lowercased.
    """
    tokens = text.split()
    vocabulary = set(CONFLICT_WORDS)
    # Strip punctuation glued to either end of a token before matching
    words = [re.sub(r"^\W+|\W+$", "", tok).lower() for tok in tokens]
    conflict_count = sum(w in vocabulary for w in words)
    return pd.Series({
        "conflict_count": conflict_count,
        "has_conflict": int(conflict_count > 0),
        "exclamations": text.count("!"),
        "questions": text.count("?"),
        # +1 keeps the ratio defined for empty text
        "all_caps_ratio": sum(w.isupper() for w in tokens) / (len(tokens) + 1)
    })

conflict_df = df["text_clean"].apply(conflict_features)

# text_clean is lowercased by preprocess_text, so an all-caps ratio computed
# from it is always 0. Recompute that one feature from the raw text, using the
# same formula: upper-case tokens / (token count + 1).
raw_tokens = df[TEXT_COL].str.split()
conflict_df["all_caps_ratio"] = (
    raw_tokens.apply(lambda toks: sum(w.isupper() for w in toks))
    / (raw_tokens.str.len() + 1)
)

# Drop columns left over from a previous run of this cell so that re-running
# it does not produce duplicate column labels.
df = pd.concat(
    [df.drop(columns=conflict_df.columns, errors="ignore"), conflict_df],
    axis=1,
)


# Sentiment strength regardless of direction
df["abs_vader_compound"] = df["vader_compound"].abs()
df["abs_sentiment"] = df["sentiment"].abs()  # TextBlob polarity

In [16]:
DISAGREE_WORDS = [
    "disagree", "wrong", "false", "misleading",
    "no", "not", "never", "nonsense", "ridiculous"
]

def disagreement_features(text):
    """Count explicit disagreement words in one piece of text.

    Tokens are the whitespace-separated pieces of ``text``; before matching,
    leading/trailing punctuation is stripped and case is folded, so "wrong."
    and "No!" count. Every occurrence is counted ("no no" -> 2), matching how
    ``conflict_count`` is computed.

    Args:
        text: The text to analyse (``str``).

    Returns:
        pd.Series with:
          - disagree_count: number of tokens that are in DISAGREE_WORDS
          - has_disagree: 1 if disagree_count > 0, else 0
    """
    vocabulary = set(DISAGREE_WORDS)
    # Strip punctuation glued to either end of a token before matching
    words = (re.sub(r"^\W+|\W+$", "", tok).lower() for tok in text.split())
    count = sum(w in vocabulary for w in words)
    return pd.Series({
        "disagree_count": count,
        "has_disagree": int(count > 0)
    })

# One row of disagreement features per post, computed on the cleaned text
disagree_df = df["text_clean"].apply(disagreement_features)
# Drop columns left over from a previous run of this cell so that re-running
# it does not produce duplicate column labels.
df = pd.concat(
    [df.drop(columns=disagree_df.columns, errors="ignore"), disagree_df],
    axis=1,
)


In [17]:
# Calendar features taken from the post timestamp
created = df["created_at"].dt
df["hour"] = created.hour
df["day_of_week"] = created.dayofweek
weekend_mask = df["day_of_week"].isin([5, 6])  # pandas: 5 = Saturday, 6 = Sunday
df["is_weekend"] = weekend_mask.astype(int)

# Length of the cleaned text in whitespace-separated tokens
clean_text = df["text_clean"]
df["post_length"] = clean_text.str.split().str.len()

# Pronoun occurrences in the cleaned text
FIRST_PERSON_PATTERN = r"\b(i|me|my|mine|we|us|our|ours)\b"
SECOND_PERSON_PATTERN = r"\b(you|your|yours|u)\b"
df["first_person_count"] = clean_text.str.count(FIRST_PERSON_PATTERN)
df["second_person_count"] = clean_text.str.count(SECOND_PERSON_PATTERN)

# Normalise by post length; the +1 keeps the ratio defined for empty posts
length_plus_one = df["post_length"] + 1
df["first_person_ratio"] = df["first_person_count"] / length_plus_one
df["second_person_ratio"] = df["second_person_count"] / length_plus_one


In [18]:
# Write the feature table to a CSV file
OUTPUT_CSV = "../csv_files/reddit_controversal_df_features.csv"
# to_csv does not create missing directories, so make sure the target folder exists
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
df.to_csv(OUTPUT_CSV, index=False)
print("CSV created, locate in csv_files folder")