In [3]:
import os
import re
import json
import pandas as pd
import numpy as np


In [4]:
df = pd.read_csv("df_features.csv")

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from scipy.sparse import hstack, csr_matrix

In [6]:
df.head()

Unnamed: 0,id,text,created_at,vote_total,comment_count,alias,group_id,index_code,controversial_flag,text_clean,comment_upvote_ratio,sentiment
0,1pl1fvp,Hiker rescued after being stuck in quicksand f...,2025-12-12 19:54:28+00:00,152,35,Anonymous,0dab5baa-0931-4d9c-bbdf-0de758fca36f,HPqSy4as,0,hiker rescued after being stuck in quicksand f...,0.228758,0.0
1,1pkzill,EU agrees to indefinitely freeze Russian asset...,2025-12-12 18:36:28+00:00,1550,43,Anonymous,0dab5baa-0931-4d9c-bbdf-0de758fca36f,ivJCOQ3f,0,eu agrees to indefinitely freeze russian asset...,0.027724,0.0
2,1pkyuq1,Health Experts Slam Possible FDA ‘Black Box’ W...,2025-12-12 18:10:19+00:00,510,62,Anonymous,0dab5baa-0931-4d9c-bbdf-0de758fca36f,PdAmBfSd,0,health experts slam possible fda ‘black box’ w...,0.121331,-0.083333
3,1pkypnv,Iran arrests Nobel Peace Prize laureate Narges...,2025-12-12 18:04:43+00:00,217,9,Anonymous,0dab5baa-0931-4d9c-bbdf-0de758fca36f,xImO7S8o,0,iran arrests nobel peace prize laureate narges...,0.041284,0.0
4,1pkyj4z,Fired Michigan football coach Sherrone Moore i...,2025-12-12 17:57:57+00:00,350,38,Anonymous,0dab5baa-0931-4d9c-bbdf-0de758fca36f,lGXpOMcU,0,fired michigan football coach sherrone moore i...,0.108262,0.0


In [7]:
TEXT_COL = "text_clean"
LABEL_COL = "controversial_flag"
# Set aside numeric features
NUM_COLS = [
    "comment_upvote_ratio", "sentiment"   
]

use_cols = [TEXT_COL, LABEL_COL] + NUM_COLS
df_model = df[use_cols].dropna().copy()

df_model.head()


Unnamed: 0,text_clean,controversial_flag,comment_upvote_ratio,sentiment
0,hiker rescued after being stuck in quicksand f...,0,0.228758,0.0
1,eu agrees to indefinitely freeze russian asset...,0,0.027724,0.0
2,health experts slam possible fda ‘black box’ w...,0,0.121331,-0.083333
3,iran arrests nobel peace prize laureate narges...,0,0.041284,0.0
4,fired michigan football coach sherrone moore i...,0,0.108262,0.0


In [8]:
X_train_df, X_test_df = train_test_split(
    df_model,
    test_size=0.2,
    random_state=42,
    stratify=df_model[LABEL_COL]
)

y_train = X_train_df[LABEL_COL].astype(int).values
y_test  = X_test_df[LABEL_COL].astype(int).values


In [9]:
tfidf = TfidfVectorizer(
    max_features=8000,
    ngram_range=(1, 2),
    min_df=2
)

X_train_text = tfidf.fit_transform(X_train_df[TEXT_COL])
X_test_text  = tfidf.transform(X_test_df[TEXT_COL])


In [10]:
X_train_num = csr_matrix(X_train_df[NUM_COLS].astype(float).values)
X_test_num  = csr_matrix(X_test_df[NUM_COLS].astype(float).values)


In [11]:
X_train = hstack([X_train_text, X_train_num])
X_test  = hstack([X_test_text, X_test_num])


In [12]:
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced"   # helpful if classes are imbalanced
)

rf.fit(X_train, y_train)


0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
pred = rf.predict(X_test)

print("Confusion matrix:\n", confusion_matrix(y_test, pred))
print("\nReport:\n", classification_report(y_test, pred, digits=3))


Confusion matrix:
 [[882 255]
 [295   9]]

Report:
               precision    recall  f1-score   support

           0      0.749     0.776     0.762      1137
           1      0.034     0.030     0.032       304

    accuracy                          0.618      1441
   macro avg      0.392     0.403     0.397      1441
weighted avg      0.598     0.618     0.608      1441



In [15]:
# Directly address recall: there was a bad bias to the munority class.
# 295 false negatives and 9 true negatives
# Class 0 reasonably well separated, but almost never predicting class 1
# Biased towards the majority class
# Changing decision threshold, and stronger class weighting

In [1]:
probs = rf.predict_proba(X_test)[:, 1]

for t in [0.5, 0.4, 0.3, 0.25]:
    pred = (probs >= t).astype(int)
    print(f"\nThreshold = {t}")
    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred, digits=3))


NameError: name 'rf' is not defined

In [20]:
# We can see here that the model does not have an easy way of distinguishing what is controversial. 
# It can only detect controversial comments weakly
# THe issue isn't the threshold, the model is underfitting due to insufficient features.
# High bias caused by weak feature representations

# Going back into data cleaning to add more sentiment analysis (Vader) and counting conflict words

In [21]:
df = pd.read_csv("reddit_df_features.csv")

In [23]:
TEXT_COL = "text_clean"
LABEL_COL = "controversial_flag"
# Set aside numeric features
NUM_COLS = [
    "comment_upvote_ratio", "sentiment", "vader_neg", "vader_neu", "vader_pos", "vader_compound",
    "conflict_count", "has_conflict",
    "exclamations", "questions", "all_caps_ratio", "comment_count", "vote_total"  
]

use_cols = [TEXT_COL, LABEL_COL] + NUM_COLS
df_model = df[use_cols].dropna().copy()

df_model.head()

Unnamed: 0,text_clean,controversial_flag,comment_upvote_ratio,sentiment,vader_neg,vader_neu,vader_pos,vader_compound,conflict_count,has_conflict,exclamations,questions,all_caps_ratio,comment_count,vote_total
0,hiker rescued after being stuck in quicksand f...,0,0.228758,0.0,0.168,0.693,0.139,0.1027,0.0,0.0,0.0,0.0,0.0,35,152
1,eu agrees to indefinitely freeze russian asset...,0,0.027724,0.0,0.164,0.526,0.309,0.0516,0.0,0.0,0.0,0.0,0.0,43,1550
2,health experts slam possible fda ‘black box’ w...,0,0.121331,-0.083333,0.357,0.643,0.0,-0.6124,0.0,0.0,0.0,0.0,0.0,62,510
3,iran arrests nobel peace prize laureate narges...,0,0.041284,0.0,0.156,0.323,0.522,0.7783,0.0,0.0,0.0,0.0,0.0,9,217
4,fired michigan football coach sherrone moore i...,0,0.108262,0.0,0.375,0.625,0.0,-0.6597,0.0,0.0,0.0,0.0,0.0,38,350


In [24]:
X_train_df, X_test_df = train_test_split(
    df_model,
    test_size=0.2,
    random_state=42,
    stratify=df_model[LABEL_COL]
)

y_train = X_train_df[LABEL_COL].astype(int).values
y_test  = X_test_df[LABEL_COL].astype(int).values


In [25]:


tfidf = TfidfVectorizer(
    max_features=8000,
    ngram_range=(1, 2),
    min_df=2
)

X_train_text = tfidf.fit_transform(X_train_df[TEXT_COL])
X_test_text  = tfidf.transform(X_test_df[TEXT_COL])


In [26]:
X_train_num = csr_matrix(X_train_df[NUM_COLS].astype(float).values)
X_test_num  = csr_matrix(X_test_df[NUM_COLS].astype(float).values)

In [27]:
X_train = hstack([X_train_text, X_train_num])
X_test  = hstack([X_test_text, X_test_num])


In [31]:
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=5,
    class_weight={0: 1, 1: 5},
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)


0,1,2
,n_estimators,400
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:
pred = rf.predict(X_test)

print("Confusion matrix:\n", confusion_matrix(y_test, pred))
print("\nReport:\n", classification_report(y_test, pred, digits=3))


Confusion matrix:
 [[753 384]
 [227  77]]

Report:
               precision    recall  f1-score   support

           0      0.768     0.662     0.711      1137
           1      0.167     0.253     0.201       304

    accuracy                          0.576      1441
   macro avg      0.468     0.458     0.456      1441
weighted avg      0.642     0.576     0.604      1441



In [33]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

for t in [0.5, 0.4, 0.3, 0.2, 0.1]:
    preds = (probs >= t).astype(int)
    print(f"\nTHRESHOLD = {t}")
    print(confusion_matrix(y_test, preds))
    print(classification_report(y_test, preds, digits=3))



THRESHOLD = 0.5
[[883 254]
 [285  19]]
              precision    recall  f1-score   support

           0      0.756     0.777     0.766      1137
           1      0.070     0.062     0.066       304

    accuracy                          0.626      1441
   macro avg      0.413     0.420     0.416      1441
weighted avg      0.611     0.626     0.618      1441


THRESHOLD = 0.4
[[810 327]
 [265  39]]
              precision    recall  f1-score   support

           0      0.753     0.712     0.732      1137
           1      0.107     0.128     0.116       304

    accuracy                          0.589      1441
   macro avg      0.430     0.420     0.424      1441
weighted avg      0.617     0.589     0.602      1441


THRESHOLD = 0.3
[[773 364]
 [236  68]]
              precision    recall  f1-score   support

           0      0.766     0.680     0.720      1137
           1      0.157     0.224     0.185       304

    accuracy                          0.584      1441
   macro

In [34]:
# Added more features - polarity extremeness, disagreement features, engagement imbalance

In [63]:
df = pd.read_csv("reddit_df_more_features.csv")
TEXT_COL = "text_clean"
LABEL_COL = "controversial_flag"
# Set aside numeric features
NUM_COLS = [
    "hour", "day_of_week", "is_weekend",
    "comment_upvote_ratio", "sentiment", "vader_neg", "vader_neu", "vader_pos", "vader_compound",
    "conflict_count", "has_conflict", "abs_sentiment", "abs_vader_compound", "disagree_count", "has_disagree",
    "exclamations", "questions", "all_caps_ratio", "comment_count", "vote_total",  
    "post_length",
    "first_person_count", "second_person_count",
    "first_person_ratio", "second_person_ratio"
]

use_cols = [TEXT_COL, LABEL_COL] + NUM_COLS
df_model = df[use_cols].dropna().copy()

df_model.head()

Unnamed: 0,text_clean,controversial_flag,hour,day_of_week,is_weekend,comment_upvote_ratio,sentiment,vader_neg,vader_neu,vader_pos,...,exclamations,questions,all_caps_ratio,comment_count,vote_total,post_length,first_person_count,second_person_count,first_person_ratio,second_person_ratio
0,hiker rescued after being stuck in quicksand f...,0,19,4,0,0.228758,0.0,0.168,0.693,0.139,...,0.0,0.0,0.0,35,152,17,0,0,0.0,0.0
1,eu agrees to indefinitely freeze russian asset...,0,18,4,0,0.027724,0.0,0.164,0.526,0.309,...,0.0,0.0,0.0,43,1550,12,0,0,0.0,0.0
2,health experts slam possible fda ‘black box’ w...,0,18,4,0,0.121331,-0.083333,0.357,0.643,0.0,...,0.0,0.0,0.0,62,510,11,0,0,0.0,0.0
3,iran arrests nobel peace prize laureate narges...,0,18,4,0,0.041284,0.0,0.156,0.323,0.522,...,0.0,0.0,0.0,9,217,10,0,0,0.0,0.0
4,fired michigan football coach sherrone moore i...,0,17,4,0,0.108262,0.0,0.375,0.625,0.0,...,0.0,0.0,0.0,38,350,11,0,0,0.0,0.0


In [64]:
X_train_df, X_test_df = train_test_split(
    df_model,
    test_size=0.2,
    random_state=42,
    stratify=df_model[LABEL_COL]
)

y_train = X_train_df[LABEL_COL].astype(int).values
y_test  = X_test_df[LABEL_COL].astype(int).values

tfidf = TfidfVectorizer(
    max_features=8000,
    ngram_range=(1, 2),
    min_df=2
)

X_train_text = tfidf.fit_transform(X_train_df[TEXT_COL])
X_test_text  = tfidf.transform(X_test_df[TEXT_COL])


In [65]:
X_train_num = csr_matrix(X_train_df[NUM_COLS].astype(float).values)
X_test_num  = csr_matrix(X_test_df[NUM_COLS].astype(float).values)

In [66]:
X_train = hstack([X_train_text, X_train_num])
X_test  = hstack([X_test_text, X_test_num])


In [67]:
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=5,
    class_weight={0: 1, 1: 5},
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

0,1,2
,n_estimators,400
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [68]:
pred = rf.predict(X_test)

print("Confusion matrix:\n", confusion_matrix(y_test, pred))
print("\nReport:\n", classification_report(y_test, pred, digits=3))


Confusion matrix:
 [[728 409]
 [222  82]]

Report:
               precision    recall  f1-score   support

           0      0.766     0.640     0.698      1137
           1      0.167     0.270     0.206       304

    accuracy                          0.562      1441
   macro avg      0.467     0.455     0.452      1441
weighted avg      0.640     0.562     0.594      1441



In [60]:
for t in [0.5, 0.4, 0.3, 0.2, 0.1]:
    preds = (probs >= t).astype(int)
    print(f"\nTHRESHOLD = {t}")
    print(confusion_matrix(y_test, preds))
    print(classification_report(y_test, preds, digits=3))



THRESHOLD = 0.5
[[883 254]
 [285  19]]
              precision    recall  f1-score   support

           0      0.756     0.777     0.766      1137
           1      0.070     0.062     0.066       304

    accuracy                          0.626      1441
   macro avg      0.413     0.420     0.416      1441
weighted avg      0.611     0.626     0.618      1441


THRESHOLD = 0.4
[[810 327]
 [265  39]]
              precision    recall  f1-score   support

           0      0.753     0.712     0.732      1137
           1      0.107     0.128     0.116       304

    accuracy                          0.589      1441
   macro avg      0.430     0.420     0.424      1441
weighted avg      0.617     0.589     0.602      1441


THRESHOLD = 0.3
[[773 364]
 [236  68]]
              precision    recall  f1-score   support

           0      0.766     0.680     0.720      1137
           1      0.157     0.224     0.185       304

    accuracy                          0.584      1441
   macro

In [42]:
# Despite extensive feature engineering and threshold tuning, the model demonstrates that controversial comments are difficult to identify from textual and sentiment features alone. Recall can be increased substantially by lowering the decision threshold, but this comes at the cost of precision, indicating that controversy is not a strongly separable class under content-only representations.

In [None]:
# Adding more features based on Cornell paper (Hessel and Lee)