In [118]:
import os
import re
import json
import pandas as pd
import numpy as np

In [134]:
df = pd.read_csv("../../yikyak_metadata.csv")

In [133]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from scipy.sparse import hstack, csr_matrix

In [135]:
df.head()

Unnamed: 0,id,text,created_at,vote_total,comment_count,alias,group_id,index_code,text_clean,text_length,...,has_disagree,conflict_count,has_conflict,exclamations,questions,all_caps_ratio,vader_neg,vader_neu,vader_pos,vader_compound
0,d50e6e42-323e-404b-a349-bec42e614b19,"Jarvis, Iâ€™m running low on Yakarma",2025-12-14 06:32:26.933000+00:00,-3,3,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,Fpp9kuO3,"jarvis, iâ€™m running low on yakarma",34,...,0,0.0,0.0,0.0,0.0,0.0,0.296,0.704,0.0,-0.2732
1,c2a438d4-eac2-4a2b-975b-bdf3930c809b,Pray for Brown ðŸ¤Ž,2025-12-14 05:08:52.476000+00:00,21,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,2JDyMyq5,pray for brown brown_heart,16,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.566,0.434,0.3182
2,4e015ca3-970e-420e-bf09-f5c115791696,The bits are on a generational run right now b...,2025-12-14 05:02:41.856000+00:00,1,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,R3h2lBvx,the bits are on a generational run right now b...,100,...,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,91daece4-43c6-4ea0-94b3-b5d2d0c327ae,Thank god Iâ€™m moving out rn ðŸ˜°,2025-12-14 04:33:28.936000+00:00,3,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,sJ3N2nZO,thank god iâ€™m moving out rn anxious_face_with_...,29,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.521,0.479,0.5574
4,9b2de0fe-11d2-465f-80b5-4a996d8d3c5f,BRING BACK TEA APP. BRING BACK TEA APP. BRING ...,2025-12-14 04:02:22.066000+00:00,20,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,7Jx4kSo0,bring back tea app. bring back tea app. bring ...,86,...,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [136]:
print(df.shape)

(19126, 29)


In [137]:
TEXT_COL = "text_clean"
LABEL_COL = "high_engagement"
# Set aside numeric features
NUM_COLS = [
     "text_length", "created_hour", "first_person_count", "second_person_count", "first_person_ratio", "second_person_ratio",
    "disagree_count", "has_disagree", "conflict_count", "has_conflict", "exclamations", "questions", "vader_neg", "vader_neu", "vader_pos", "vader_compound" 
]

use_cols = [TEXT_COL, LABEL_COL] + NUM_COLS
df_model = df[use_cols].dropna().copy()

df_model.head()

Unnamed: 0,text_clean,high_engagement,text_length,created_hour,first_person_count,second_person_count,first_person_ratio,second_person_ratio,disagree_count,has_disagree,conflict_count,has_conflict,exclamations,questions,vader_neg,vader_neu,vader_pos,vader_compound
0,"jarvis, iâ€™m running low on yakarma",0,34,6,1,0,0.028571,0.0,0,0,0.0,0.0,0.0,0.0,0.296,0.704,0.0,-0.2732
1,pray for brown brown_heart,0,16,5,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.566,0.434,0.3182
2,the bits are on a generational run right now b...,0,100,5,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,thank god iâ€™m moving out rn anxious_face_with_...,0,29,4,1,0,0.033333,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.521,0.479,0.5574
4,bring back tea app. bring back tea app. bring ...,0,86,4,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [138]:
# Replace infinities with NaN, then fill NaN
df_model[NUM_COLS] = df_model[NUM_COLS].replace([np.inf, -np.inf], np.nan)
df_model[NUM_COLS] = df_model[NUM_COLS].fillna(0)


In [139]:

X = df.drop(columns=["high_engagement"])
y = df["high_engagement"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [140]:
# ---- UNDERSAMPLING BLOCK ----
train_df = X_train.copy()
train_df["high_engagement"] = y_train.values

df_pos = train_df[train_df["high_engagement"] == 1]
df_neg = train_df[train_df["high_engagement"] == 0]

ratio = 4  # 4 negatives per positive
df_neg_under = df_neg.sample(
    n=ratio * len(df_pos),
    random_state=42
)

train_under = pd.concat([df_pos, df_neg_under]).sample(
    frac=1,
    random_state=42
)

X_train_under = train_under.drop(columns=["high_engagement"])
y_train_under = train_under["high_engagement"]
# -----------------------------


In [141]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    stop_words="english"
)

In [142]:
# After undersampling:
X_train_under_df = X_train_under.copy()

# --- TEXT FEATURES (recompute for undersampled train) ---
# assuming your text column is called "text" (change if yours differs)
X_train_text_under = vectorizer.fit_transform(X_train_under_df["text"])
X_test_text = vectorizer.transform(X_test_df["text_clean"])


In [143]:
X_train_under_df = X_train_under.copy()


In [144]:
# numeric features
X_train_num = csr_matrix(X_train_under_df[NUM_COLS].astype(float).values)
X_test_num  = csr_matrix(X_test_df[NUM_COLS].astype(float).values)

# combine text + numeric
X_train = hstack([X_train_text_under, X_train_num])
X_test  = hstack([X_test_text, X_test_num])


In [145]:
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=5,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train_under)

0,1,2
,n_estimators,400
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [147]:
print("X_test rows:", X_test.shape[0])
print("y_test rows:", len(y_test))
print("pred rows:", len(pred))


X_test rows: 3825
y_test rows: 3826
pred rows: 3825


In [149]:
# safest: rebuild y_test from the original y and X_test_df index
y_test_aligned = y.loc[X_test_df.index]


In [151]:
pred = rf.predict(X_test)

print("Confusion matrix:\n", confusion_matrix(y_test_aligned, pred))
print("\nReport:\n", classification_report(y_test_aligned, pred, digits=3))


Confusion matrix:
 [[2692  751]
 [ 189  193]]

Report:
               precision    recall  f1-score   support

           0      0.934     0.782     0.851      3443
           1      0.204     0.505     0.291       382

    accuracy                          0.754      3825
   macro avg      0.569     0.644     0.571      3825
weighted avg      0.861     0.754     0.795      3825



In [108]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=500,
    class_weight="balanced",          # or "balanced_subsample"
    random_state=42,
    n_jobs=-1
)

clf.fit(X_train, y_train)

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [109]:
pred = clf.predict(X_test)

print("Confusion matrix:\n", confusion_matrix(y_test, pred))
print("\nReport:\n", classification_report(y_test, pred, digits=3))


Confusion matrix:
 [[3440    3]
 [ 381    1]]

Report:
               precision    recall  f1-score   support

           0      0.900     0.999     0.947      3443
           1      0.250     0.003     0.005       382

    accuracy                          0.900      3825
   macro avg      0.575     0.501     0.476      3825
weighted avg      0.835     0.900     0.853      3825



In [110]:
from imblearn.ensemble import BalancedRandomForestClassifier

clf = BalancedRandomForestClassifier(
    n_estimators=500,
    random_state=42,
    n_jobs=-1
)

clf.fit(X_train, y_train)

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


In [111]:
pred = clf.predict(X_test)

print("Confusion matrix:\n", confusion_matrix(y_test, pred))
print("\nReport:\n", classification_report(y_test, pred, digits=3))


Confusion matrix:
 [[3001  442]
 [ 256  126]]

Report:
               precision    recall  f1-score   support

           0      0.921     0.872     0.896      3443
           1      0.222     0.330     0.265       382

    accuracy                          0.818      3825
   macro avg      0.572     0.601     0.581      3825
weighted avg      0.852     0.818     0.833      3825

