In [2]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
from textblob import TextBlob
import re
import emoji

In [3]:
data = []

with open("../more_yikyak_posts.jsonl", "r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue  # skip blank lines
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Skipping bad JSON on line {i}: {e}")


# Cleaning data
#df.head()

df = pd.read_json("../more_yikyak_posts.jsonl", lines=True)
LABEL_COL = "id" 
TEXT_COL = "text"
df = df[df[LABEL_COL].notna() & df[TEXT_COL].notna()]


def preprocess_text(t: str) -> str:

    t = emoji.demojize(t, delimiters=(" ", " ")) #convert emojis to text
    t = t.lower()
    
    # remove URLs
    t = re.sub(r"http\S+|www\S+", "", t)
    
    # remove user mentions (Reddit / Twitter style)
    t = re.sub(r"u\/\w+|@\w+", "", t)
    
    # remove extra whitespace
    t = re.sub(r"\s+", " ", t).strip()
    
    return t


df["text_clean"] = df[TEXT_COL].apply(preprocess_text)


In [4]:
df["text_length"] = df["text"].apply(len)
df["comment_ratio"] = df["comment_count"] / (df["vote_total"] + 1)
threshold = df["vote_total"].quantile(0.90)
df["high_engagement"] = (df["vote_total"] >= threshold).astype(int)
df["created_at"] = pd.to_datetime(df["created_at"])
df["created_hour"] = df["created_at"].dt.hour
df["created_day"] = df["created_at"].dt.date

Unnamed: 0,id,text,created_at,vote_total,comment_count,alias,group_id,index_code,text_clean,text_length,comment_ratio,high_engagement,created_hour,created_day
0,d50e6e42-323e-404b-a349-bec42e614b19,"Jarvis, I‚Äôm running low on Yakarma",2025-12-14 06:32:26.933000+00:00,-3,3,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,Fpp9kuO3,"jarvis, i‚Äôm running low on yakarma",34,-1.5,0,6,2025-12-14
1,c2a438d4-eac2-4a2b-975b-bdf3930c809b,Pray for Brown ü§é,2025-12-14 05:08:52.476000+00:00,21,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,2JDyMyq5,pray for brown brown_heart,16,0.0,0,5,2025-12-14
2,4e015ca3-970e-420e-bf09-f5c115791696,The bits are on a generational run right now b...,2025-12-14 05:02:41.856000+00:00,1,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,R3h2lBvx,the bits are on a generational run right now b...,100,0.0,0,5,2025-12-14
3,91daece4-43c6-4ea0-94b3-b5d2d0c327ae,Thank god I‚Äôm moving out rn üò∞,2025-12-14 04:33:28.936000+00:00,3,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,sJ3N2nZO,thank god i‚Äôm moving out rn anxious_face_with_...,29,0.0,0,4,2025-12-14
4,9b2de0fe-11d2-465f-80b5-4a996d8d3c5f,BRING BACK TEA APP. BRING BACK TEA APP. BRING ...,2025-12-14 04:02:22.066000+00:00,20,0,Anonymous,1fcad7b1-fce2-4ae1-bd48-bd1917b62d98,7Jx4kSo0,bring back tea app. bring back tea app. bring ...,86,0.0,0,4,2025-12-14


In [5]:

df["first_person_count"] = df["text_clean"].str.count(
    r"\b(i|me|my|mine|we|us|our|ours)\b"
)

df["second_person_count"] = df["text_clean"].str.count(
    r"\b(you|your|yours|u)\b"
)

df["first_person_ratio"] = df["first_person_count"] / (df["text_length"] + 1)
df["second_person_ratio"] = df["second_person_count"] / (df["text_length"] + 1)

print(df.shape)


(19558, 18)


In [6]:
DISAGREE_WORDS = [
    "disagree", "wrong", "false", "misleading",
    "no", "not", "never", "nonsense", "ridiculous"
]

def disagreement_features(text):
    tokens = text.split()
    count = sum(w in tokens for w in DISAGREE_WORDS)
    return pd.Series({
        "disagree_count": count,
        "has_disagree": int(count > 0)
    })

disagree_df = df["text_clean"].apply(disagreement_features)
df = pd.concat([df, disagree_df], axis=1)

In [7]:
# Adding conflict word count
CONFLICT_WORDS = [
    "but", "however", "actually", "wrong", "disagree",
    "no", "not", "never", "false"
]

def conflict_features(text):
    tokens = text.split()
    return pd.Series({
        "conflict_count": sum(t in CONFLICT_WORDS for t in tokens),
        "has_conflict": int(any(t in CONFLICT_WORDS for t in tokens)),
        "exclamations": text.count("!"),
        "questions": text.count("?"),
        "all_caps_ratio": sum(w.isupper() for w in tokens) / (len(tokens) + 1)
    })

conflict_df = df["text_clean"].apply(conflict_features)
df = pd.concat([df, conflict_df], axis=1)


In [8]:
import nltk
nltk.download("vader_lexicon")
from nltk.sentiment import SentimentIntensityAnalyzer
vader = SentimentIntensityAnalyzer()

def vader_features(text):
    scores = vader.polarity_scores(text)
    return pd.Series({
        "vader_neg": scores["neg"],
        "vader_neu": scores["neu"],
        "vader_pos": scores["pos"],
        "vader_compound": scores["compound"],
    })

vader_df = df["text_clean"].apply(vader_features)
df = pd.concat([df, vader_df], axis=1)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/DAVIDSON/eldevulapally/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [10]:
# Dropping no text posts
df = df[df["text"].notna() & (df["text"].str.strip() != "")]

(19232, 29)


In [11]:
# Check duplicates in raw text
df["text"].duplicated().any()

# Or check duplicates in cleaned text
df["text_clean"].duplicated().any()


np.True_

In [12]:
df["text_clean"].duplicated().sum()


np.int64(106)

In [13]:
df_no_dupes = df.drop_duplicates(subset=["text_clean"], keep="first")


In [14]:
df = df_no_dupes

In [15]:


# Separate classes
df_0 = df[df["high_engagement"] == 0]
df_1 = df[df["high_engagement"] == 1]

# Undersample class 0 to match class 1 size
df_0_under = df_0.sample(n=len(df_1), random_state=42)

# Combine and shuffle
df_under = pd.concat([df_0_under, df_1]).sample(frac=1, random_state=42)


In [16]:
# --- Burstiness features (past 2 hours) ---
# Put this AFTER: df["created_at"] = pd.to_datetime(df["created_at"])

# Ensure timezone handling is consistent
df["created_at"] = pd.to_datetime(df["created_at"], utc=True, errors="coerce")
df = df.dropna(subset=["created_at"])

# Sort so rolling/shift only uses the past (prevents leakage)
df = df.sort_values("created_at").reset_index(drop=True)

# Use a time index for rolling windows
df = df.set_index("created_at")

WINDOW = "2h"

# 1) Global activity in prior 2 hours
# rolling count includes the current row, so subtract 1 to get "previous"
df["posts_prev_2h_all"] = df["id"].rolling(WINDOW).count() - 1
df["posts_prev_2h_all"] = df["posts_prev_2h_all"].clip(lower=0).fillna(0)

# 2) Group activity in prior 2 hours
# rolling per group, again subtract 1 to exclude current post
df["posts_prev_2h_group"] = (
    df.groupby("group_id")["id"]
      .rolling(WINDOW)
      .count()
      .reset_index(level=0, drop=True)
      - 1
)
df["posts_prev_2h_group"] = df["posts_prev_2h_group"].clip(lower=0).fillna(0)

# 3) Relative burstiness: "is the group unusually busy compared to the whole app?"
df["rel_posts_prev_2h"] = df["posts_prev_2h_group"] / (df["posts_prev_2h_all"] + 1)

# 4) Simple "burst flag" within each group
# Compare current 2h activity to group's typical 2h activity (rolling mean/std over last 30 days)
# If you don't have a full year in every group, this still behaves fine due to min_periods.
BASELINE = "30d"
grp_roll_mean = (
    df.groupby("group_id")["posts_prev_2h_group"]
      .rolling(BASELINE, min_periods=50)
      .mean()
      .reset_index(level=0, drop=True)
)
grp_roll_std = (
    df.groupby("group_id")["posts_prev_2h_group"]
      .rolling(BASELINE, min_periods=50)
      .std()
      .reset_index(level=0, drop=True)
)

df["burst_z_group"] = (df["posts_prev_2h_group"] - grp_roll_mean) / (grp_roll_std + 1e-6)
df["burst_z_group"] = df["burst_z_group"].replace([np.inf, -np.inf], 0).fillna(0)

# Flag: 1 means "unusually busy right now"
df["burst_flag_group"] = (df["burst_z_group"] >= 1.0).astype(int)

# Return to normal index for saving
df = df.reset_index()


In [17]:
df.to_csv("../yikyak_metadata.csv", index=False)

In [25]:
# --- EDA PLOTS (optional) ---
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

def make_eda_plots(df: pd.DataFrame, out_dir: str = "plots", sample_n: int = 3000):
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    print("Saving plots to:", out.resolve())


    # 1) Correlation heatmap (numeric columns only)
    num = df.select_dtypes(include="number")
    if num.shape[1] >= 2:
        corr = num.corr(numeric_only=True)

        plt.figure(figsize=(8, 5))
        sns.heatmap(corr, annot=True, fmt=".2f", cmap="viridis")
        plt.title("Correlation heatmap (numeric features)")
        plt.tight_layout()
        plt.savefig(out / "corr_heatmap.png", dpi=200)
        plt.close()

    # 2) Pairplot 
    cols = [c for c in ["vote_total", "comment_count", "text_length", "comment_ratio"] if c in df.columns]
    if len(cols) >= 2:
        df_s = df[cols].dropna()
        if len(df_s) > sample_n:
            df_s = df_s.sample(sample_n, random_state=42)

        g = sns.pairplot(df_s, diag_kind="hist")
        g.fig.suptitle("Pairplot (sampled)", y=1.02)
        g.savefig(out / "pairplot.png", dpi=200)
        plt.close("all")

    # 3) Bar plots like your ‚Äúmean Answers/Views by Tag‚Äù
    # Example: mean vote_total and mean comment_count by created_hour
    if "created_hour" in df.columns:
        means = df.groupby("created_hour")[["vote_total", "comment_count"]].mean(numeric_only=True)

        plt.figure(figsize=(10, 4))
        means["comment_count"].plot(kind="bar")
        plt.ylabel("Mean comment_count")
        plt.xlabel("created_hour")
        plt.tight_layout()
        plt.savefig(out / "mean_comment_count_by_hour.png", dpi=200)
        plt.close()

        plt.figure(figsize=(10, 4))
        means["vote_total"].plot(kind="bar")
        plt.ylabel("Mean vote_total")
        plt.xlabel("created_hour")
        plt.tight_layout()
        plt.savefig(out / "mean_vote_total_by_hour.png", dpi=200)
        plt.close()

In [26]:
 make_eda_plots(df)

Saving plots to: /home/DAVIDSON/eldevulapally/Workspace/CSC371/final_ML_fall25/data_cleaning/plots


In [27]:
for c in ["vote_total", "comment_count", "posts_prev_2h_all"]:
    df[f"log_{c}"] = np.log1p(df[c])

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [28]:
df["comment_ratio"] = df["comment_count"] / (df["vote_total"] + 1)
df["comment_ratio"] = df["comment_ratio"].clip(0, 5)


In [30]:
for c in ["vote_total", "comment_count", "posts_prev_2h_all"]:
    safe = df[c].clip(lower=0)        # remove negatives
    df[f"log_{c}"] = np.log1p(safe)


In [31]:
df.to_csv("../yikyak_metadata.csv", index=False)