In [10]:
import pandas as pd
import re

In [11]:
# Load the raw r/wallstreetbets export (path is relative to the kernel's
# working directory). The bare name on the last line renders a preview.
wsb_csv_path = "data/wsb.csv"
wsb = pd.read_csv(wsb_csv_path)
wsb

Unnamed: 0,register_index,post_id,comment_id,author,datetime,title,url,score,comments,text,author_post_karma,tag
0,14b78hkjoe86nf,14b78hk,joe86nf,scott_jr,2023-06-16 20:36:55,,,1.0,,Watch til 1 10,32102.0,Meme
1,14b71m2post,14b71m2,,merakibret,2023-06-16 20:24:01,I had my first ever big success with options t...,https://www.reddit.com/r/wallstreetbets/commen...,8.0,6.0,Entered an Iron Condor on ADBE yesterday at 45...,343.0,Gain
2,14b71m2joe6du9,14b71m2,joe6du9,VisualMod,2023-06-16 20:24:07,,,1.0,,User Report Tota...,725083.0,Gain
3,14b71m2joe6een,14b71m2,joe6een,VisualMod,2023-06-16 20:24:13,,,2.0,,That was a very wise move,725083.0,Gain
4,14b71m2joe7yy4,14b71m2,joe7yy4,DreamcatcherEgg,2023-06-16 20:35:23,,,2.0,,All you have to do is repeat this same winning...,6088.0,Gain
...,...,...,...,...,...,...,...,...,...,...,...,...
3033535,1j96owemhd8ajs,1j96owe,mhd8ajs,jarail,2025-03-12 11:25:28,,,4.0,,Hopefully he made the whole story up,,Discussion
3033536,1j96owemhcapeb,1j96owe,mhcapeb,South_Age974,2025-03-12 05:46:48,,,2.0,,holy f,,Discussion
3033537,1j96owemhjt5np,1j96owe,mhjt5np,PickinLosers,2025-03-13 11:49:05,,,2.0,,I like to call them PDFs public domain fries,,Discussion
3033538,1j96owemhex8ls,1j96owe,mhex8ls,The_Whackest,2025-03-12 17:04:15,,,1.0,,Singsongy Some make you laugh and others ma...,,Discussion


In [12]:
# Schema, dtypes and memory footprint (info() prints directly and returns None).
wsb.info()
# Only the final expression of a cell is rendered automatically, so a bare
# `wsb.head()` in the middle of the cell is silently discarded; display it.
display(wsb.head())
wsb.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3033540 entries, 0 to 3033539
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   register_index     object 
 1   post_id            object 
 2   comment_id         object 
 3   author             object 
 4   datetime           object 
 5   title              object 
 6   url                object 
 7   score              float64
 8   comments           float64
 9   text               object 
 10  author_post_karma  float64
 11  tag                object 
dtypes: float64(3), object(9)
memory usage: 277.7+ MB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3033540 entries, 0 to 3033539
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   register_index     object 
 1   post_id            object 
 2   comment_id         object 
 3   author             object 
 4   datetime           object 
 5   title              object 
 6   url                object 
 7   score              float64
 8   comments           float64
 9   text               object 
 10  author_post_karma  float64
 11  tag                object 
dtypes: float64(3), object(9)
memory usage: 277.7+ MB


(3033540, 12)

In [13]:
# Clean and normalize dataset

# drop unnecessary columns; errors="ignore" lets the cell be re-run after
# register_index is already gone (a plain drop would raise KeyError)
wsb = wsb.drop(columns=["register_index"], errors="ignore")

# convert datetime column to datetime type and normalize to date only
# (values that cannot be parsed become NaT because of errors="coerce")
wsb["datetime"] = pd.to_datetime(wsb["datetime"], errors="coerce").dt.normalize()

# convert numeric columns to int type and fill na with 0
numeric_cols = ["score", "comments", "author_post_karma"]
for col in numeric_cols:
    wsb[col] = pd.to_numeric(wsb[col], errors="coerce")
    wsb[col] = wsb[col].fillna(0).astype(int)

# clean and normalize tag column: missing -> "unknown", trimmed, lower-cased,
# whitespace runs replaced by "_". The cast to object is a no-op on the first
# run; on a re-run the column is already categorical, and filling with a label
# that may not be an existing category would otherwise fail.
wsb["tag"] = (
    wsb["tag"]
    .astype(object)
    .fillna("unknown")
    .str.strip()
    .str.lower()
    .str.replace(r"\s+", "_", regex=True)
)
wsb["tag"] = wsb["tag"].astype("category")

# identify post or comment: rows without a comment_id are treated as posts
wsb["is_post"] = wsb["comment_id"].isna()

wsb.info()
# a bare `wsb.head()` here would be discarded (only the last expression of a
# cell is rendered), so display it explicitly
display(wsb.head())
wsb.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3033540 entries, 0 to 3033539
Data columns (total 12 columns):
 #   Column             Dtype         
---  ------             -----         
 0   post_id            object        
 1   comment_id         object        
 2   author             object        
 3   datetime           datetime64[ns]
 4   title              object        
 5   url                object        
 6   score              int64         
 7   comments           int64         
 8   text               object        
 9   author_post_karma  int64         
 10  tag                category      
 11  is_post            bool          
dtypes: bool(1), category(1), datetime64[ns](1), int64(3), object(6)
memory usage: 237.2+ MB


(3033540, 12)

In [14]:
# basic filtering / denoising

# remove obvious bot/moderator authors
bot_users = ["VisualMod", "AutoModerator"]
is_bot = wsb["author"].isin(bot_users)
wsb = wsb.loc[~is_bot]

# drop rows with deleted/empty content
garbage_tokens = {"", "[deleted]", "[removed]"}

# missing values count as empty strings; surrounding whitespace is ignored
title_clean = wsb["title"].fillna("").str.strip()
text_clean = wsb["text"].fillna("").str.strip()
title_garbage, text_garbage = (
    column.isin(garbage_tokens) for column in (title_clean, text_clean)
)

post_mask = wsb["is_post"]
comment_mask = ~post_mask

# A row is dropped when its text is garbage and it is either a comment or a
# post whose title is garbage too. This is the factored form of
# (post & title_garbage & text_garbage) | (comment & text_garbage).
drop_mask = text_garbage & (comment_mask | title_garbage)
wsb = wsb.loc[~drop_mask]

wsb.shape

(2956091, 12)

In [15]:
# light initial text cleaning
# build canonical text field for NLP

# missing titles/bodies become empty strings so concatenation never yields NaN
title_component = wsb["title"].fillna("").astype(str)
text_component = wsb["text"].fillna("").astype(str)

# comments use the body only; posts use title + blank line + body
comment_raw_text = text_component.str.strip()
post_raw_text = (title_component + "\n\n" + text_component).str.strip()

# start from the comment text and swap in the post text on post rows
wsb["raw_text"] = comment_raw_text.mask(wsb["is_post"], post_raw_text)

# discard rows whose canonical text ended up empty
wsb = wsb.loc[wsb["raw_text"].ne("")]

# light text normalization for downstream models
import re
url_pattern = re.compile(r"http\S+")

def clean_text_value(s: str) -> str:
    """Return ``s`` with URLs removed and whitespace collapsed.

    The value is coerced with ``str()`` first, so non-string inputs are
    cleaned via their string form. Anything matching ``http\\S+`` is deleted,
    newlines become spaces, every run of whitespace is squeezed to a single
    space, and leading/trailing whitespace is trimmed.
    """
    without_urls = url_pattern.sub("", str(s))
    single_line = without_urls.replace("\n", " ")
    collapsed = re.sub(r"\s+", " ", single_line)
    return collapsed.strip()

# normalise every raw_text value, then lower-case the whole column at once
normalized_text = wsb["raw_text"].apply(clean_text_value)
wsb["clean_text"] = normalized_text.str.lower()


wsb.shape

(2956091, 14)

In [None]:
# Simple regexes
# Cashtag: '$' followed by 1–5 letters. The negative lookahead rejects longer
# words outright ("$TSLAXYZ", "$dollars") instead of truncating them to a
# bogus 5-letter candidate.
cashtag_pattern = re.compile(r'\$[A-Za-z]{1,5}(?![A-Za-z])')
upper_pattern   = re.compile(r'\b[A-Z]{2,5}\b')  # 2–5 uppercase letters

def extract_candidate_tokens(text):
    """Collect ticker-like tokens from ``text``.

    Two kinds of token are gathered:
      * cashtags such as ``$TSLA`` / ``$gme`` (the ``$`` is stripped and the
        symbol upper-cased);
      * bare all-caps words of 2–5 letters such as ``NVDA``.

    Parameters
    ----------
    text : object
        Value to scan; anything that is not a ``str`` yields an empty list.

    Returns
    -------
    list of str
        Unique upper-case candidates in sorted order. Sorting keeps the output
        reproducible, because set iteration order for strings can differ
        between interpreter runs.
    """
    if not isinstance(text, str):
        return []

    # $TSLA, $GME -> strip the leading '$' and normalise case
    cands = {m[1:].upper() for m in cashtag_pattern.findall(text)}

    # TSLA, GME, NVDA (bare tickers) -- already upper-case by construction
    cands.update(upper_pattern.findall(text))

    return sorted(cands)

# one list of candidate ticker tokens per row, taken from the un-lowercased text
wsb['candidate_tokens'] = wsb['raw_text'].map(extract_candidate_tokens)


