In [34]:
import pandas as pd
import re
import yfinance as yf
from tqdm.auto import tqdm
import json
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    precision_score,
    recall_score
)
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.compose import ColumnTransformer

In [None]:
# resave all the pdfs again becuase added EDIT to manual remove list

In [19]:
wsb = pd.read_csv("data/wsb.csv")
wsb

Unnamed: 0,register_index,post_id,comment_id,author,datetime,title,url,score,comments,text,author_post_karma,tag
0,14b78hkjoe86nf,14b78hk,joe86nf,scott_jr,2023-06-16 20:36:55,,,1.0,,Watch til 1 10,32102.0,Meme
1,14b71m2post,14b71m2,,merakibret,2023-06-16 20:24:01,I had my first ever big success with options t...,https://www.reddit.com/r/wallstreetbets/commen...,8.0,6.0,Entered an Iron Condor on ADBE yesterday at 45...,343.0,Gain
2,14b71m2joe6du9,14b71m2,joe6du9,VisualMod,2023-06-16 20:24:07,,,1.0,,User Report Tota...,725083.0,Gain
3,14b71m2joe6een,14b71m2,joe6een,VisualMod,2023-06-16 20:24:13,,,2.0,,That was a very wise move,725083.0,Gain
4,14b71m2joe7yy4,14b71m2,joe7yy4,DreamcatcherEgg,2023-06-16 20:35:23,,,2.0,,All you have to do is repeat this same winning...,6088.0,Gain
...,...,...,...,...,...,...,...,...,...,...,...,...
3033535,1j96owemhd8ajs,1j96owe,mhd8ajs,jarail,2025-03-12 11:25:28,,,4.0,,Hopefully he made the whole story up,,Discussion
3033536,1j96owemhcapeb,1j96owe,mhcapeb,South_Age974,2025-03-12 05:46:48,,,2.0,,holy f,,Discussion
3033537,1j96owemhjt5np,1j96owe,mhjt5np,PickinLosers,2025-03-13 11:49:05,,,2.0,,I like to call them PDFs public domain fries,,Discussion
3033538,1j96owemhex8ls,1j96owe,mhex8ls,The_Whackest,2025-03-12 17:04:15,,,1.0,,Singsongy Some make you laugh and others ma...,,Discussion


In [20]:
wsb.info()
wsb.head()
wsb.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3033540 entries, 0 to 3033539
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   register_index     object 
 1   post_id            object 
 2   comment_id         object 
 3   author             object 
 4   datetime           object 
 5   title              object 
 6   url                object 
 7   score              float64
 8   comments           float64
 9   text               object 
 10  author_post_karma  float64
 11  tag                object 
dtypes: float64(3), object(9)
memory usage: 277.7+ MB


(3033540, 12)

In [21]:
# Clean and normalize dataset

# drop uneccessary columns
wsb = wsb.drop(columns=["register_index"])

# convert datetime column to datetime type and normalize to date only
wsb["datetime"] = pd.to_datetime(wsb["datetime"], errors="coerce").dt.normalize()

# convert numeric columns to int type and fill na with 0
numeric_cols = ["score", "comments", "author_post_karma"]
for col in numeric_cols:
    wsb[col] = pd.to_numeric(wsb[col], errors="coerce")
    wsb[col] = wsb[col].fillna(0).astype(int)

# clean and normalize tag column
wsb["tag"] = (
    wsb["tag"].fillna("unknown").str.strip().str.lower().str.replace(r"\s+", "_", regex=True)
)
wsb["tag"] = wsb["tag"].astype("category")

# identify post or comment
wsb["is_post"] = wsb["comment_id"].isna()

wsb.info()
wsb.head()
wsb.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3033540 entries, 0 to 3033539
Data columns (total 12 columns):
 #   Column             Dtype         
---  ------             -----         
 0   post_id            object        
 1   comment_id         object        
 2   author             object        
 3   datetime           datetime64[ns]
 4   title              object        
 5   url                object        
 6   score              int64         
 7   comments           int64         
 8   text               object        
 9   author_post_karma  int64         
 10  tag                category      
 11  is_post            bool          
dtypes: bool(1), category(1), datetime64[ns](1), int64(3), object(6)
memory usage: 237.2+ MB


(3033540, 12)

In [22]:
# basic filtering / denoising

# remove obvious bot/moderator authors
bot_users = ["VisualMod", "AutoModerator"]
wsb = wsb[~wsb["author"].isin(bot_users)]

# drop rows with deleted/empty content
garbage_tokens = {"", "[deleted]", "[removed]"}
title_clean = wsb["title"].fillna("").str.strip()
text_clean = wsb["text"].fillna("").str.strip()

title_garbage = title_clean.isin(garbage_tokens)
text_garbage = text_clean.isin(garbage_tokens)

post_mask = wsb["is_post"]
comment_mask = ~wsb["is_post"]

drop_mask = (post_mask & title_garbage & text_garbage) | (comment_mask & text_garbage)
wsb = wsb[~drop_mask]

wsb.shape

(2956091, 12)

In [23]:
# light initial text cleaning
# build canonical text field for NLP
title_component = wsb["title"].fillna("").astype(str)
text_component = wsb["text"].fillna("").astype(str)

post_raw_text = (title_component + "\n\n" + text_component).str.strip()
comment_raw_text = text_component.str.strip()

wsb["raw_text"] = post_raw_text.where(wsb["is_post"], comment_raw_text)
wsb = wsb[wsb["raw_text"] != ""]

# light text normalization for downstream models
import re
url_pattern = re.compile(r"http\S+")

def clean_text_value(s: str) -> str:
    s = str(s)
    s = url_pattern.sub("", s)
    s = s.replace("\n", " ")
    s = re.sub(r"\s+", " ", s)
    return s.strip()

wsb["clean_text"] = wsb["raw_text"].map(clean_text_value).str.lower()


wsb.shape

(2956091, 14)

In [36]:
wsb = pd.read_csv('data/wsb_cleaned.csv')

In [37]:
# Simple regexes
cashtag_pattern = re.compile(r'\$[A-Za-z]{1,5}')
upper_pattern   = re.compile(r'\b[A-Z]{2,5}\b')  # 2–5 uppercase letters

def extract_candidate_tokens(text):
    if not isinstance(text, str):
        return []

    cands = set()

    # $TSLA, $GME
    for m in cashtag_pattern.findall(text):
        cands.add(m[1:].upper())  # strip '$'

    # TSLA, GME, NVDA (bare tickers)
    for m in upper_pattern.findall(text):
        cands.add(m.upper())

    return list(cands)

wsb['candidate_tokens'] = wsb['raw_text'].apply(extract_candidate_tokens)

tokens_exploded = wsb.explode('candidate_tokens')
tokens_exploded = tokens_exploded.dropna(subset=['candidate_tokens'])

token_counts = (
    tokens_exploded
    .groupby('candidate_tokens')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

token_counts.head(30)





Unnamed: 0,candidate_tokens,count
11132,NVDA,49561
381,AI,40492
16203,US,29507
11519,OP,26281
14563,SPY,25820
17298,WSB,19541
15743,TSLA,17708
578,AMD,13938
2545,CEO,10445
3621,DD,9184


In [38]:
# -------------------------
# 1. Filter candidate tokens
# -------------------------
min_count = 500  # as you used; adjust if needed
freq_candidates = (
    token_counts[token_counts['count'] >= min_count]['candidate_tokens']
    .astype(str)
    .tolist()
)

print(f"Number of freq_candidates (count >= {min_count}): {len(freq_candidates)}")

# -------------------------------------------
# 2. Build valid_tickers via yfinance (whitelist)
# -------------------------------------------
start_date = '2023-06-08'
end_date   = '2025-04-02'

valid_tickers_all = []

for sym in tqdm(freq_candidates, desc="Checking yfinance symbols"):
    try:
        data = yf.download(sym, start=start_date, end=end_date, progress=False)
        if not data.empty:
            valid_tickers_all.append(sym)
    except Exception:
        # Skip symbols that cause errors
        continue

valid_tickers_all = sorted(set(valid_tickers_all))
print(f"Number of symbols with real price data: {len(valid_tickers_all)}")

# --------------------------------------------------
# 3. Take the top 150 valid tickers by WSB frequency
# --------------------------------------------------
vc = (
    token_counts[token_counts['candidate_tokens'].isin(valid_tickers_all)]
    .copy()
    .sort_values('count', ascending=False)
)

top_200 = vc['candidate_tokens'].head(200).tolist()
print(f"Initial top_200 tickers (before manual removal): {len(top_200)}")

# You can inspect this manually if you want:
# print(top_150)

# --------------------------------------------------
# 4. Remove obviously ambiguous / non-stock symbols
#    (we prune the top_150 list instead of defining a
#     separate manual_keep list)
# --------------------------------------------------
manual_drop = {
    # Common words / pronouns / fillers
    "IT", "AM", "ARE", "FOR", "ALL", "GO", "NOW", "OR", "JUST", "YOU",
    "MORE", "TIME", "LOT", "WAY", "BACK", "DAY",

    # Macro / econ / generic finance
    "US", "USA", "USD", "CPI", "PPI", "EPS", "IRS",
    "IPO", "ITM", "PM", "EOD",

    # Crypto / FX / non-equity focus
    "BTC", "ETH", "SOL", "DOGE", "ADA", "XRP", "USDT", "USDC",

    # Slang / memes / platform lingo
    "YOLO", "WTF", "IMO", "MOON", "GL",

    # Generic financial/technical terms (not single equities)
    "IV", "ETF", "RSI",

    # Regions / places
    "EU", "UK", "NYC", "CA", "DC",

    # Ambiguous tickers that are almost always normal words or other concepts
    "AI", "OP", "DD", "EV", "LINK", "TV", "ON", "UP", "VR", "PC", "SO",
    "IP", "CC", "IQ", "OPEN", "BE", "CAN", "MS", "OS", "PT", "PDT",
    "COST", "DEI", "OUT", "HE", "UI", "PR", "AM", "AGI", "ICE",

    # Misc abbreviations that are usually not equity tickers in WSB text
    "EPS", "CFO", "AA", "GPT", "HERE", "LOVE", "ANY", "EDIT"
}


filtered_candidates = [t for t in top_200 if t not in manual_drop]
print(f"After manual_drop filtering: {len(filtered_candidates)} symbols")

# ---------------------------------------------------------
# 5. From these filtered candidates, keep the TOP 100 by
#    WSB frequency (if we have >= 100; else keep all)
# ---------------------------------------------------------
vc_filtered = vc[vc['candidate_tokens'].isin(filtered_candidates)].copy()
vc_filtered = vc_filtered.sort_values('count', ascending=False)

top_n = 100
final_tickers = vc_filtered['candidate_tokens'].head(top_n).tolist()
print(f"Final ticker universe size (up to {top_n}): {len(final_tickers)}")
print("Preview final_tickers:", final_tickers[:20])

# Save final universe to JSON for verification
valid_tickers_path = 'data/valid_tickers_top100.json'
with open(valid_tickers_path, 'w') as f:
    json.dump(final_tickers, f, indent=2)

print(f"Saved final tickers to {valid_tickers_path}")

valid_tickers_set = set(final_tickers)

# ------------------------------------------------
# 6. Re-extract tickers from WSB text using whitelist
# ------------------------------------------------
cashtag_pattern = re.compile(r'\$[A-Za-z]{1,5}')
upper_pattern   = re.compile(r'\b[A-Z]{2,5}\b')

def extract_tickers(text):
    if not isinstance(text, str):
        return []

    cands = set()

    # $TSLA/$GME style
    for m in cashtag_pattern.findall(text):
        cands.add(m[1:].upper())

    # TSLA/GME style (bare)
    for m in upper_pattern.findall(text):
        cands.add(m.upper())

    # keep only in the curated top-100 universe
    return [t for t in cands if t in valid_tickers_set]

wsb['tickers'] = wsb['raw_text'].apply(extract_tickers)

# Keep only rows that mention at least one of our final tickers
wsb_with_ticker = wsb[wsb['tickers'].str.len() > 0].copy()

# One row per (WSB row, ticker)
wsb_exploded = (
    wsb_with_ticker
    .explode('tickers')
    .rename(columns={'tickers': 'ticker'})
)

print("Exploded WSB shape:", wsb_exploded.shape)
print(wsb_exploded[['datetime', 'ticker']].head())

# Optional: sanity check ticker counts
print("\nTop 20 tickers after final filter:")
print(wsb_exploded['ticker'].value_counts().head(20))

#wsb_exploded.to_csv('data/wsb_exploded.csv', index=False)

Number of freq_candidates (count >= 500): 303


  data = yf.download(sym, start=start_date, end=end_date, progress=False)
  data = yf.download(sym, start=start_date, end=end_date, progress=False)
  data = yf.download(sym, start=start_date, end=end_date, progress=False)

1 Failed download:
['US']: YFPricesMissingError('possibly delisted; no price data found  (1d 2023-06-08 -> 2025-04-02)')
  data = yf.download(sym, start=start_date, end=end_date, progress=False)
  data = yf.download(sym, start=start_date, end=end_date, progress=False)
  data = yf.download(sym, start=start_date, end=end_date, progress=False)

1 Failed download:
['WSB']: YFPricesMissingError('possibly delisted; no price data found  (1d 2023-06-08 -> 2025-04-02)')
  data = yf.download(sym, start=start_date, end=end_date, progress=False)
  data = yf.download(sym, start=start_date, end=end_date, progress=False)
  data = yf.download(sym, start=start_date, end=end_date, progress=False)

1 Failed download:
['CEO']: YFTzMissingError('possibly delisted; no timezone found')
  d

Number of symbols with real price data: 161
Initial top_200 tickers (before manual removal): 161
After manual_drop filtering: 86 symbols
Final ticker universe size (up to 100): 86
Preview final_tickers: ['NVDA', 'SPY', 'TSLA', 'AMD', 'SMCI', 'QQQ', 'AAPL', 'MSFT', 'TSM', 'MSTR', 'AMC', 'PLTR', 'INTC', 'DJT', 'RH', 'META', 'ASTS', 'ARM', 'AMZN', 'MARA']
Saved final tickers to data/valid_tickers_top100.json
Exploded WSB shape: (244346, 17)
      datetime ticker
1   2023-06-16   ADBE
13  2023-06-16    SPY
26  2023-06-16     TD
30  2023-06-16    IWM
32  2023-06-16    SPY

Top 20 tickers after final filter:
ticker
NVDA    49561
SPY     25820
TSLA    17708
AMD     13938
SMCI     5930
QQQ      5655
AAPL     5422
MSFT     4839
TSM      4827
MSTR     4191
AMC      4121
PLTR     3992
INTC     3688
DJT      3673
RH       3511
META     3479
ASTS     3062
ARM      2828
AMZN     2714
MARA     2663
Name: count, dtype: int64


In [39]:
wsb_exploded.shape

(244346, 17)

In [40]:
# ---------------------------------------------
# 1. Load your final ticker universe from JSON
# ---------------------------------------------
tickers_path = 'data/valid_tickers_top100.json'  # adjust if different

with open(tickers_path, 'r') as f:
    final_tickers = json.load(f)

print(f"# tickers: {len(final_tickers)}")
print("Preview:", final_tickers[:10])

# ---------------------------------------------
# 2. Define date range
# ---------------------------------------------
start_date = '2023-06-01'
end_date   = '2025-04-02'

# ---------------------------------------------
# 3. Single multi-ticker download (WIDE, MultiIndex)
# ---------------------------------------------
raw = yf.download(
    final_tickers,
    start=start_date,
    end=end_date,
    auto_adjust=False,   # keep raw OHLC
    progress=False
)

# raw is wide with MultiIndex columns: (PriceField, Ticker)
print("Raw shape:", raw.shape)
print("Column level names:", raw.columns.names)
print(raw.head(3))

# ---------------------------------------------
# 4. Reshape to LONG: one row per (date, ticker)
# ---------------------------------------------
# Ensure column levels have names (should already be ['Price', 'Ticker'])
if raw.columns.names is None or len(raw.columns.names) != 2:
    # yfinance usually sets this, but just in case:
    raw.columns.names = ['Price', 'Ticker']

# Move 'Ticker' from column level to a column via stack
# After this:
#   index: Date
#   column: Price fields (Open, High, Low, Close, Adj Close, Volume)
#   plus a 'Ticker' column from stacking
prices_long = (
    raw
    .stack(level='Ticker')          # stack over ticker level → ticker becomes index level
    .reset_index()                  # turn index back into columns
    .rename(columns={'Date': 'datetime', 'Ticker': 'ticker'})
)

# Now columns should be: ['datetime','ticker','Open','High','Low','Close','Adj Close','Volume']
print("After stack → long shape:", prices_long.shape)
print(prices_long.head(3))
print("Columns after stack:", prices_long.columns.tolist())

# ---------------------------------------------
# 5. Clean column names & add 'date'
# ---------------------------------------------
prices_long = prices_long.rename(
    columns={
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'close',
        'Adj Close': 'adj_close',
        'Volume': 'volume'
    }
)

prices_long['datetime'] = pd.to_datetime(prices_long['datetime'])

# Sort for sanity
prices_long = prices_long.sort_values(['ticker', 'datetime']).reset_index(drop=True)

print("Final LONG price data shape:", prices_long.shape)
print(prices_long.head(5))
print("Final columns:", prices_long.columns.tolist())

out_path = 'data/prices_daily.csv'
prices_long.to_csv(out_path, index=False)

# tickers: 86
Preview: ['NVDA', 'SPY', 'TSLA', 'AMD', 'SMCI', 'QQQ', 'AAPL', 'MSFT', 'TSM', 'MSTR']
Raw shape: (460, 516)
Column level names: ['Price', 'Ticker']
Price        Adj Close                                                       \
Ticker            AAPL  ACHR        ADBE        AMC         AMD        AMZN   
Date                                                                          
2023-06-01  177.930145  2.99  426.750000  45.500000  119.470001  122.769997   
2023-06-02  178.779831  2.96  436.369995  45.500000  117.860001  124.250000   
2023-06-05  177.426254  3.00  434.179993  46.299999  117.930000  125.300003   

Price                                    ...    Volume                       \
Ticker         AR ARM        ASML  ASTS  ...      TLRY       TLT       TQQQ   
Date                                     ...                                  
2023-06-01  20.16 NaN  710.749329  5.60  ...  26101500  21040400  216759200   
2023-06-02  21.32 NaN  708.676025  5.62  ...  2

  .stack(level='Ticker')          # stack over ticker level → ticker becomes index level


In [41]:
prices_long.shape

(39122, 8)

In [42]:
# 0. Normalize datetime to day-level but KEEP column name 'datetime'
prices_long['datetime'] = pd.to_datetime(prices_long['datetime']).dt.normalize()
wsb_exploded['datetime'] = pd.to_datetime(wsb_exploded['datetime']).dt.normalize()

# ===================================================
# 1. PRICE-SIDE FEATURES: returns + next-day label
# ===================================================
prices_feat = prices_long.sort_values(['ticker', 'datetime']).copy()

# Daily return
prices_feat['return'] = prices_feat.groupby('ticker')['close'].pct_change()

# Next-day close and next-day return (per ticker)
prices_feat['next_close'] = prices_feat.groupby('ticker')['close'].shift(-1)
prices_feat['next_return'] = (
    (prices_feat['next_close'] - prices_feat['close']) / prices_feat['close']
)

# Big-move label
BIG_MOVE_THRESH = 0.05  # 5% threshold; adjust if you want
prices_feat['big_move'] = (
    prices_feat['next_return'].abs() >= BIG_MOVE_THRESH
).astype(int)

# Drop rows with no next-day info
prices_feat = prices_feat.dropna(subset=['next_return'])

price_cols = [
    'datetime',   # daily key
    'ticker',
    'close',
    'volume',
    'return',
    'next_return',
    'big_move'
]
prices_feat = prices_feat[price_cols]

print("Price feature shape:", prices_feat.shape)
print(prices_feat.head())

# ===================================================
# 2. WSB-SIDE NUMERIC FEATURES per (datetime, ticker)
# ===================================================
wsb_num = wsb_exploded.copy()

# is_post: posts have NaN comment_id, comments have non-null
if 'is_post' not in wsb_num.columns:
    wsb_num['is_post'] = wsb_num['comment_id'].isna()

# score numeric
wsb_num['score'] = pd.to_numeric(wsb_num['score'], errors='coerce')

group_cols = ['datetime', 'ticker']

agg_dict = {
    'score': ['count', 'sum', 'mean'],
    'is_post': 'mean',
    'author': pd.Series.nunique,
}

wsb_daily = (
    wsb_num
    .groupby(group_cols)
    .agg(agg_dict)
)

# Flatten columns
wsb_daily.columns = [
    '_'.join([c for c in col if c]) for col in wsb_daily.columns.ravel()
]
wsb_daily = wsb_daily.reset_index()

wsb_daily = wsb_daily.rename(columns={
    'score_count': 'mention_count',
    'score_sum': 'score_sum',
    'score_mean': 'score_mean',
    'is_post_mean': 'post_fraction',
    'author_nunique': 'unique_authors'
})

print("WSB numeric daily feature shape:", wsb_daily.shape)
print(wsb_daily.head())

# ===================================================
# 3. WSB TEXT FEATURES: doc_text per (datetime, ticker)
# ===================================================
wsb_text = wsb_exploded.copy()

# Find the text column
TEXT_COL_CANDIDATES = ['clean_text', 'text', 'raw_text']
for c in TEXT_COL_CANDIDATES:
    if c in wsb_text.columns:
        text_col = c
        break
else:
    raise ValueError("No text column found in wsb_exploded (expected one of: clean_text, text, raw_text)")

# Ensure string
wsb_text[text_col] = wsb_text[text_col].fillna('').astype(str)

wsb_text_daily = (
    wsb_text
    .groupby(['datetime', 'ticker'])[text_col]
    .apply(lambda s: ' '.join(s))
    .reset_index()
    .rename(columns={text_col: 'doc_text'})
)

print("WSB text daily shape:", wsb_text_daily.shape)
print(wsb_text_daily.head())

# ===================================================
# 4. MERGE: prices + WSB numeric + WSB text
# ===================================================
model_df = (
    prices_feat
    .merge(wsb_daily, on=['datetime', 'ticker'], how='inner')
    .merge(wsb_text_daily, on=['datetime', 'ticker'], how='inner')
)

print("Model DF shape:", model_df.shape)
print(model_df.head())
print("Big-move base rate:", model_df['big_move'].mean())


# out_path = 'data/modeling_df.csv'
# model_df.to_csv(out_path, index=False)


Price feature shape: (39036, 7)
Price   datetime ticker       close       volume    return  next_return  \
0     2023-06-01   AAPL  180.089996   68901800.0       NaN     0.004775   
1     2023-06-02   AAPL  180.949997   61996900.0  0.004775    -0.007571   
2     2023-06-05   AAPL  179.580002  121946500.0 -0.007571    -0.002060   
3     2023-06-06   AAPL  179.210007   64848400.0 -0.002060    -0.007756   
4     2023-06-07   AAPL  177.820007   61944600.0 -0.007756     0.015465   

Price  big_move  
0             0  
1             0  
2             0  
3             0  
4             0  
WSB numeric daily feature shape: (25691, 7)
    datetime ticker  mention_count  score_sum  score_mean  post_fraction  \
0 2023-06-08   ADBE              2         24        12.0            0.5   
1 2023-06-08   AMZN              1          7         7.0            0.0   
2 2023-06-08     AR              1         44        44.0            1.0   
3 2023-06-08   BABA              1          6         6.0    

In [43]:
# baseline models finally

In [44]:
# ---------------------------------------------------
# 0. Inspect / lightly clean model_df
#    Assumes model_df is already in memory
#    Columns used: datetime, ticker, big_move, return, volume,
#    mention_count, score_sum, score_mean, post_fraction, unique_authors
# ---------------------------------------------------

# Ensure datetime is datetime64
model_df['datetime'] = pd.to_datetime(model_df['datetime'])

# Log-transform volume to reduce skew, keep both if you want
model_df['log_volume'] = np.log1p(model_df['volume'])

# Fill NaNs in WSB features with 0 (no activity)
wsb_cols = ['mention_count', 'score_sum', 'score_mean', 'post_fraction', 'unique_authors']
for c in wsb_cols:
    if c in model_df.columns:
        model_df[c] = model_df[c].fillna(0)

# Drop any rows missing the core numeric features or label
core_numeric = ['return', 'log_volume'] + wsb_cols
model_df = model_df.dropna(subset=core_numeric + ['big_move'])

print("Model df after cleaning:", model_df.shape)

# ---------------------------------------------------
# 1. Time-based train/val/test split
# ---------------------------------------------------

def time_split(df, train_frac=0.7, val_frac=0.15):
    """
    Split by datetime (chronological). No shuffling.
    Returns train_df, val_df, test_df.
    """
    df = df.sort_values('datetime')
    unique_times = df['datetime'].unique()
    n = len(unique_times)
    
    train_cut = int(n * train_frac)
    val_cut   = int(n * (train_frac + val_frac))
    
    train_times = unique_times[:train_cut]
    val_times   = unique_times[train_cut:val_cut]
    test_times  = unique_times[val_cut:]
    
    train_df = df[df['datetime'].isin(train_times)].copy()
    val_df   = df[df['datetime'].isin(val_times)].copy()
    test_df  = df[df['datetime'].isin(test_times)].copy()
    
    print(f"Train dates: {train_times[0]} → {train_times[-1]}  ({len(train_times)} days)")
    print(f"Val dates:   {val_times[0]} → {val_times[-1]}      ({len(val_times)} days)")
    print(f"Test dates:  {test_times[0]} → {test_times[-1]}     ({len(test_times)} days)")
    
    return train_df, val_df, test_df

train_df, val_df, test_df = time_split(model_df)

print("Split sizes:",
      len(train_df), "train /",
      len(val_df), "val /",
      len(test_df), "test")

# ---------------------------------------------------
# 2. Helper: train + eval a logistic baseline on given feature list
# ---------------------------------------------------

def run_logistic_baseline(feature_cols, train_df, val_df, test_df, desc=""):
    print("\n" + "="*80)
    print(f"Baseline: {desc}")
    print("="*80)
    
    X_train = train_df[feature_cols].values
    y_train = train_df['big_move'].values.astype(int)
    
    X_val   = val_df[feature_cols].values
    y_val   = val_df['big_move'].values.astype(int)
    
    X_test  = test_df[feature_cols].values
    y_test  = test_df['big_move'].values.astype(int)
    
    # Pipeline: scale numeric features, then logistic regression
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(
            penalty='l2',
            C=1.0,
            class_weight='balanced',  # helps with class imbalance
            max_iter=1000,
            solver='lbfgs'
        ))
    ])
    
    pipe.fit(X_train, y_train)
    
    # Validation metrics
    val_probs = pipe.predict_proba(X_val)[:, 1]
    val_pred  = (val_probs >= 0.5).astype(int)
    
    def report_split(name, y_true, y_pred, y_prob):
        acc = accuracy_score(y_true, y_pred)
        f1  = f1_score(y_true, y_pred, zero_division=0)
        prec = precision_score(y_true, y_pred, zero_division=0)
        rec  = recall_score(y_true, y_pred, zero_division=0)
        try:
            auc = roc_auc_score(y_true, y_prob)
        except ValueError:
            auc = np.nan
        
        print(f"{name}:")
        print(f"  Accuracy : {acc:.4f}")
        print(f"  F1       : {f1:.4f}")
        print(f"  Precision: {prec:.4f}")
        print(f"  Recall   : {rec:.4f}")
        print(f"  AUC      : {auc:.4f}")
    
    print("\nValidation performance:")
    report_split("VAL", y_val, val_pred, val_probs)
    
    # Test metrics (final)
    test_probs = pipe.predict_proba(X_test)[:, 1]
    test_pred  = (test_probs >= 0.5).astype(int)
    
    print("\nTest performance:")
    report_split("TEST", y_test, test_pred, test_probs)
    
    return pipe

# ---------------------------------------------------
# 3. Baseline 0: dumb majority classifier (always 0)
# ---------------------------------------------------

print("\n" + "="*80)
print("Baseline 0: Always predict NO BIG MOVE")
print("="*80)

y_test = test_df['big_move'].values.astype(int)
y_pred0 = np.zeros_like(y_test)

acc0 = accuracy_score(y_test, y_pred0)
f1_0 = f1_score(y_test, y_pred0, zero_division=0)
prec0 = precision_score(y_test, y_pred0, zero_division=0)
rec0 = recall_score(y_test, y_pred0, zero_division=0)

print(f"TEST:")
print(f"  Accuracy : {acc0:.4f}")
print(f"  F1       : {f1_0:.4f}")
print(f"  Precision: {prec0:.4f}")
print(f"  Recall   : {rec0:.4f}")
print("  (AUC for this is undefined / trivial; model outputs constant score)")

# ---------------------------------------------------
# 4. Baseline 1: price-only logistic
# ---------------------------------------------------

price_features = ['return', 'log_volume']
pipe_price = run_logistic_baseline(
    feature_cols=price_features,
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    desc="Logistic (price-only)"
)

# ---------------------------------------------------
# 5. Baseline 2: WSB-metadata-only logistic
# ---------------------------------------------------

wsb_features = ['mention_count', 'score_sum', 'score_mean', 'post_fraction', 'unique_authors']
pipe_wsb = run_logistic_baseline(
    feature_cols=wsb_features,
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    desc="Logistic (WSB-only numeric)"
)

# ---------------------------------------------------
# 6. Baseline 3: price + WSB numeric logistic
# ---------------------------------------------------

combined_features = price_features + wsb_features
pipe_combined = run_logistic_baseline(
    feature_cols=combined_features,
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    desc="Logistic (price + WSB numeric)"
)


Model df after cleaning: (18096, 14)
Train dates: 2023-06-08 00:00:00 → 2024-07-08 00:00:00  (259 days)
Val dates:   2024-07-09 00:00:00 → 2024-11-27 00:00:00      (55 days)
Test dates:  2024-11-29 00:00:00 → 2025-03-31 00:00:00     (56 days)
Split sizes: 12645 train / 2801 val / 2650 test

Baseline 0: Always predict NO BIG MOVE
TEST:
  Accuracy : 0.8574
  F1       : 0.0000
  Precision: 0.0000
  Recall   : 0.0000
  (AUC for this is undefined / trivial; model outputs constant score)

Baseline: Logistic (price-only)

Validation performance:
VAL:
  Accuracy : 0.5637
  F1       : 0.2085
  Precision: 0.1348
  Recall   : 0.4600
  AUC      : 0.5422

Test performance:
TEST:
  Accuracy : 0.5325
  F1       : 0.2629
  Precision: 0.1696
  Recall   : 0.5847
  AUC      : 0.5993

Baseline: Logistic (WSB-only numeric)

Validation performance:
VAL:
  Accuracy : 0.7169
  F1       : 0.2014
  Precision: 0.1555
  Recall   : 0.2857
  AUC      : 0.5333

Test performance:
TEST:
  Accuracy : 0.7366
  F1       

In [45]:
# ---------------------------------------------------
# 0. Check we have the splits and doc_text
# ---------------------------------------------------
for df_name, df in [('train_df', train_df), ('val_df', val_df), ('test_df', test_df)]:
    if 'doc_text' not in df.columns:
        raise ValueError(f"{df_name} is missing 'doc_text' column")

# Make sure text is string
for df in (train_df, val_df, test_df):
    df['doc_text'] = df['doc_text'].fillna('').astype(str)

# Target
y_train = train_df['big_move'].astype(int).values
y_val   = val_df['big_move'].astype(int).values
y_test  = test_df['big_move'].astype(int).values

# ---------------------------------------------------
# 1. TF–IDF features on doc_text
# ---------------------------------------------------

tfidf = TfidfVectorizer(
    max_features=10000,   # cap vocab size so it doesn't blow up
    min_df=5,             # ignore ultra-rare words
    max_df=0.7,           # ignore overly common words
    ngram_range=(1, 2),   # unigrams + bigrams
)

X_train_text = tfidf.fit_transform(train_df['doc_text'])
X_val_text   = tfidf.transform(val_df['doc_text'])
X_test_text  = tfidf.transform(test_df['doc_text'])

print("TF–IDF shapes:",
      X_train_text.shape, X_val_text.shape, X_test_text.shape)

# ---------------------------------------------------
# Helper: evaluate a classifier
# ---------------------------------------------------

def eval_classifier(clf_name, y_true, y_prob, thr=0.5):
    y_pred = (y_prob >= thr).astype(int)
    acc  = accuracy_score(y_true, y_pred)
    f1   = f1_score(y_true, y_pred, zero_division=0)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec  = recall_score(y_true, y_pred, zero_division=0)
    try:
        auc = roc_auc_score(y_true, y_prob)
    except ValueError:
        auc = np.nan
    print(f"{clf_name}:")
    print(f"  Accuracy : {acc:.4f}")
    print(f"  F1       : {f1:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall   : {rec:.4f}")
    print(f"  AUC      : {auc:.4f}")
    print()

# ---------------------------------------------------
# 2. Text-only logistic regression
# ---------------------------------------------------

logit_text = LogisticRegression(
    penalty='l2',
    C=1.0,
    class_weight='balanced',
    max_iter=1000,
    solver='lbfgs'
)

logit_text.fit(X_train_text, y_train)

val_probs_text  = logit_text.predict_proba(X_val_text)[:, 1]
test_probs_text = logit_text.predict_proba(X_test_text)[:, 1]

print("\n======================")
print("TEXT-ONLY LOGISTIC")
print("======================\n")
print("Validation performance:")
eval_classifier("VAL (text-only)", y_val, val_probs_text)

print("Test performance:")
eval_classifier("TEST (text-only)", y_test, test_probs_text)

# ---------------------------------------------------
# 3. Text + numeric logistic regression
# ---------------------------------------------------

# Numeric feature set (same as your combined baseline)
num_cols = [
    'return',
    'log_volume',
    'mention_count',
    'score_sum',
    'score_mean',
    'post_fraction',
    'unique_authors',
]

# Make sure these exist & no NaNs
for c in num_cols:
    if c not in train_df.columns:
        raise ValueError(f"Numeric feature '{c}' missing from train_df")
    train_df[c] = train_df[c].fillna(0)
    val_df[c]   = val_df[c].fillna(0)
    test_df[c]  = test_df[c].fillna(0)

X_train_num = train_df[num_cols].values
X_val_num   = val_df[num_cols].values
X_test_num  = test_df[num_cols].values

# Scale numeric part
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)
X_val_num_scaled   = scaler.transform(X_val_num)
X_test_num_scaled  = scaler.transform(X_test_num)

# Convert numeric to sparse and hstack with TF–IDF
X_train_full = hstack([X_train_text, csr_matrix(X_train_num_scaled)])
X_val_full   = hstack([X_val_text,   csr_matrix(X_val_num_scaled)])
X_test_full  = hstack([X_test_text,  csr_matrix(X_test_num_scaled)])

print("Combined feature shapes:",
      X_train_full.shape, X_val_full.shape, X_test_full.shape)

logit_full = LogisticRegression(
    penalty='l2',
    C=1.0,
    class_weight='balanced',
    max_iter=1000,
    solver='lbfgs'
)

logit_full.fit(X_train_full, y_train)

val_probs_full  = logit_full.predict_proba(X_val_full)[:, 1]
test_probs_full = logit_full.predict_proba(X_test_full)[:, 1]

print("\n======================")
print("TEXT + NUMERIC LOGISTIC")
print("======================\n")
print("Validation performance:")
eval_classifier("VAL (text+num)", y_val, val_probs_full)

print("Test performance:")
eval_classifier("TEST (text+num)", y_test, test_probs_full)


TF–IDF shapes: (12645, 10000) (2801, 10000) (2650, 10000)

TEXT-ONLY LOGISTIC

Validation performance:
VAL (text-only):
  Accuracy : 0.7719
  F1       : 0.3654
  Precision: 0.2801
  Recall   : 0.5257
  AUC      : 0.7565

Test performance:
TEST (text-only):
  Accuracy : 0.7419
  F1       : 0.4011
  Precision: 0.2997
  Recall   : 0.6058
  AUC      : 0.7729

Combined feature shapes: (12645, 10007) (2801, 10007) (2650, 10007)

TEXT-ONLY LOGISTIC

Validation performance:
VAL (text-only):
  Accuracy : 0.7719
  F1       : 0.3654
  Precision: 0.2801
  Recall   : 0.5257
  AUC      : 0.7565

Test performance:
TEST (text-only):
  Accuracy : 0.7419
  F1       : 0.4011
  Precision: 0.2997
  Recall   : 0.6058
  AUC      : 0.7729

Combined feature shapes: (12645, 10007) (2801, 10007) (2650, 10007)

TEXT + NUMERIC LOGISTIC

Validation performance:
VAL (text+num):
  Accuracy : 0.7836
  F1       : 0.3701
  Precision: 0.2908
  Recall   : 0.5086
  AUC      : 0.7583

Test performance:
TEST (text+num):
  Ac

In [46]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score
)
from scipy.sparse import hstack, csr_matrix

# ---------------------------------------------------
# 0. Basic setup: targets, numeric features, text
# ---------------------------------------------------
for df_name, df in [('train_df', train_df), ('val_df', val_df), ('test_df', test_df)]:
    if 'doc_text' not in df.columns:
        raise ValueError(f"{df_name} missing 'doc_text'")
    df['doc_text'] = df['doc_text'].fillna('').astype(str)

y_train = train_df['big_move'].astype(int).values
y_val   = val_df['big_move'].astype(int).values
y_test  = test_df['big_move'].astype(int).values

num_cols = [
    'return',
    'log_volume',
    'mention_count',
    'score_sum',
    'score_mean',
    'post_fraction',
    'unique_authors',
]

for c in num_cols:
    if c not in train_df.columns:
        raise ValueError(f"Numeric feature '{c}' missing from train_df")
    train_df[c] = train_df[c].fillna(0)
    val_df[c]   = val_df[c].fillna(0)
    test_df[c]  = test_df[c].fillna(0)

X_train_num = train_df[num_cols].values
X_val_num   = val_df[num_cols].values
X_test_num  = test_df[num_cols].values

scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)
X_val_num_scaled   = scaler.transform(X_val_num)
X_test_num_scaled  = scaler.transform(X_test_num)

# ---------------------------------------------------
# 1. Helper: evaluate metrics at threshold=0.5
# ---------------------------------------------------
def eval_metrics(y_true, y_prob, desc=""):
    thr = 0.5
    y_pred = (y_prob >= thr).astype(int)
    acc  = accuracy_score(y_true, y_pred)
    f1   = f1_score(y_true, y_pred, zero_division=0)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec  = recall_score(y_true, y_pred, zero_division=0)
    try:
        auc = roc_auc_score(y_true, y_prob)
    except ValueError:
        auc = np.nan
    print(desc)
    print(f"  Accuracy : {acc:.4f}")
    print(f"  F1       : {f1:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall   : {rec:.4f}")
    print(f"  AUC      : {auc:.4f}")
    print()
    return {
        'acc': acc, 'f1': f1, 'prec': prec, 'rec': rec, 'auc': auc
    }

# ---------------------------------------------------
# 2. Hyperparameter grids
# ---------------------------------------------------

tfidf_grid = [
    {'max_features': 5000,  'ngram_range': (1, 1), 'min_df': 5, 'max_df': 0.7},
    {'max_features': 10000, 'ngram_range': (1, 1), 'min_df': 5, 'max_df': 0.7},
    {'max_features': 10000, 'ngram_range': (1, 2), 'min_df': 5, 'max_df': 0.7},
    {'max_features': 20000, 'ngram_range': (1, 2), 'min_df': 5, 'max_df': 0.7},
]

logreg_grid = [
    {'C': 0.1, 'class_weight': 'balanced'},
    {'C': 1.0, 'class_weight': 'balanced'},
    {'C': 10.0, 'class_weight': 'balanced'},
    {'C': 1.0, 'class_weight': None},
]

print("Starting hyperparameter search...")

results = []
best_model_artifacts = None  # store best vectorizer + model

# ---------------------------------------------------
# 3. Manual search over (tfidf_params x logreg_params)
# ---------------------------------------------------
for tfidf_params in tfidf_grid:
    # Build and fit vectorizer on TRAIN ONLY
    tfidf = TfidfVectorizer(
        max_features=tfidf_params['max_features'],
        min_df=tfidf_params['min_df'],
        max_df=tfidf_params['max_df'],
        ngram_range=tfidf_params['ngram_range']
    )
    X_train_text = tfidf.fit_transform(train_df['doc_text'])
    X_val_text   = tfidf.transform(val_df['doc_text'])

    # Combine text + numeric
    X_train_full = hstack([X_train_text, csr_matrix(X_train_num_scaled)])
    X_val_full   = hstack([X_val_text,   csr_matrix(X_val_num_scaled)])

    for log_params in logreg_grid:
        print(f"Trying TFIDF={tfidf_params}, LOGREG={log_params} ...")

        clf = LogisticRegression(
            penalty='l2',
            C=log_params['C'],
            class_weight=log_params['class_weight'],
            max_iter=1000,
            solver='lbfgs'
        )
        clf.fit(X_train_full, y_train)

        val_probs = clf.predict_proba(X_val_full)[:, 1]
        metrics = eval_metrics(y_val, val_probs, desc="VAL")

        results.append({
            'tfidf': tfidf_params,
            'logreg': log_params,
            'metrics': metrics
        })

        # Track best by F1 on validation
        if (best_model_artifacts is None) or (metrics['f1'] > best_model_artifacts['metrics']['f1']):
            best_model_artifacts = {
                'tfidf': tfidf,
                'tfidf_params': tfidf_params,
                'logreg': clf,
                'logreg_params': log_params,
                'metrics': metrics
            }

# ---------------------------------------------------
# 4. Show top configs by F1 on validation
# ---------------------------------------------------
results_sorted = sorted(results, key=lambda r: r['metrics']['f1'], reverse=True)
print("\nTop 5 configs by VAL F1:")
for r in results_sorted[:5]:
    m = r['metrics']
    print(f"TFIDF={r['tfidf']}, LOGREG={r['logreg']}, "
          f"F1={m['f1']:.4f}, Prec={m['prec']:.4f}, Rec={m['rec']:.4f}, AUC={m['auc']:.4f}")

print("\nBest config (by VAL F1):")
print("TFIDF params:", best_model_artifacts['tfidf_params'])
print("LOGREG params:", best_model_artifacts['logreg_params'])
print("VAL metrics:", best_model_artifacts['metrics'])

# ---------------------------------------------------
# 5. Evaluate best config on TEST
#    (Rebuild features for test with best tfidf and same scaler)
# ---------------------------------------------------
best_tfidf = best_model_artifacts['tfidf']
best_clf   = best_model_artifacts['logreg']

X_test_text = best_tfidf.transform(test_df['doc_text'])
X_test_full = hstack([X_test_text, csr_matrix(X_test_num_scaled)])

test_probs = best_clf.predict_proba(X_test_full)[:, 1]
print("\n=== BEST MODEL ON TEST ===")
test_metrics = eval_metrics(y_test, test_probs, desc="TEST (best tuned model)")

# ---------------------------------------------------
# 6. OPTIONAL: Threshold tuning on validation for best model
# ---------------------------------------------------

def find_best_threshold(y_true, y_prob, metric='f1'):
    thresholds = np.linspace(0.1, 0.9, 17)  # 0.1, 0.15, ..., 0.9
    best_thr = 0.5
    best_val = -1
    records = []
    for thr in thresholds:
        y_pred = (y_prob >= thr).astype(int)
        f1   = f1_score(y_true, y_pred, zero_division=0)
        prec = precision_score(y_true, y_pred, zero_division=0)
        rec  = recall_score(y_true, y_pred, zero_division=0)
        records.append((thr, f1, prec, rec))
        if metric == 'f1':
            val = f1
        elif metric == 'recall':
            val = rec
        else:
            val = f1
        if val > best_val:
            best_val = val
            best_thr = thr
    return best_thr, records

# Get validation probs for best model
X_val_text_best = best_tfidf.transform(val_df['doc_text'])
X_val_full_best = hstack([X_val_text_best, csr_matrix(X_val_num_scaled)])
val_probs_best  = best_clf.predict_proba(X_val_full_best)[:, 1]

best_thr, thr_records = find_best_threshold(y_val, val_probs_best, metric='f1')
print(f"\nBest threshold on VAL by F1: {best_thr:.2f}")
for thr, f1v, precv, recv in thr_records:
    print(f"thr={thr:.2f}  F1={f1v:.3f}  Prec={precv:.3f}  Rec={recv:.3f}")

# Evaluate tuned threshold on TEST
y_test_pred_tuned = (test_probs >= best_thr).astype(int)
acc_t = accuracy_score(y_test, y_test_pred_tuned)
f1_t  = f1_score(y_test, y_test_pred_tuned, zero_division=0)
prec_t= precision_score(y_test, y_test_pred_tuned, zero_division=0)
rec_t = recall_score(y_test, y_test_pred_tuned, zero_division=0)
print(f"\nTEST with tuned threshold={best_thr:.2f}:")
print(f"  Accuracy : {acc_t:.4f}")
print(f"  F1       : {f1_t:.4f}")
print(f"  Precision: {prec_t:.4f}")
print(f"  Recall   : {rec_t:.4f}")


Starting hyperparameter search...
Trying TFIDF={'max_features': 5000, 'ngram_range': (1, 1), 'min_df': 5, 'max_df': 0.7}, LOGREG={'C': 0.1, 'class_weight': 'balanced'} ...
Trying TFIDF={'max_features': 5000, 'ngram_range': (1, 1), 'min_df': 5, 'max_df': 0.7}, LOGREG={'C': 0.1, 'class_weight': 'balanced'} ...
VAL
  Accuracy : 0.7519
  F1       : 0.3583
  Precision: 0.2647
  Recall   : 0.5543
  AUC      : 0.7535

Trying TFIDF={'max_features': 5000, 'ngram_range': (1, 1), 'min_df': 5, 'max_df': 0.7}, LOGREG={'C': 1.0, 'class_weight': 'balanced'} ...
VAL
  Accuracy : 0.7519
  F1       : 0.3583
  Precision: 0.2647
  Recall   : 0.5543
  AUC      : 0.7535

Trying TFIDF={'max_features': 5000, 'ngram_range': (1, 1), 'min_df': 5, 'max_df': 0.7}, LOGREG={'C': 1.0, 'class_weight': 'balanced'} ...
VAL
  Accuracy : 0.7594
  F1       : 0.3642
  Precision: 0.2718
  Recall   : 0.5514
  AUC      : 0.7539

Trying TFIDF={'max_features': 5000, 'ngram_range': (1, 1), 'min_df': 5, 'max_df': 0.7}, LOGREG={'C'

In [47]:
import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
)

# ---------------------------------------------------
# 0. Prepare train+val for CV and keep test separate
# ---------------------------------------------------

# Make sure datetime is sorted
train_df = train_df.sort_values('datetime')
val_df   = val_df.sort_values('datetime')
test_df  = test_df.sort_values('datetime')

trainval_df = pd.concat([train_df, val_df], axis=0).sort_values('datetime')

# Columns
text_col = 'doc_text'
num_cols = [
    'return',
    'log_volume',
    'mention_count',
    'score_sum',
    'score_mean',
    'post_fraction',
    'unique_authors',
]

# Fill NaNs
trainval_df[text_col] = trainval_df[text_col].fillna('').astype(str)
test_df[text_col]     = test_df[text_col].fillna('').astype(str)

for c in num_cols:
    trainval_df[c] = trainval_df[c].fillna(0)
    test_df[c]     = test_df[c].fillna(0)

X_trainval = trainval_df[[text_col] + num_cols]
y_trainval = trainval_df['big_move'].astype(int).values

X_test = test_df[[text_col] + num_cols]
y_test = test_df['big_move'].astype(int).values

# ---------------------------------------------------
# 1. Build preprocessing + model pipeline
# ---------------------------------------------------

# ColumnTransformer: apply TF-IDF to text, StandardScaler to numeric
preprocess = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(), text_col),
        ('num', StandardScaler(), num_cols),
    ],
    remainder='drop'
)

pipe = Pipeline([
    ('prep', preprocess),
    ('clf', LogisticRegression(max_iter=1000))
])

# ---------------------------------------------------
# 2. Define TimeSeriesSplit and hyperparameter grid
# ---------------------------------------------------

# TimeSeriesSplit: respects chronological order
tss = TimeSeriesSplit(
    n_splits=5  # 5 folds across time on train+val
)

param_grid = {
    # TF-IDF hyperparams
    'prep__tfidf__max_features': [5000, 10000],
    'prep__tfidf__ngram_range': [(1, 1), (1, 2)],
    'prep__tfidf__min_df': [5],
    'prep__tfidf__max_df': [0.7],

    # Logistic hyperparams
    'clf__C': [0.1, 1.0, 10.0],
    'clf__class_weight': ['balanced', None]
}

# We care about F1 for the imbalanced classification
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=tss,
    scoring='f1',
    n_jobs=-1,           # parallelize if you want
    verbose=2
)

# ---------------------------------------------------
# 3. Run GridSearchCV on train+val
# ---------------------------------------------------

grid.fit(X_trainval, y_trainval)

print("Best params:", grid.best_params_)
print("Best CV F1:", grid.best_score_)

best_model = grid.best_estimator_

# ---------------------------------------------------
# 4. Evaluate best model on TEST
# ---------------------------------------------------

y_prob_test = best_model.predict_proba(X_test)[:, 1]
y_pred_test = (y_prob_test >= 0.5).astype(int)

acc  = accuracy_score(y_test, y_pred_test)
f1   = f1_score(y_test, y_pred_test, zero_division=0)
prec = precision_score(y_test, y_pred_test, zero_division=0)
rec  = recall_score(y_test, y_pred_test, zero_division=0)
auc  = roc_auc_score(y_test, y_prob_test)

print("\n=== BEST GRID-SEARCH MODEL ON TEST ===")
print(f"Accuracy : {acc:.4f}")
print(f"F1       : {f1:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"AUC      : {auc:.4f}")


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END clf__C=0.1, clf__class_weight=balanced, prep__tfidf__max_df=0.7, prep__tfidf__max_features=5000, prep__tfidf__min_df=5, prep__tfidf__ngram_range=(1, 1); total time=   1.1s
[CV] END clf__C=0.1, clf__class_weight=balanced, prep__tfidf__max_df=0.7, prep__tfidf__max_features=5000, prep__tfidf__min_df=5, prep__tfidf__ngram_range=(1, 1); total time=   1.1s
[CV] END clf__C=0.1, clf__class_weight=balanced, prep__tfidf__max_df=0.7, prep__tfidf__max_features=5000, prep__tfidf__min_df=5, prep__tfidf__ngram_range=(1, 1); total time=   1.8s
[CV] END clf__C=0.1, clf__class_weight=balanced, prep__tfidf__max_df=0.7, prep__tfidf__max_features=10000, prep__tfidf__min_df=5, prep__tfidf__ngram_range=(1, 1); total time=   1.1s
[CV] END clf__C=0.1, clf__class_weight=balanced, prep__tfidf__max_df=0.7, prep__tfidf__max_features=5000, prep__tfidf__min_df=5, prep__tfidf__ngram_range=(1, 1); total time=   1.8s
[CV] END clf__C=0.1, clf__class_