In [135]:
import pandas as pd
import warnings
from __future__ import annotations
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from arch.univariate.base import DataScaleWarning

warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DataScaleWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=ConvergenceWarning)

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

In [136]:
from typing import Optional
import pandas as pd
import numpy as np
from zoneinfo import ZoneInfo
from statsmodels.tsa.arima.model import ARIMA
from arch import arch_model
import statsmodels.api as sm

###############################################################################
# -----------------------------  CONFIG  --------------------------------------
###############################################################################
MARKET_TZ       = ZoneInfo('US/Eastern')
TRADING_START   = '09:30'
TRADING_END     = '16:00'
TOPIC_THRESH    = 0.80
ARIMA_ORDER     = (1, 0, 0)        # AR(1)
KERNEL          = np.array([1.0])  # single‑minute impulse
RET_SCALE = 1000        # 1 ≤ RET_SCALE ≤ 1000 recommended by arch
###############################################################################

# ────────────────────────  helpers  ───────────────────────────────────────────
def to_market_time(utc_ts: pd.Timestamp) -> pd.Timestamp:
    """UTC → naive NY   (tz info stripped for speed)."""
    return utc_ts.tz_convert(MARKET_TZ).tz_localize(None)

def first_session_minute(ts_naive: pd.Timestamp,
                         price_index: pd.DatetimeIndex) -> Optional[pd.Timestamp]:
    """Map a timestamp to first trading minute at/after ts in price_index."""
    if ts_naive > price_index[-1]:
        return None
    if ts_naive in price_index:
        return ts_naive
    pos = price_index.get_indexer([ts_naive], method='bfill')[0]
    return price_index[pos] if pos != -1 else None

def prepare_price_df(raw: pd.DataFrame) -> pd.Series:
    """Return minutely log‑returns on a *regular* grid (NaNs for gaps)."""
    ts = (pd.to_datetime(raw['timestamp'])
            .dt.tz_localize(MARKET_TZ, nonexistent='shift_forward',
                            ambiguous='NaT')
            .dt.tz_localize(None))

    px = (raw.assign(timestamp=ts)
              .set_index('timestamp')
              .between_time(TRADING_START, TRADING_END)
              ['avg_price']
              .resample('1min')          # regular grid
              .last()
              .asfreq('1min')            # keep NaNs instead of dropping minutes
           )

    # convert to returns, drop NaNs from both resample and diff
    returns = np.log(px).diff().dropna()

    return returns

def fit_baseline_garch(returns: pd.Series) -> pd.Series:
    """ARIMA(1) + GARCH(1,1) → squared standardised residuals."""
    ar_res   = ARIMA(returns, order=ARIMA_ORDER, trend='n').fit()
    resid    = ar_res.resid
    garch    = arch_model(resid, mean='Zero', vol='Garch', p=1, q=1).fit(disp='off')
    return garch.std_resid ** 2

def impulse_series(event_ts: pd.Timestamp,
                   base_index: pd.DatetimeIndex,
                   kernel: np.ndarray = KERNEL) -> pd.Series:
    """Series aligned to base_index, populated with kernel starting at event_ts."""
    s = pd.Series(0.0, index=base_index, name='impulse')
    if event_ts in s.index:
        start = s.index.get_loc(event_ts)
        end   = min(start + len(kernel), len(s))
        s.iloc[start:end] = kernel[:end - start]
    return s

# ──────────────────────  MAIN FUNCTION  ──────────────────────────────────────
def run_event_study(tweets_df: pd.DataFrame,
                    prices_df: pd.DataFrame,
                    tickers_topics_df: pd.DataFrame,
                    ticker_col_guess: list[str] = ('ticker', 'sym_root', 'symbol')
                   ) -> pd.DataFrame:
    """
    Parameters
    ----------
    tweets_df            : tweets + topic columns (FinBERT, zero‑shot, etc.)
    prices_df            : *single big* TAQ‑style table with at least
                           ['timestamp', 'avg_price', <ticker column>]
    tickers_topics_df    : mapping table with columns ['ticker', 'topic']
    """
    # 1) detect which column holds the ticker symbol
    ticker_col = next((c for c in ticker_col_guess if c in prices_df.columns), None)
    if ticker_col is None:
        raise ValueError('Could not find a ticker column in prices_df.')

    # 2) build dict {topic: [tickers]} once
    topic_to_tickers = (tickers_topics_df.groupby('topic')['ticker']
                                        .apply(list)
                                        .to_dict())
    topic_cols = [c for c in tweets_df.columns if c in topic_to_tickers]

    results = []

    # 3) iterate over *each ticker* present in prices_df
    for ticker, raw_px in prices_df.groupby(ticker_col, sort=False):
        returns = prepare_price_df(raw_px)
        if returns.empty:
            continue
        returns_scaled = returns * RET_SCALE
        price_idx   = returns.index
        std_resid2  = fit_baseline_garch(returns)

        # select tweets whose high‑score topics map to THIS ticker
        def tweet_relevant(row) -> bool:
            high_topics = [t for t in topic_cols if row[t] >= TOPIC_THRESH]
            return any(ticker in topic_to_tickers.get(t, []) for t in high_topics)

        for _, tw in tweets_df.loc[tweets_df.apply(tweet_relevant, axis=1)].iterrows():
            evt_ts = first_session_minute(
                to_market_time(pd.to_datetime(tw['timestamp'])),
                price_idx
            )
            if evt_ts is None:
                continue

            imp = impulse_series(evt_ts, std_resid2.index)
            df_reg = pd.concat({'y': std_resid2, 'impulse': imp}, axis=1).dropna()

            X   = sm.add_constant(df_reg['impulse'])
            ols = sm.OLS(df_reg['y'], X).fit()

            results.append({
                'handle'  : tw['handle'],
                'tweet_id': tw['id'],
                'ticker'  : ticker,
                'gamma'   : ols.params['impulse'],
                'pvalue'  : ols.pvalues['impulse']
            })

    return pd.DataFrame(results)


In [137]:
# --- 1. tweets ---------------------------------------------------------------
# tweets_df = pd.read_parquet('tweets.parquet')           # already has topic columns
tweets_df = pd.read_csv('../data/tweets_with_sentiment_and_topic.csv')

# --- 2. prices ---------------------------------------------------------------
prices_df = pd.read_parquet(
    '../data/taq/',
    engine='pyarrow',
    filters=[
        ('SYM_ROOT', '==', 'TSLA')
        # ('year', '==', 2024)
    ]
)
prices_df.columns = ['timestamp', 'avg_price', 'ticker', 'year']
del prices_df['year']

# --- 3. ticker‑to‑topic map --------------------------------------------------
tickers_topics_df = pd.read_csv('../data/05_people_stock_link_simplified.csv', sep=';')   # columns: ticker,topic
tickers_topics_df = tickers_topics_df[['industry', 'ticker']]
tickers_topics_df.columns = ['topic', 'ticker']
# --- 4. run study ------------------------------------------------------------
result_df = run_event_study(tweets_df, prices_df, tickers_topics_df)

# e.g. keep only strongly significant results
sig = result_df[result_df['pvalue'] < 0.05]
print(sig.sort_values('pvalue').head(20))


Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.



      handle             tweet_id ticker       gamma        pvalue
9   elonmusk  1146392791062212608   TSLA  150.137362  9.571517e-32
14  elonmusk  1130200352576303106   TSLA   51.933704  5.004929e-05
10  elonmusk  1140413721631551489   TSLA   38.590675  2.582861e-03
18  elonmusk  1113581319890526208   TSLA   32.749283  1.054827e-02
