# Lead–Lag Analysis (clean workflow)

This notebook loads the balanced weekly panel and produces:

- Descriptive brand time-series and cross-correlations
- Event-study plots (with 95% CIs)
- Optional diagnostics (non-overlap windows, alignment shifts, week-demeaned)
- Placebo and Event−Placebo difference plots

Reusable helpers (plotting, event selection) live in `code/lead_lag_analysis.py`. The notebook focuses on orchestration and visualization.


In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lead_lag_analysis import (
    add_normalizations, event_study_value,
    xcorr_by_company, plot_xcorr, event_study, plot_event,
    plot_event_ci, plot_brand_timeseries, sanitize_filename,
    event_study_from_indices, build_event_dict, plot_diff_ci,
)


In [2]:
# --- Paths -------------------------------------------------------------------
# The repository root can be overridden with the LEAD_LAG_REPO_ROOT environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original location is used.
REPO_ROOT = os.environ.get("LEAD_LAG_REPO_ROOT", "/Users/beszabo/bene/szakdolgozat")
DERIVED_DIR = os.path.join(REPO_ROOT, "data", "panels")
FIG_DIR = os.path.join(REPO_ROOT, "figures")

# Use balanced, calendar-complete panel for event study
PANEL_CSV = os.path.join(DERIVED_DIR, "company_weekly_panel_analysis_ready.csv")

panel = pd.read_csv(PANEL_CSV, parse_dates=['week_start'])

# --- Column aliases ----------------------------------------------------------
# Map balanced panel column names back to legacy names used in downstream code.
# An alias is only created when the source exists and the legacy name is free,
# so an existing legacy column is never overwritten.
alias_map = {
    'nyt_pos_share': 'mean_pos',
    'nyt_neg_share': 'mean_neg',
    'nyt_sentiment': 'sentiment_score',
    'nyt_non_neutral_share': 'non_neutral_share',
}
for src, dst in alias_map.items():
    if src in panel.columns and dst not in panel.columns:
        panel[dst] = panel[src]
# Alias for old 'num_articles' usages
if 'num_articles' not in panel.columns and 'NYT_mention' in panel.columns:
    panel['num_articles'] = panel['NYT_mention']

# --- NaN-aware meme-volume outcome ------------------------------------------
# Treat truly unobserved meme weeks as missing (NaN) for outcome averaging.
# A week counts as unobserved when it has zero memes and (if the column is
# available) no meme sentiment either; without the sentiment column every
# zero-meme week is treated as unobserved.
if 'num_memes' in panel.columns:
    if 'mean_meme_sentiment' in panel.columns:
        missing_memes = (panel['num_memes'] == 0) & (panel['mean_meme_sentiment'].isna())
    else:
        missing_memes = (panel['num_memes'] == 0)
    panel['num_memes_na'] = panel['num_memes'].astype(float)
    panel.loc[missing_memes, 'num_memes_na'] = np.nan
    # NaN-aware z-score per company (computed on observed weeks only)
    stats = (panel.loc[~missing_memes]
                  .groupby('company')['num_memes']
                  .agg(mu='mean', sd='std')
                  .reset_index())
    # Left merge keeps every panel row; companies with no observed week get
    # NaN for mu/sd and therefore a NaN z-score.
    panel = panel.merge(stats, on='company', how='left')
    # A zero standard deviation would divide by zero; turn it into NaN instead.
    panel['sd'] = panel['sd'].replace(0, np.nan)
    panel['num_memes_z_es'] = (panel['num_memes_na'] - panel['mu']) / panel['sd']
    # For compatibility with plotting/xcorr helpers
    panel['num_memes_z'] = panel['num_memes_z_es']

# Event study on NaN-aware z outcome, using balanced panel indexing
pos_z, neg_z = event_study_value(panel,
                                 pos_feature='mean_pos',
                                 neg_feature='mean_neg',
                                 value_col='num_memes_z_es',
                                 window=3)

# Make sure the figure directory exists before any later cell writes to it.
os.makedirs(FIG_DIR, exist_ok=True)

In [3]:
# Descriptive brand time-series and event-study with 95% CIs

# Rank brands by their total NYT mentions and keep the ten largest.
mentions_per_company = panel.groupby('company')['NYT_mention'].sum()
top_companies = (
    mentions_per_company
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)

# One time-series figure per selected brand: NYT mentions on the left column,
# z-scored meme volume on the right column, smoothing parameter 4.
for brand in top_companies:
    ts_name = f"ts_{sanitize_filename(brand)}_num_articles_vs_num_memes_z.png"
    ts_path = os.path.join(FIG_DIR, ts_name)
    plot_brand_timeseries(
        panel,
        brand,
        left_col='NYT_mention',
        right_col='num_memes_z',
        smooth=4,
        out_path=ts_path,
    )
    print(f"Saved {ts_path}")

# Event-study plots with 95% confidence intervals (NaN-aware z outcome).
# Each entry: (event-study frame, figure title, output file name).
ci_plots = [
    (pos_z,
     'Event: Positive news vs normalized (z) meme volume (95% CI)',
     'event_pos_num_memes_z_ci.png'),
    (neg_z,
     'Event: Negative news vs normalized (z) meme volume (95% CI)',
     'event_neg_num_memes_z_ci.png'),
]
for study_df, title, file_name in ci_plots:
    plot_event_ci(study_df, 'num_memes_z_es', title, os.path.join(FIG_DIR, file_name))
print('Saved CI event-study plots.')


Saved /Users/beszabo/bene/szakdolgozat/figures/ts_facebook_num_articles_vs_num_memes_z.png
Saved /Users/beszabo/bene/szakdolgozat/figures/ts_youtube_num_articles_vs_num_memes_z.png
Saved /Users/beszabo/bene/szakdolgozat/figures/ts_netflix_num_articles_vs_num_memes_z.png
Saved /Users/beszabo/bene/szakdolgozat/figures/ts_google_num_articles_vs_num_memes_z.png
Saved /Users/beszabo/bene/szakdolgozat/figures/ts_instagram_num_articles_vs_num_memes_z.png
Saved /Users/beszabo/bene/szakdolgozat/figures/ts_apple_num_articles_vs_num_memes_z.png
Saved /Users/beszabo/bene/szakdolgozat/figures/ts_microsoft_num_articles_vs_num_memes_z.png
Saved /Users/beszabo/bene/szakdolgozat/figures/ts_amazon_num_articles_vs_num_memes_z.png
Saved /Users/beszabo/bene/szakdolgozat/figures/ts_tesla_num_articles_vs_num_memes_z.png
Saved /Users/beszabo/bene/szakdolgozat/figures/ts_spotify_num_articles_vs_num_memes_z.png
Saved CI event-study plots.


# Sanity checks

In [4]:
# Sanity checks: schema, missingness, continuity

required = ['company','week_start','NYT_mention','num_memes']
optional = ['meme_engagement','sentiment_score','mean_pos','mean_neg','non_neutral_share']

# Show the first 20 column names (alphabetical) as a quick schema preview.
all_columns_sorted = sorted(panel.columns.tolist())
print('Columns present:', all_columns_sorted[:20], '...')

# Fail fast when a required column is absent.
missing_required = [name for name in required if name not in panel.columns]
assert not missing_required, f"Missing required cols: {missing_required}"

# Basic coverage: date span and the non-null share of every known column.
print('Date range:', panel['week_start'].min(), '→', panel['week_start'].max())
for name in required + optional:
    if name not in panel.columns:
        continue
    non_null_share = panel[name].notna().mean()
    print(f"Non-null share {name}: {non_null_share:.3f}")

# NYT_mention continuity: share of weeks with >0 mentions by year.
# Missing mention counts are treated as zero for this share.
panel['year'] = panel['week_start'].dt.year
nyt_by_year = (
    panel.groupby(['company','year'])['NYT_mention']
         .apply(lambda weekly: (weekly.fillna(0) > 0).mean())
         .rename('share_weeks_with_mentions')
         .reset_index()
)

print('\nExample continuity (top 5 companies by mentions):')
top5 = (panel.groupby('company')['NYT_mention'].sum().sort_values(ascending=False).head(5).index)
top5_rows = nyt_by_year[nyt_by_year['company'].isin(top5)]
continuity_table = top5_rows.pivot(index='company', columns='year', values='share_weeks_with_mentions')
print(continuity_table.fillna(0.0).round(2))


Columns present: ['NYT_mention', 'NYT_mention_L1', 'NYT_mention_L2', 'NYT_mention_L3', 'NYT_mention_L4', 'company', 'iso_week', 'iso_year', 'log1p_meme_engagement', 'log1p_meme_volume', 'mean_meme_sentiment', 'mean_meme_sentiment_L1', 'mean_meme_sentiment_L2', 'mean_meme_sentiment_L3', 'mean_meme_sentiment_L4', 'mean_neg', 'mean_pos', 'meme_engagement', 'meme_engagement_L1', 'meme_engagement_L2'] ...
Date range: 2022-12-26 00:00:00 → 2024-12-30 00:00:00
Non-null share company: 1.000
Non-null share week_start: 1.000
Non-null share NYT_mention: 1.000
Non-null share num_memes: 1.000
Non-null share meme_engagement: 0.110
Non-null share sentiment_score: 0.443
Non-null share mean_pos: 0.443
Non-null share mean_neg: 0.443
Non-null share non_neutral_share: 0.443

Example continuity (top 5 companies by mentions):
year       2022  2023  2024
company                    
facebook    1.0   1.0  0.09
google      1.0   1.0  0.09
instagram   1.0   1.0  0.09
netflix     1.0   1.0  1.00
youtube     1.0 

In [5]:
# Sanity checks: alternative outcomes (num_memes_rel, log1p metrics)

# Ensure alt outcomes exist. Both log1p columns are (re)computed here from the
# raw columns, with missing values replaced by 0 before the transform.
# NOTE(review): meme_engagement is mostly missing in this panel, so fillna(0)
# treats unobserved weeks as zero engagement — confirm this is intended.
panel['log1p_meme_volume'] = np.log1p(panel['num_memes'].fillna(0))
if 'meme_engagement' in panel.columns:
    panel['log1p_meme_engagement'] = np.log1p(panel['meme_engagement'].fillna(0))

# Outcomes to run, as (value column, human-readable label used in the titles).
# num_memes_rel is provided by add_normalizations(panel).
# NOTE(review): add_normalizations is imported but never called in this
# notebook — confirm where num_memes_rel is actually created.
alt_outcomes = [
    ('num_memes_rel', 'relative meme volume'),
    ('log1p_meme_volume', 'log1p meme volume'),
]
# The engagement outcome is optional and only run when its column exists.
if 'log1p_meme_engagement' in panel.columns:
    alt_outcomes.append(('log1p_meme_engagement', 'log1p meme engagement'))

alt_results = {}
for outcome_col, outcome_label in alt_outcomes:
    pos_df, neg_df = event_study_value(
        panel,
        pos_feature='mean_pos',
        neg_feature='mean_neg',
        value_col=outcome_col,
        window=3,
    )
    plot_event_ci(pos_df, outcome_col,
                  f'Event: Positive news vs {outcome_label} (95% CI)',
                  os.path.join(FIG_DIR, f'event_pos_{outcome_col}_ci.png'))
    plot_event_ci(neg_df, outcome_col,
                  f'Event: Negative news vs {outcome_label} (95% CI)',
                  os.path.join(FIG_DIR, f'event_neg_{outcome_col}_ci.png'))
    alt_results[outcome_col] = (pos_df, neg_df)

# Keep the per-outcome variable names available for any later use.
pos_rel, neg_rel = alt_results['num_memes_rel']
pos_logv, neg_logv = alt_results['log1p_meme_volume']
if 'log1p_meme_engagement' in alt_results:
    pos_loge, neg_loge = alt_results['log1p_meme_engagement']

print('Saved alternative outcome CI event-study plots.')


Saved alternative outcome CI event-study plots.


# Diagnostics: non-overlapping windows, alignment shifts, week-demeaned outcomes

In [6]:
# Diagnostics: event selection, overlap filtering, alignment tests, week-demeaned outcomes

win = 3
# Keyword arguments shared by every event selection in this cell.
noov_kwargs = dict(q=0.90, window=win, non_overlapping=True)

# 1) Non-overlapping event windows
pos_events_noov = build_event_dict(panel, 'mean_pos', **noov_kwargs)
neg_events_noov = build_event_dict(panel, 'mean_neg', **noov_kwargs)

pos_noov = event_study_from_indices(panel, pos_events_noov, 'num_memes_z', window=win)
neg_noov = event_study_from_indices(panel, neg_events_noov, 'num_memes_z', window=win)

for study_df, tone_word, tone_tag in ((pos_noov, 'Positive', 'pos'), (neg_noov, 'Negative', 'neg')):
    plot_event_ci(
        study_df,
        'num_memes_z',
        f'Event (no-overlap): {tone_word} news vs num_memes_z',
        os.path.join(FIG_DIR, f'event_{tone_tag}_num_memes_z_ci_nooverlap.png'),
    )

# Total number of retained events across all companies.
n_pos_kept = sum(len(idxs) for idxs in pos_events_noov.values())
n_neg_kept = sum(len(idxs) for idxs in neg_events_noov.values())
print('Events kept (pos/neg):', n_pos_kept, n_neg_kept)

# 2) Alignment shift tests (shift events by -1 and +1 week); positive events only
pos_events_m1 = build_event_dict(panel, 'mean_pos', shift=-1, **noov_kwargs)
pos_events_p1 = build_event_dict(panel, 'mean_pos', shift=+1, **noov_kwargs)

shifted_events = {'shift_m1': pos_events_m1, 'shift_p1': pos_events_p1}
for lab, ev in shifted_events.items():
    dfv = event_study_from_indices(panel, ev, 'num_memes_z', window=win)
    out = os.path.join(FIG_DIR, f'event_pos_num_memes_z_ci_{lab}.png')
    plot_event_ci(dfv, 'num_memes_z', f'Event (pos, {lab}): num_memes_z', out)
    # quick stat: mean at tau==0
    if dfv.empty:
        continue
    at_event_week = dfv.loc[dfv['tau'] == 0, 'num_memes_z']
    print(lab, 'tau0 mean=', at_event_week.mean())

# 3) Week-demeaned outcomes: subtract the cross-company mean of each calendar week
wk_mean = panel.groupby('week_start')['num_memes_z'].transform('mean')
panel['num_memes_z_dm'] = panel['num_memes_z'].sub(wk_mean)

pos_dm = event_study_from_indices(panel, pos_events_noov, 'num_memes_z_dm', window=win)
neg_dm = event_study_from_indices(panel, neg_events_noov, 'num_memes_z_dm', window=win)
plot_event_ci(
    pos_dm,
    'num_memes_z_dm',
    'Event (demeaned): Positive news vs num_memes_z',
    os.path.join(FIG_DIR, 'event_pos_num_memes_z_demeaned_ci.png'),
)
plot_event_ci(
    neg_dm,
    'num_memes_z_dm',
    'Event (demeaned): Negative news vs num_memes_z',
    os.path.join(FIG_DIR, 'event_neg_num_memes_z_demeaned_ci.png'),
)

print('Saved no-overlap, shift tests, and demeaned-week diagnostics.')


Events kept (pos/neg): 350 341
shift_m1 tau0 mean= 0.11333637073035413
shift_p1 tau0 mean= -0.06032481282113015
Saved no-overlap, shift tests, and demeaned-week diagnostics.


In [8]:
# Diagnostics: use NYT_mention spikes as events, split by tone at event week
win = 3

# Event definition: top 10% NYT_mention per company, non-overlapping
mention_events = build_event_dict(panel, 'NYT_mention', q=0.90, window=win, non_overlapping=True)

# Unconditional on tone
m_ev = event_study_from_indices(panel, mention_events, 'num_memes_z', window=win)
plot_event_ci(
    m_ev,
    'num_memes_z',
    'Event (mentions spikes): num_memes_z',
    os.path.join(FIG_DIR, 'event_mentions_num_memes_z_ci.png'),
)

# Split by tone sign at event week (sentiment_score >= 0 vs < 0).
# Events whose sentiment_score is missing land in neither bucket.
pos_split: dict[str, list[int]] = {}
neg_split: dict[str, list[int]] = {}
for company, g in panel.groupby('company'):
    # Event indices are looked up against the company's rows ordered by week.
    g = g.sort_values('week_start').reset_index(drop=True)
    tone_by_pos = g['sentiment_score']
    pos_idx, neg_idx = [], []
    for i in mention_events.get(company, []):
        score = tone_by_pos.loc[i]
        if pd.isna(score):
            continue
        if score >= 0:
            pos_idx.append(i)
        elif score < 0:
            neg_idx.append(i)
    pos_split[company] = pos_idx
    neg_split[company] = neg_idx

m_pos = event_study_from_indices(panel, pos_split, 'num_memes_z', window=win)
m_neg = event_study_from_indices(panel, neg_split, 'num_memes_z', window=win)
for study_df, tone_tag in ((m_pos, 'pos'), (m_neg, 'neg')):
    plot_event_ci(
        study_df,
        'num_memes_z',
        f'Event (mentions spikes, {tone_tag} tone): num_memes_z',
        os.path.join(FIG_DIR, f'event_mentions_{tone_tag}_num_memes_z_ci.png'),
    )

print('Saved NYT_mention spike-based event-study plots (overall and tone-split).')


Saved NYT_mention spike-based event-study plots (overall and tone-split).


In [9]:
# Sanity checks: create lags and updated CCF using NYT_mention

# Predictors that get lagged copies; only those present in the panel are used.
lag_candidates = ['NYT_mention','sentiment_score','mean_pos','mean_neg','non_neutral_share']
lag_sources = [name for name in lag_candidates if name in panel.columns]

# Lags are taken within company, so rows must be in week order per company.
panel = panel.sort_values(['company','week_start']).copy()

# Create L1..L4 for each predictor; an existing lag column is left untouched.
for source_col, lag in ((s, k) for s in lag_sources for k in range(1, 5)):
    lag_col = f"{source_col}_L{lag}"
    if lag_col in panel.columns:
        continue
    panel[lag_col] = panel.groupby('company')[source_col].shift(lag)

# Recompute CCF for NYT_mention specifically (vs num_memes_z)
df_ccf = xcorr_by_company(panel, 'NYT_mention', max_lag=4)
out_ccf = os.path.join(FIG_DIR, 'xcorr_NYT_mention_vs_num_memes_z.png')
plot_xcorr(df_ccf, 'Lead-Lag: NYT_mention vs num_memes_z', out_ccf)
print('Saved', out_ccf)


Saved /Users/beszabo/bene/szakdolgozat/figures/xcorr_NYT_mention_vs_num_memes_z.png


# Placebo

In [10]:
# Placebo test: match on NYT_mention deciles within company

# Seeded generator used for every placebo draw in this section, so a re-run of
# the cell reproduces the same placebo weeks. numpy is already imported as `np`
# in the notebook's import cell, so it is not imported again here.
PLACEBO_SEED = 42
rng = np.random.default_rng(PLACEBO_SEED)

# Helper: compute deciles within company using rank to avoid ties issues
def _company_deciles(s: pd.Series, q: int = 10) -> pd.Series:
    r = s.rank(method='first')
    try:
        return pd.qcut(r, q, labels=False, duplicates='drop')
    except Exception:
        # Fallback to single bin if not enough unique values
        return pd.Series(0, index=s.index)

# Build event indices dict (non-overlapping) using legacy feature names (aliased earlier)
win = 3


def _keep_no_overlap(idxs):
    """Thin a list of event positions so kept events are more than `win` apart.

    Positions are scanned in ascending order and one is kept only when it lies
    more than `win` positions after the previously kept one (so the minimum
    spacing is ``win + 1``). Note that +/-`win` windows around two kept events
    can still share weeks under this rule.
    """
    kept, last = [], -10_000
    for i in sorted(idxs):
        if i - last > win:
            kept.append(i)
            last = i
    return kept


def _top_quantile_positions(values: pd.Series, q: float = 0.90) -> list:
    """Return the 0-based positions whose value is non-missing and >= the q-quantile.

    An empty or all-NaN series has a NaN threshold, so no position qualifies.
    """
    thr = values.quantile(q)
    hits = values.notna() & (values >= thr)
    return np.flatnonzero(hits.to_numpy()).tolist()


# For every company: positions (in week order) of the top-10% weeks of each
# tone feature, thinned by _keep_no_overlap. A missing feature column yields an
# empty event list for that company.
pos_events = {}
neg_events = {}
for company, g in panel.groupby('company'):
    g = g.sort_values('week_start').reset_index(drop=True)
    for feature, store in (('mean_pos', pos_events), ('mean_neg', neg_events)):
        if feature in g.columns:
            store[company] = _keep_no_overlap(_top_quantile_positions(g[feature]))
        else:
            store[company] = []

# Precompute deciles per company on NYT_mention.
# The panel is put in (company, week) order with a fresh 0..n-1 index first.
panel = panel.sort_values(['company','week_start']).reset_index(drop=True)
panel['nyt_decile'] = panel.groupby('company')['NYT_mention'].transform(_company_deciles)

# Placebo event-study builder

def placebo_from_events(panel: pd.DataFrame, events: dict, value_col: str, window: int,
                        random_state=None) -> pd.DataFrame:
    """Build placebo event windows by drawing one pseudo-event week per real event.

    For each company the rows are put in ``week_start`` order and addressed by
    0-based position. For every real event position ``i`` one placebo position
    ``p`` is drawn uniformly from the positions that are more than ``window``
    weeks away from ``i`` and share the event week's ``nyt_decile`` (a missing
    decile at the event week is treated as 0; candidates with a missing decile
    never match). If no such position exists, the decile requirement is dropped;
    if there is still no candidate the event is skipped. The values of
    ``value_col`` at ``p - window .. p + window`` (clipped to the company's
    rows, missing values omitted) are then collected.

    Parameters
    ----------
    panel : pd.DataFrame
        Must contain ``company``, ``week_start`` and ``value_col``; ``nyt_decile``
        is used when present, otherwise every week is treated as decile 0.
    events : dict
        Maps company -> list of integer event positions within that company's
        week-ordered rows. Positions outside ``0 .. n-1`` are skipped.
    value_col : str
        Outcome column to collect around each placebo week.
    window : int
        Half-width of the window, in rows.
    random_state : numpy.random.Generator, optional
        Generator used for the draws. When omitted, the module-level ``rng``
        is used, which keeps the original call signature working.

    Returns
    -------
    pd.DataFrame
        Long frame with columns ``company``, ``tau`` (offset from the placebo
        week) and ``value_col``; empty (but with these columns) when nothing
        could be drawn.
    """
    gen = rng if random_state is None else random_state
    out_cols = ['company', 'tau', value_col]
    rows = []
    for company, g in panel.groupby('company'):
        g = g.sort_values('week_start').reset_index(drop=True)
        n = len(g)
        if n == 0:
            continue
        positions = np.arange(n)
        if 'nyt_decile' in g.columns:
            dec = g['nyt_decile'].to_numpy(dtype=float, na_value=np.nan)
        else:
            dec = np.zeros(n)
        # Truncate toward zero so comparisons are made on whole-number deciles;
        # NaN stays NaN and therefore never equals a target decile.
        dec = np.trunc(dec)
        values = g[value_col].to_numpy()
        for i in events.get(company, []):
            if not 0 <= i < n:
                # Event position does not exist for this company; nothing to match.
                continue
            d = 0.0 if np.isnan(dec[i]) else dec[i]
            far_enough = np.abs(positions - i) > window
            candidates = positions[far_enough & (positions != i) & (dec == d)]
            if candidates.size == 0:
                # No same-decile week available: fall back to any distant week.
                candidates = positions[far_enough]
            if candidates.size == 0:
                continue
            p = int(gen.choice(candidates))
            first, last = max(p - window, 0), min(p + window, n - 1)
            for j in range(first, last + 1):
                v = values[j]
                if pd.notna(v):
                    rows.append({'company': company, 'tau': j - p, value_col: float(v)})
    return pd.DataFrame(rows, columns=out_cols)

# Build placebo windows matched on NYT_mention deciles.
# Positive events are drawn first, then negative ones, so both consume the
# shared generator in a fixed order.
pos_placebo = placebo_from_events(panel, pos_events, value_col='num_memes_z_es', window=win)
neg_placebo = placebo_from_events(panel, neg_events, value_col='num_memes_z_es', window=win)

placebo_by_tone = (('pos', pos_placebo), ('neg', neg_placebo))

# Plot each placebo event study with its confidence band
for tone_tag, placebo_df in placebo_by_tone:
    plot_event_ci(
        placebo_df,
        'num_memes_z_es',
        f'Placebo ({tone_tag} events, matched by NYT decile)',
        os.path.join(FIG_DIR, f'event_{tone_tag}_num_memes_z_placebo_ci.png'),
    )

# Quick comparison at tau=0 (skipped for an empty placebo frame)
for tone_tag, placebo_df in placebo_by_tone:
    if placebo_df.empty:
        continue
    at_placebo_week = placebo_df.loc[placebo_df['tau'] == 0, 'num_memes_z_es']
    print(f'Placebo {tone_tag} tau0 mean =', float(at_placebo_week.mean()))


Placebo pos tau0 mean = -0.2528264676987362
Placebo neg tau0 mean = -0.06824508001727238


In [11]:
# Event − Placebo difference CI plots (main results)

# All difference figures go into figures/results/event_diff.
res_dir = os.path.join(FIG_DIR, 'results', 'event_diff')
os.makedirs(res_dir, exist_ok=True)

# NOTE(review): pos_z / neg_z come from event_study_value, while the placebo
# frames were drawn from the hand-built pos_events / neg_events — confirm both
# use the same event definition before interpreting the difference.
diff_inputs = (
    ('pos', pos_z, pos_placebo),
    ('neg', neg_z, neg_placebo),
)
for tone_tag, event_df, placebo_df in diff_inputs:
    plot_diff_ci(
        event_df, placebo_df, 'num_memes_z_es',
        f'Event − Placebo ({tone_tag}): normalized (z) meme volume (95% CI)',
        os.path.join(res_dir, f'event_{tone_tag}_num_memes_z_diff_ci.png'),
    )
print('Saved Event − Placebo difference plots to', res_dir)


Saved Event − Placebo difference plots to /Users/beszabo/bene/szakdolgozat/figures/results/event_diff
