In [8]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import re
from pathlib import Path

# Paths
PANEL_CSV = "/Users/beszabo/bene/szakdolgozat/data/derived/company_weekly_panel_analysis_ready.csv"
DERIVED = Path("/Users/beszabo/bene/szakdolgozat/data/derived")

# Load panel
panel = pd.read_csv(PANEL_CSV, parse_dates=["week_start"])  # expects columns created earlier
panel["week_fe"] = panel["week_start"].dt.strftime("%Y-%W")  # weekly FE key

# How many rows survive the full spec (current + L1..L4 for both predictors)?
req = (["NYT_mention","nyt_sentiment"] +
       [f"NYT_mention_L{k}" for k in range(1,3)] +
       [f"nyt_sentiment_L{k}" for k in range(1,3)])
panel[req].notna().all(axis=1).sum()

np.int64(136)

In [9]:
# Helper to run TWFE OLS with clustered SEs

def run_twfe(
    df: pd.DataFrame,
    outcome: str,
    include_current: bool = True,
    lags: list[int] = [1, 2, 3, 4],
):
    # Select predictors that exist in the dataframe
    mention_vars = [f"NYT_mention_L{k}" for k in lags if f"NYT_mention_L{k}" in df.columns]
    sent_vars = [f"nyt_sentiment_L{k}" for k in lags if f"nyt_sentiment_L{k}" in df.columns]
    if include_current:
        if "NYT_mention" in df.columns:
            mention_vars = ["NYT_mention"] + mention_vars
        if "nyt_sentiment" in df.columns:
            sent_vars = ["nyt_sentiment"] + sent_vars
    X = mention_vars + sent_vars

    needed = [outcome, "company", "week_fe"] + X
    d = df.dropna(subset=needed).copy()
    if d.empty:
        print(f"[WARN] No rows left after dropping NaNs for {outcome}. Skipping.")
        return None

    formula = f"{outcome} ~ " + " + ".join(X + ["C(company)", "C(week_fe)"])
    print("\n===", outcome, "===")
    print("N before drop:", len(df), " N after drop:", len(d))
    print("Formula:", formula)

    model = smf.ols(formula, data=d).fit(cov_type="cluster", cov_kwds={"groups": d["company"]})
    print(model.summary())

    # Cumulative lag tests (only over lag terms, not including current)
    m_lags = [f"NYT_mention_L{k}" for k in lags if f"NYT_mention_L{k}" in d.columns]
    s_lags = [f"nyt_sentiment_L{k}" for k in lags if f"nyt_sentiment_L{k}" in d.columns]
    if m_lags:
        test_str = " + ".join(m_lags) + " = 0"
        print("\nSum of NYT_mention lags (L1..L4) == 0:")
        print(model.t_test(test_str))
    if s_lags:
        test_str = " + ".join(s_lags) + " = 0"
        print("\nSum of nyt_sentiment lags (L1..L4) == 0:")
        print(model.t_test(test_str))

    return model

In [10]:
# Run models
m1 = run_twfe(panel, outcome="log1p_meme_volume", include_current=True)
m2 = run_twfe(panel, outcome="mean_meme_sentiment", include_current=True)
m3 = run_twfe(panel, outcome="log1p_meme_engagement", include_current=True)



=== log1p_meme_volume ===
N before drop: 7350  N after drop: 70
Formula: log1p_meme_volume ~ NYT_mention + NYT_mention_L1 + NYT_mention_L2 + NYT_mention_L3 + NYT_mention_L4 + nyt_sentiment + nyt_sentiment_L1 + nyt_sentiment_L2 + nyt_sentiment_L3 + nyt_sentiment_L4 + C(company) + C(week_fe)
                            OLS Regression Results                            
Dep. Variable:      log1p_meme_volume   R-squared:                       0.838
Model:                            OLS   Adj. R-squared:                 -0.399
Method:                 Least Squares   F-statistic:                 4.734e+13
Date:                Tue, 11 Nov 2025   Prob (F-statistic):           2.11e-14
Time:                        08:37:07   Log-Likelihood:                 9.7099
No. Observations:                  70   AIC:                             104.6
Df Residuals:                       8   BIC:                             244.0
Df Model:                          61                                       



In [11]:

# Collect key coefficients (current + L1..L4) for plotting lag effects later
def collect_key_coefs(model, outcome_name: str):
    if model is None:
        return []
    rows = []
    params = model.params
    ses = model.bse
    pvals = model.pvalues

    for term in params.index:
        # match NYT_mention, NYT_mention_L#, nyt_sentiment, nyt_sentiment_L#
        m = re.fullmatch(r"(NYT_mention)(?:_L(\d+))?", term)
        s = re.fullmatch(r"(nyt_sentiment)(?:_L(\d+))?", term)
        if m:
            predictor = "NYT_mention"
            lag = int(m.group(2)) if m.group(2) else 0
        elif s:
            predictor = "nyt_sentiment"
            lag = int(s.group(2)) if s.group(2) else 0
        else:
            continue
        rows.append({
            "outcome": outcome_name,
            "predictor": predictor,
            "lag": lag,
            "coef": float(params[term]),
            "se": float(ses[term]),
            "pval": float(pvals[term]),
        })
    return rows

In [12]:
all_rows = []
all_rows += collect_key_coefs(m1, "log1p_meme_volume")
all_rows += collect_key_coefs(m2, "mean_meme_sentiment")
all_rows += collect_key_coefs(m3, "log1p_meme_engagement")

coefs_df = pd.DataFrame(all_rows)
coefs_df = coefs_df.sort_values(["outcome", "predictor", "lag"]).reset_index(drop=True)

out_path = DERIVED / "twfe_key_coefficients.csv"
coefs_df.to_csv(out_path, index=False)
print(f"Saved key coefficients to: {out_path}")
print(coefs_df.head(12).to_string(index=False))


Saved key coefficients to: /Users/beszabo/bene/szakdolgozat/data/derived/twfe_key_coefficients.csv
              outcome     predictor  lag      coef       se         pval
log1p_meme_engagement   NYT_mention    0  0.015932 0.001019 4.133182e-55
log1p_meme_engagement   NYT_mention    1  0.020949 0.004953 2.336728e-05
log1p_meme_engagement   NYT_mention    2  0.005767 0.001474 9.119004e-05
log1p_meme_engagement   NYT_mention    3  0.006413 0.012318 6.026079e-01
log1p_meme_engagement   NYT_mention    4  0.000020 0.006152 9.974465e-01
log1p_meme_engagement nyt_sentiment    0  0.656594 0.262941 1.252064e-02
log1p_meme_engagement nyt_sentiment    1  0.566073 0.216980 9.084125e-03
log1p_meme_engagement nyt_sentiment    2  0.520279 0.278673 6.190347e-02
log1p_meme_engagement nyt_sentiment    3 -0.466572 0.057098 3.047728e-16
log1p_meme_engagement nyt_sentiment    4  0.295125 0.451189 5.130439e-01
    log1p_meme_volume   NYT_mention    0  0.034475 0.000238 0.000000e+00
    log1p_meme_volume   N

In [13]:
for name, m in [("volume", m1), ("sentiment", m2), ("engagement", m3)]:
    print(f"\n[{name}] NYT terms only")
    print(m.params[[k for k in m.params.index if "NYT_mention" in k or "nyt_sentiment" in k]])


[volume] NYT terms only
NYT_mention         0.034475
NYT_mention_L1      0.114512
NYT_mention_L2      0.089502
NYT_mention_L3      0.100009
NYT_mention_L4      0.093345
nyt_sentiment       1.128443
nyt_sentiment_L1   -0.192764
nyt_sentiment_L2    0.648876
nyt_sentiment_L3    0.871976
nyt_sentiment_L4    1.661367
dtype: float64

[sentiment] NYT terms only
NYT_mention        -0.007637
NYT_mention_L1     -0.008896
NYT_mention_L2      0.003899
NYT_mention_L3     -0.011365
NYT_mention_L4     -0.003684
nyt_sentiment      -0.435640
nyt_sentiment_L1   -0.055572
nyt_sentiment_L2    0.236739
nyt_sentiment_L3   -0.002091
nyt_sentiment_L4   -0.056436
dtype: float64

[engagement] NYT terms only
NYT_mention         0.015932
NYT_mention_L1      0.020949
NYT_mention_L2      0.005767
NYT_mention_L3      0.006413
NYT_mention_L4      0.000020
nyt_sentiment       0.656594
nyt_sentiment_L1    0.566073
nyt_sentiment_L2    0.520279
nyt_sentiment_L3   -0.466572
nyt_sentiment_L4    0.295125
dtype: float64


In [15]:
# Mention-only TWFE models (no sentiment terms), brand and week fixed effects remain

def run_twfe_mention_only(
    df: pd.DataFrame,
    outcome: str,
    include_current: bool = True,
    lags: list[int] = [1, 2, 3, 4],
):
    mention_vars = [f"NYT_mention_L{k}" for k in lags if f"NYT_mention_L{k}" in df.columns]
    if include_current and "NYT_mention" in df.columns:
        mention_vars = ["NYT_mention"] + mention_vars
    X = mention_vars

    needed = [outcome, "company", "week_fe"] + X
    d = df.dropna(subset=needed).copy()
    if d.empty:
        print(f"[WARN] No rows left after dropping NaNs for {outcome} (mention-only). Skipping.")
        return None

    formula = f"{outcome} ~ " + " + ".join(X + ["C(company)", "C(week_fe)"])
    print("\n=== Mention-only:", outcome, "===")
    print("N before drop:", len(df), " N after drop:", len(d))
    print("Formula:", formula)

    model = smf.ols(formula, data=d).fit(cov_type="cluster", cov_kwds={"groups": d["company"]})
    print(model.summary())
    
    # Cumulative lag test for mentions
    m_lags = [f"NYT_mention_L{k}" for k in lags if f"NYT_mention_L{k}" in d.columns]
    if m_lags:
        test_str = " + ".join(m_lags) + " = 0"
        print("\nSum of NYT_mention lags (L1..L4) == 0:")
        print(model.t_test(test_str))

    return model

def export_mention_only_coefficients(
    df: pd.DataFrame,
    outcomes=("log1p_meme_volume", "mean_meme_sentiment", "log1p_meme_engagement"),
    lag_sets=([1, 2, 3, 4], [1, 2, 3], [1, 2], [1]),
    include_current: bool = True,
):
    results = {}
    for lags in lag_sets:
        models = {
            outcomes[0]: run_twfe_mention_only(df, outcome=outcomes[0], include_current=include_current, lags=lags),
            outcomes[1]: run_twfe_mention_only(df, outcome=outcomes[1], include_current=include_current, lags=lags),
            outcomes[2]: run_twfe_mention_only(df, outcome=outcomes[2], include_current=include_current, lags=lags),
        }
        rows = []
        for outcome_name, model in models.items():
            rows += collect_key_coefs(model, outcome_name)
        coefs = pd.DataFrame(rows).sort_values(["outcome", "predictor", "lag"]).reset_index(drop=True)
        suffix = f"{len(lags)}_lags"
        out_path = DERIVED / f"twfe_key_coefficients_mention_only_{suffix}.csv"
        coefs.to_csv(out_path, index=False)
        print(f"Saved mention-only key coefficients to: {out_path}")
        print(coefs.head(12).to_string(index=False))
        results[suffix] = coefs
    return results

# Run and export 
_ = export_mention_only_coefficients(panel)



=== Mention-only: log1p_meme_volume ===
N before drop: 7350  N after drop: 7070
Formula: log1p_meme_volume ~ NYT_mention + NYT_mention_L1 + NYT_mention_L2 + NYT_mention_L3 + NYT_mention_L4 + C(company) + C(week_fe)
                            OLS Regression Results                            
Dep. Variable:      log1p_meme_volume   R-squared:                       0.625
Model:                            OLS   Adj. R-squared:                  0.616
Method:                 Least Squares   F-statistic:                 6.478e+09
Date:                Tue, 11 Nov 2025   Prob (F-statistic):          6.99e-317
Time:                        08:38:20   Log-Likelihood:                 1753.6
No. Observations:                7070   AIC:                            -3157.
Df Residuals:                    6895   BIC:                            -1956.
Df Model:                         174                                         
Covariance Type:              cluster                                    



                             Test for Constraints                             
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
c0             0.0078      0.001      5.238      0.000       0.005       0.011

=== Mention-only: mean_meme_sentiment ===
N before drop: 7350  N after drop: 498
Formula: mean_meme_sentiment ~ NYT_mention + NYT_mention_L1 + NYT_mention_L2 + NYT_mention_L3 + NYT_mention_L4 + C(company) + C(week_fe)
                             OLS Regression Results                            
Dep. Variable:     mean_meme_sentiment   R-squared:                       0.381
Model:                             OLS   Adj. R-squared:                  0.131
Method:                  Least Squares   F-statistic:                     358.7
Date:                 Tue, 11 Nov 2025   Prob (F-statistic):           1.59e-39
Time:                         08:38:20   Log-Likelihood:         



                            OLS Regression Results                            
Dep. Variable:      log1p_meme_volume   R-squared:                       0.623
Model:                            OLS   Adj. R-squared:                  0.614
Method:                 Least Squares   F-statistic:                -5.116e+09
Date:                Tue, 11 Nov 2025   Prob (F-statistic):               1.00
Time:                        08:38:21   Log-Likelihood:                 1758.2
No. Observations:                7140   AIC:                            -3166.
Df Residuals:                    6965   BIC:                            -1964.
Df Model:                         174                                         
Covariance Type:              cluster                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept     



                            OLS Regression Results                            
Dep. Variable:      log1p_meme_volume   R-squared:                       0.623
Model:                            OLS   Adj. R-squared:                  0.614
Method:                 Least Squares   F-statistic:                 2.176e+12
Date:                Tue, 11 Nov 2025   Prob (F-statistic):               0.00
Time:                        08:38:21   Log-Likelihood:                 1801.8
No. Observations:                7210   AIC:                            -3254.
Df Residuals:                    7035   BIC:                            -2049.
Df Model:                         174                                         
Covariance Type:              cluster                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept     

