In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import re
from pathlib import Path

# Input panel file and output directory for derived artifacts
PANEL_CSV = "/Users/beszabo/bene/szakdolgozat/data/derived/company_weekly_panel_analysis_ready.csv"
DERIVED = Path("/Users/beszabo/bene/szakdolgozat/data/derived")

# Read the panel (week_start parsed as datetime) and attach the week
# fixed-effect key, formatted as year plus %W week-of-year, in one chain.
panel = pd.read_csv(PANEL_CSV, parse_dates=["week_start"]).assign(
    week_fe=lambda frame: frame["week_start"].dt.strftime("%Y-%W")
)


In [2]:
# Helper to run TWFE OLS with clustered SEs

def run_twfe(
    df: pd.DataFrame,
    outcome: str,
    include_current: bool = True,
    lags: list[int] = [1, 2, 3, 4],
):
    """Fit an OLS of ``outcome`` on NYT mention/sentiment terms with company and week dummies.

    The regressors are ``NYT_mention_L{k}`` and ``nyt_sentiment_L{k}`` for every
    ``k`` in ``lags`` (plus the unlagged ``NYT_mention`` / ``nyt_sentiment`` when
    ``include_current`` is true), restricted to columns that actually exist in
    ``df``. ``C(company)`` and ``C(week_fe)`` are added as categorical terms and
    standard errors are clustered on ``company``.

    Before fitting, a diagnostic line is printed for each growing prefix of
    ``lags`` showing how many rows survive the NaN drop for that specification.

    Parameters
    ----------
    df : pd.DataFrame
        Panel containing ``outcome``, ``company``, ``week_fe`` and the NYT columns.
    outcome : str
        Name of the dependent-variable column.
    include_current : bool
        Whether to include the contemporaneous (unlagged) NYT terms.
    lags : list[int]
        Lag orders to include. The list is only read, never mutated.

    Returns
    -------
    The fitted statsmodels results object, or ``None`` when no rows remain after
    dropping NaNs. Cumulative lag tests can be run on the returned model, e.g.
    ``model.t_test("NYT_mention_L1 + NYT_mention_L2 = 0")``.
    """
    lag_list = list(lags)

    def _predictors(lag_subset):
        # Build the regressor list for a given set of lags, keeping only
        # columns present in df so a missing lag column cannot raise KeyError.
        mention = [f"NYT_mention_L{k}" for k in lag_subset if f"NYT_mention_L{k}" in df.columns]
        sentiment = [f"nyt_sentiment_L{k}" for k in lag_subset if f"nyt_sentiment_L{k}" in df.columns]
        if include_current:
            if "NYT_mention" in df.columns:
                mention = ["NYT_mention"] + mention
            if "nyt_sentiment" in df.columns:
                sentiment = ["nyt_sentiment"] + sentiment
        return mention + sentiment

    # Sample-size diagnostic over growing prefixes of the requested lags.
    # A separate loop variable is used on purpose: reusing the name `lags`
    # here would overwrite the caller's argument before the model is built.
    for stop in range(1, len(lag_list) + 1):
        lag_prefix = lag_list[:stop]
        subset = [outcome, "company", "week_fe"] + _predictors(lag_prefix)
        print(
            "rows left after dropping NaNs for", outcome,
            "with", lag_prefix, "lags:", df.dropna(subset=subset).shape[0],
        )

    X = _predictors(lag_list)

    needed = [outcome, "company", "week_fe"] + X
    d = df.dropna(subset=needed).copy()
    if d.empty:
        print(f"[WARN] No rows left after dropping NaNs for {outcome}. Skipping.")
        return None

    formula = f"{outcome} ~ " + " + ".join(X + ["C(company)", "C(week_fe)"])
    print("\n===", outcome, "===")
    print("N before drop:", len(df), " N after drop:", len(d))
    print("Formula:", formula)

    # Cluster the covariance on company, aligned with the estimation sample d.
    model = smf.ols(formula, data=d).fit(cov_type="cluster", cov_kwds={"groups": d["company"]})

    return model

In [3]:
# Fit the mention + sentiment specification once per meme outcome,
# keeping the contemporaneous NYT terms in each model.
m1, m2, m3 = [
    run_twfe(panel, outcome=target, include_current=True)
    for target in ("log1p_meme_volume", "mean_meme_sentiment", "log1p_meme_engagement")
]


rows left after dropping NaNs for log1p_meme_volume with [1] lags: 3126
rows left after dropping NaNs for log1p_meme_volume with [1, 2] lags: 2653
rows left after dropping NaNs for log1p_meme_volume with [1, 2, 3] lags: 2327
rows left after dropping NaNs for log1p_meme_volume with [1, 2, 3, 4] lags: 2092

=== log1p_meme_volume ===
N before drop: 8904  N after drop: 2092
Formula: log1p_meme_volume ~ NYT_mention + NYT_mention_L1 + NYT_mention_L2 + NYT_mention_L3 + NYT_mention_L4 + nyt_sentiment + nyt_sentiment_L1 + nyt_sentiment_L2 + nyt_sentiment_L3 + nyt_sentiment_L4 + C(company) + C(week_fe)
rows left after dropping NaNs for mean_meme_sentiment with [1] lags: 467
rows left after dropping NaNs for mean_meme_sentiment with [1, 2] lags: 436
rows left after dropping NaNs for mean_meme_sentiment with [1, 2, 3] lags: 409
rows left after dropping NaNs for mean_meme_sentiment with [1, 2, 3, 4] lags: 400

=== mean_meme_sentiment ===
N before drop: 8904  N after drop: 400
Formula: mean_meme_sen

In [4]:

# Collect key coefficients (current + L1..L4) for plotting lag effects later
def collect_key_coefs(model, outcome_name: str):
    """Extract the NYT mention/sentiment coefficients from a fitted model.

    Scans ``model.params`` for terms named exactly ``NYT_mention``,
    ``NYT_mention_L<n>``, ``nyt_sentiment`` or ``nyt_sentiment_L<n>`` and
    returns one dict per match with keys ``outcome``, ``predictor``, ``lag``
    (0 for the unlagged term), ``coef``, ``se`` and ``pval``. All other terms
    are ignored. Returns an empty list when ``model`` is ``None``.
    """
    if model is None:
        return []

    estimates, std_errors, p_values = model.params, model.bse, model.pvalues
    # Tried in order; group 1 is the predictor name, group 2 the lag digits (if any).
    term_patterns = (
        r"(NYT_mention)(?:_L(\d+))?",
        r"(nyt_sentiment)(?:_L(\d+))?",
    )

    collected = []
    for term in estimates.index:
        hit = next(
            (found for found in (re.fullmatch(pat, term) for pat in term_patterns) if found),
            None,
        )
        if hit is None:
            continue
        lag_digits = hit.group(2)
        collected.append({
            "outcome": outcome_name,
            "predictor": hit.group(1),
            "lag": int(lag_digits) if lag_digits else 0,
            "coef": float(estimates[term]),
            "se": float(std_errors[term]),
            "pval": float(p_values[term]),
        })
    return collected

In [5]:
# Gather the NYT coefficient rows of the three fitted models, in model order
all_rows = [
    row
    for fitted, label in (
        (m1, "log1p_meme_volume"),
        (m2, "mean_meme_sentiment"),
        (m3, "log1p_meme_engagement"),
    )
    for row in collect_key_coefs(fitted, label)
]

# Tabulate, order by outcome / predictor / lag, and renumber the rows
coefs_df = (
    pd.DataFrame(all_rows)
    .sort_values(["outcome", "predictor", "lag"])
    .reset_index(drop=True)
)

# Persist the table next to the other derived data and show a preview
out_path = DERIVED / "twfe_key_coefficients.csv"
coefs_df.to_csv(out_path, index=False)
print(f"Saved key coefficients to: {out_path}")
print(coefs_df.head(12).to_string(index=False))


Saved key coefficients to: /Users/beszabo/bene/szakdolgozat/data/derived/twfe_key_coefficients.csv
              outcome     predictor  lag      coef       se     pval
log1p_meme_engagement   NYT_mention    0 -0.002258 0.002255 0.316778
log1p_meme_engagement   NYT_mention    1  0.005281 0.002103 0.012021
log1p_meme_engagement   NYT_mention    2  0.002475 0.003235 0.444207
log1p_meme_engagement   NYT_mention    3 -0.003414 0.002035 0.093374
log1p_meme_engagement   NYT_mention    4 -0.001975 0.002222 0.374127
log1p_meme_engagement nyt_sentiment    0 -0.026857 0.038540 0.485898
log1p_meme_engagement nyt_sentiment    1  0.014677 0.033424 0.660580
log1p_meme_engagement nyt_sentiment    2 -0.006614 0.035041 0.850297
log1p_meme_engagement nyt_sentiment    3  0.026250 0.039026 0.501185
log1p_meme_engagement nyt_sentiment    4 -0.028737 0.031574 0.362734
    log1p_meme_volume   NYT_mention    0 -0.003908 0.002199 0.075478
    log1p_meme_volume   NYT_mention    1  0.004713 0.002001 0.018486


In [6]:
# Print only the NYT mention/sentiment coefficients of each fitted model
for name, m in [("volume", m1), ("sentiment", m2), ("engagement", m3)]:
    print(f"\n[{name}] NYT terms only")
    # run_twfe returns None when no rows survive the NaN drop; report that
    # instead of failing on `None.params`.
    if m is None:
        print("(model was not fitted)")
        continue
    print(m.params[[k for k in m.params.index if "NYT_mention" in k or "nyt_sentiment" in k]])


[volume] NYT terms only
NYT_mention        -0.003908
NYT_mention_L1      0.004713
NYT_mention_L2      0.000896
NYT_mention_L3     -0.003459
NYT_mention_L4     -0.002415
nyt_sentiment       0.004828
nyt_sentiment_L1   -0.007046
nyt_sentiment_L2   -0.025823
nyt_sentiment_L3    0.032394
nyt_sentiment_L4   -0.024793
dtype: float64

[sentiment] NYT terms only
NYT_mention        -0.000513
NYT_mention_L1     -0.003047
NYT_mention_L2      0.000138
NYT_mention_L3     -0.002559
NYT_mention_L4     -0.000438
nyt_sentiment      -0.001225
nyt_sentiment_L1   -0.042196
nyt_sentiment_L2    0.109374
nyt_sentiment_L3    0.036393
nyt_sentiment_L4   -0.045704
dtype: float64

[engagement] NYT terms only
NYT_mention        -0.002258
NYT_mention_L1      0.005281
NYT_mention_L2      0.002475
NYT_mention_L3     -0.003414
NYT_mention_L4     -0.001975
nyt_sentiment      -0.026857
nyt_sentiment_L1    0.014677
nyt_sentiment_L2   -0.006614
nyt_sentiment_L3    0.026250
nyt_sentiment_L4   -0.028737
dtype: float64


In [7]:
# Mention-only TWFE models (no sentiment terms), brand and week fixed effects remain

def run_twfe_mention_only(
    df: pd.DataFrame,
    outcome: str,
    include_current: bool = True,
    lags: list[int] = [1, 2, 3, 4],
):
    """Fit the mention-only specification: NYT mention terms, no sentiment terms.

    Regresses ``outcome`` on ``NYT_mention_L{k}`` for each ``k`` in ``lags``
    (plus unlagged ``NYT_mention`` when ``include_current`` is true), using only
    columns present in ``df``, together with ``C(company)`` and ``C(week_fe)``.
    Standard errors are clustered on ``company``. Prints the sample sizes, the
    formula, the model summary and a t-test that the lag coefficients sum to 0.

    Returns the fitted results object, or ``None`` (after printing a warning)
    when no rows remain once rows with NaNs in the required columns are dropped.
    """
    available = df.columns
    lag_terms = [term for term in (f"NYT_mention_L{k}" for k in lags) if term in available]
    regressors = list(lag_terms)
    if include_current and "NYT_mention" in available:
        regressors.insert(0, "NYT_mention")

    # Estimation sample: complete cases for outcome, FE keys and regressors
    sample = df.dropna(subset=[outcome, "company", "week_fe", *regressors]).copy()
    if sample.empty:
        print(f"[WARN] No rows left after dropping NaNs for {outcome} (mention-only). Skipping.")
        return None

    formula = f"{outcome} ~ " + " + ".join([*regressors, "C(company)", "C(week_fe)"])
    print("\n=== Mention-only:", outcome, "===")
    print("N before drop:", len(df), " N after drop:", len(sample))
    print("Formula:", formula)

    fitted = smf.ols(formula, data=sample).fit(
        cov_type="cluster",
        cov_kwds={"groups": sample["company"]},
    )
    print(fitted.summary())

    # Cumulative effect: test whether the lagged mention coefficients sum to zero
    if lag_terms:
        print("\nSum of NYT_mention lags (L1..L4) == 0:")
        print(fitted.t_test(" + ".join(lag_terms) + " = 0"))

    return fitted

def export_mention_only_coefficients(
    df: pd.DataFrame,
    outcomes=("log1p_meme_volume", "mean_meme_sentiment", "log1p_meme_engagement"),
    lag_sets=([1, 2, 3, 4], [1, 2, 3], [1, 2], [1]),
    include_current: bool = True,
):
    """Fit mention-only models for every lag set and outcome, and export the key coefficients.

    For each entry of ``lag_sets`` one model per outcome is fitted with
    ``run_twfe_mention_only``; the NYT coefficients are gathered with
    ``collect_key_coefs``, sorted by outcome / predictor / lag, and written to
    ``DERIVED / "twfe_key_coefficients_mention_only_<n>_lags.csv"`` where ``<n>``
    is the number of lags in the set.

    Parameters
    ----------
    df : pd.DataFrame
        Panel passed through to ``run_twfe_mention_only``.
    outcomes : sequence of str
        Dependent variables to model; any number of outcomes is supported.
    lag_sets : sequence of list[int]
        Lag specifications to run. Because files are keyed by the *length* of
        the lag set, two sets of equal length write to the same file; a warning
        is printed when that happens and the later set wins.
    include_current : bool
        Whether the unlagged ``NYT_mention`` term is included.

    Returns
    -------
    dict mapping ``"<n>_lags"`` to the exported coefficient DataFrame.
    """
    # Fixed schema so an empty result (every model skipped) still sorts and
    # exports cleanly instead of raising KeyError on missing sort columns.
    coef_columns = ["outcome", "predictor", "lag", "coef", "se", "pval"]

    results = {}
    for lags in lag_sets:
        rows = []
        # Iterate over whatever outcomes were passed rather than indexing
        # positions 0..2, which broke for any tuple not of length three.
        for outcome_name in outcomes:
            model = run_twfe_mention_only(
                df, outcome=outcome_name, include_current=include_current, lags=lags
            )
            rows += collect_key_coefs(model, outcome_name)

        coefs = (
            pd.DataFrame(rows, columns=coef_columns)
            .sort_values(["outcome", "predictor", "lag"])
            .reset_index(drop=True)
        )

        suffix = f"{len(lags)}_lags"
        out_path = DERIVED / f"twfe_key_coefficients_mention_only_{suffix}.csv"
        if suffix in results:
            print(f"[WARN] Lag set {list(lags)} has the same length as an earlier lag set; overwriting {out_path}")
        coefs.to_csv(out_path, index=False)
        print(f"Saved mention-only key coefficients to: {out_path}")
        print(coefs.head(12).to_string(index=False))
        results[suffix] = coefs
    return results

# Fit the mention-only models for the default outcomes and lag sets and write
# the coefficient CSVs; the returned dict of tables is deliberately discarded.
_ = export_mention_only_coefficients(panel)



=== Mention-only: log1p_meme_volume ===
N before drop: 8904  N after drop: 8568
Formula: log1p_meme_volume ~ NYT_mention + NYT_mention_L1 + NYT_mention_L2 + NYT_mention_L3 + NYT_mention_L4 + C(company) + C(week_fe)
                            OLS Regression Results                            
Dep. Variable:      log1p_meme_volume   R-squared:                       0.418
Model:                            OLS   Adj. R-squared:                  0.405
Method:                 Least Squares   F-statistic:                -1.073e+12
Date:                Wed, 12 Nov 2025   Prob (F-statistic):               1.00
Time:                        16:04:52   Log-Likelihood:                -1931.7
No. Observations:                8568   AIC:                             4243.
Df Residuals:                    8378   BIC:                             5584.
Df Model:                         189                                         
Covariance Type:              cluster                                    



                             OLS Regression Results                            
Dep. Variable:     mean_meme_sentiment   R-squared:                       0.263
Model:                             OLS   Adj. R-squared:                  0.101
Method:                  Least Squares   F-statistic:                 5.746e+11
Date:                 Wed, 12 Nov 2025   Prob (F-statistic):               0.00
Time:                         16:04:52   Log-Likelihood:                 566.23
No. Observations:                  971   AIC:                            -782.5
Df Residuals:                      796   BIC:                             71.25
Df Model:                          174                                         
Covariance Type:               cluster                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Inte



                             Test for Constraints                             
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
c0            -0.0076      0.002     -4.992      0.000      -0.011      -0.005
Saved mention-only key coefficients to: /Users/beszabo/bene/szakdolgozat/data/derived/twfe_key_coefficients_mention_only_4_lags.csv
              outcome   predictor  lag      coef       se     pval
log1p_meme_engagement NYT_mention    0 -0.003449 0.001607 0.031898
log1p_meme_engagement NYT_mention    1  0.001239 0.001508 0.411360
log1p_meme_engagement NYT_mention    2  0.000875 0.001898 0.644705
log1p_meme_engagement NYT_mention    3 -0.005575 0.001140 0.000001
log1p_meme_engagement NYT_mention    4 -0.004102 0.001596 0.010183
    log1p_meme_volume NYT_mention    0 -0.005674 0.001748 0.001168
    log1p_meme_volume NYT_mention    1  0.000282 0.001181 0.811288
    log1p_meme_v



                             OLS Regression Results                            
Dep. Variable:     mean_meme_sentiment   R-squared:                       0.262
Model:                             OLS   Adj. R-squared:                  0.101
Method:                  Least Squares   F-statistic:                     673.9
Date:                 Wed, 12 Nov 2025   Prob (F-statistic):           4.38e-78
Time:                         16:04:52   Log-Likelihood:                 564.13
No. Observations:                  973   AIC:                            -778.3
Df Residuals:                      798   BIC:                             75.82
Df Model:                          174                                         
Covariance Type:               cluster                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Inte



                              OLS Regression Results                             
Dep. Variable:     log1p_meme_engagement   R-squared:                       0.317
Model:                               OLS   Adj. R-squared:                  0.302
Method:                    Least Squares   F-statistic:                 1.443e+05
Date:                   Wed, 12 Nov 2025   Prob (F-statistic):          3.25e-191
Time:                           16:04:52   Log-Likelihood:                -1830.6
No. Observations:                   8652   AIC:                             4041.
Df Residuals:                       8462   BIC:                             5384.
Df Model:                            189                                         
Covariance Type:                 cluster                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------



                            OLS Regression Results                            
Dep. Variable:      log1p_meme_volume   R-squared:                       0.415
Model:                            OLS   Adj. R-squared:                  0.403
Method:                 Least Squares   F-statistic:                -7.109e+11
Date:                Wed, 12 Nov 2025   Prob (F-statistic):               1.00
Time:                        16:04:53   Log-Likelihood:                -1924.4
No. Observations:                8736   AIC:                             4229.
Df Residuals:                    8546   BIC:                             5573.
Df Model:                         189                                         
Covariance Type:              cluster                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept     



                              OLS Regression Results                             
Dep. Variable:     log1p_meme_engagement   R-squared:                       0.317
Model:                               OLS   Adj. R-squared:                  0.302
Method:                    Least Squares   F-statistic:                 4.215e+04
Date:                   Wed, 12 Nov 2025   Prob (F-statistic):          4.97e-169
Time:                           16:04:53   Log-Likelihood:                -1834.3
No. Observations:                   8736   AIC:                             4049.
Df Residuals:                       8546   BIC:                             5393.
Df Model:                            189                                         
Covariance Type:                 cluster                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------



                            OLS Regression Results                            
Dep. Variable:      log1p_meme_volume   R-squared:                       0.415
Model:                            OLS   Adj. R-squared:                  0.402
Method:                 Least Squares   F-statistic:                 5.385e+10
Date:                Wed, 12 Nov 2025   Prob (F-statistic):               0.00
Time:                        16:04:54   Log-Likelihood:                -1919.8
No. Observations:                8820   AIC:                             4220.
Df Residuals:                    8630   BIC:                             5566.
Df Model:                         189                                         
Covariance Type:              cluster                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept     



In [None]:
# Sample-size diagnostic: rows with complete data for each outcome and lag depth.
# Uses `panel` and an explicit outcome list so the cell runs on a fresh kernel
# (the earlier version referenced undefined names `outcome` and `df`).
for outcome in ("log1p_meme_volume", "mean_meme_sentiment", "log1p_meme_engagement"):
    print(outcome)
    for lags in ([1], [1, 2], [1, 2, 3], [1, 2, 3, 4]):
        need = [outcome, 'company', 'week_fe']             \
             + ['NYT_mention'] + [f'NYT_mention_L{k}' for k in lags] \
             + ['nyt_sentiment'] + [f'nyt_sentiment_L{k}' for k in lags]
        print(lags, panel.dropna(subset=need).shape[0])
