In [None]:
# Stage 4 (sentence-level): stance by year with VADER
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize

# ---------------- Paths ----------------
# Source workbook; it has to provide a Year column and an article-text column.
INPUT_XLSX = "articles_processed.xlsx"          # must contain Year + text
OUT_DIR = "outputs_wired_text"
os.makedirs(OUT_DIR, exist_ok=True)

# Every Stage 4 artefact (one workbook, three figures) is written into OUT_DIR.
OUT_XLSX = os.path.join(OUT_DIR, "stage4_sentence_level_stance_by_year.xlsx")
OUT_PNG_COUNTS = os.path.join(OUT_DIR, "stage4_sentence_level_counts.png")
OUT_PNG_SHARES = os.path.join(OUT_DIR, "stage4_sentence_level_shares.png")
OUT_PNG_STRONG = os.path.join(OUT_DIR, "stage4_sentence_level_strongshare_mean.png")

# ---------------- Load -----------------
df = pd.read_excel(INPUT_XLSX)

# Use the first candidate text column that is actually present.
TEXT_CANDIDATES = ["text_cleaned_final", "text"]
text_col = None
for candidate in TEXT_CANDIDATES:
    if candidate in df.columns:
        text_col = candidate
        break
if text_col is None:
    raise ValueError("No text column found. Need 'text_cleaned_final' or 'text' in articles_processed.xlsx.")

# Year is mandatory; unparsable values become <NA> and those rows are dropped.
if "Year" not in df.columns:
    raise ValueError("Input must contain a 'Year' column (run Stage 1 first).")
df["Year"] = pd.to_numeric(df["Year"], errors="coerce").astype("Int64")
df = df.dropna(subset=["Year"]).copy()

# ---------------- NLTK resources ----------
# Make sure the VADER lexicon and the sentence-tokenizer models are available,
# downloading whatever is missing. "punkt_tab" is requested in addition to
# "punkt": newer NLTK releases load the tokenizer from punkt_tab, so checking
# only for punkt can leave sent_tokenize()/word_tokenize() failing with a
# LookupError on a fresh install. nltk.download() reports a failed download
# instead of raising, so requesting both is harmless.
for pkg, path in [("vader_lexicon", "sentiment/vader_lexicon.zip"),
                  ("punkt", "tokenizers/punkt"),
                  ("punkt_tab", "tokenizers/punkt_tab")]:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(pkg)

# Shared analyzer instance used by sentence_level_scores().
sia = SentimentIntensityAnalyzer()

def clean_spaces(s: str) -> str:
    """Collapse every whitespace run in *s* to one space and trim both ends.

    The argument is passed through ``str()`` first, so non-string values are
    converted to their string form rather than rejected.
    """
    collapsed = re.sub(r"\s+", " ", str(s))
    return collapsed.strip()

def sentence_level_scores(text: str,
                          strong_thresh: float = 0.5) -> dict:
    """
    Return per-article aggregates derived from sentence polarity.

    Parameters
    - text: article text; None / NaN / pd.NA are treated as "no text"
    - strong_thresh: a sentence counts as strongly polar when
      |compound| >= strong_thresh

    Returns a dict with
    - sent_mean: length-weighted mean compound over sentences
      (weights are the sentences' token counts)
    - strong_share: share of strong polarity sentences (|compound|>=strong_thresh)
    - n_sent: #sentences used

    When there is no usable text, sent_mean and strong_share are NaN and
    n_sent is 0.
    """
    no_text = {"sent_mean": np.nan, "strong_share": np.nan, "n_sent": 0}

    # A missing value must not reach str(): str(None) / str(nan) would be
    # scored as the one-word sentences "None" / "nan".
    if text is None or text is pd.NA or (isinstance(text, float) and np.isnan(text)):
        return dict(no_text)

    txt = clean_spaces(text)
    if not txt:
        return dict(no_text)

    sents = [s.strip() for s in sent_tokenize(txt) if s.strip()]
    if not sents:
        return dict(no_text)

    comps = [sia.polarity_scores(s)["compound"] for s in sents]
    # token length as weight (min 1, so no sentence gets zero weight)
    lengths = [max(1, len(word_tokenize(s))) for s in sents]
    strong_flags = [1 if abs(comp) >= strong_thresh else 0 for comp in comps]

    weights = np.array(lengths, dtype=float)
    weights = weights / weights.sum()
    sent_mean = float(np.dot(np.array(comps, dtype=float), weights))
    strong_share = float(np.mean(strong_flags))
    return {"sent_mean": sent_mean, "strong_share": strong_share, "n_sent": len(sents)}

# ---------------- Compute per-article aggregates ----------------
# Missing text cells are mapped to "" instead of being cast with astype(str):
# the cast would turn NaN into the literal string "nan", which would then be
# scored as a one-sentence article rather than yielding NaN metrics / n_sent 0.
article_texts = ["" if pd.isna(t) else str(t) for t in df[text_col]]

# Build the metrics frame straight from the list of result dicts. Naming the
# columns keeps them present even when there are no articles, and n_sent
# stays an integer column.
agg_rows = pd.DataFrame(
    [sentence_level_scores(t) for t in article_texts],
    columns=["sent_mean", "strong_share", "n_sent"],
)
df = pd.concat([df.reset_index(drop=True), agg_rows.reset_index(drop=True)], axis=1)

# Article stance from sentence-level mean.
# The cut-offs below can be tuned; these are standard for VADER.
POS_T = 0.05
NEG_T = -0.05


def to_stance(x):
    """Map a mean compound score to "positive", "negative" or "neutral".

    Scores strictly above POS_T are positive, strictly below NEG_T negative.
    Everything else, including a missing score, is labelled neutral.
    """
    if not pd.isna(x):
        if x > POS_T:
            return "positive"
        if x < NEG_T:
            return "negative"
    return "neutral"

df["stance"] = df["sent_mean"].map(to_stance)

# ---------------- Yearly aggregation ----------------
# Article counts per (Year, stance), reshaped to one row per year and one
# column per stance; combinations that never occur are filled with 0.
year_stance_sizes = df.groupby(["Year", "stance"]).size().reset_index(name="count")
stance_counts = year_stance_sizes.pivot(index="Year", columns="stance", values="count")
stance_counts = stance_counts.fillna(0).reset_index()

# Shares per year: each stance column divided by the year's total. A total of
# 0 is replaced by 1 so the division never hits zero.
stance_shares = stance_counts.copy()
row_sum = stance_shares.drop(columns=["Year"]).sum(axis=1).replace(0, 1)
for col in stance_shares.columns:
    if col == "Year":
        continue
    stance_shares[col] = stance_shares[col] / row_sum

# Mean strong-polarity share by year (how opinionated the articles are).
yearly_strong_mean = df.groupby("Year")["strong_share"].mean()
strong_share_year = (
    yearly_strong_mean
      .rename("mean_strong_share")
      .reset_index()
      .sort_values("Year")
)

# ---------------- Save Excel ----------------
# One workbook with four sheets: per-article metrics, yearly stance counts,
# yearly stance shares and the yearly mean strong-sentence share.
# No engine is forced: pandas picks an installed xlsx writer itself, so the
# export also works when only openpyxl is available.
with pd.ExcelWriter(OUT_XLSX) as writer:
    df[["Year","sent_mean","strong_share","n_sent","stance"]].to_excel(writer, sheet_name="per_article", index=False)
    stance_counts.to_excel(writer, sheet_name="counts", index=False)
    stance_shares.to_excel(writer, sheet_name="shares", index=False)
    strong_share_year.to_excel(writer, sheet_name="strong_share_year", index=False)

print("Stage 4 (sentence-level) done ->", OUT_XLSX)

# ---------------- Plots: counts and shares ----------------
# Both stance figures share one layout: a line per stance column over Year.
stance_plots = [
    (stance_counts, "AI article stance by year (sentence-level, counts)",
     "Number of articles", OUT_PNG_COUNTS),
    (stance_shares, "AI article stance by year (sentence-level, shares)",
     "Share of articles", OUT_PNG_SHARES),
]
for table, title, y_label, png_path in stance_plots:
    plt.figure(figsize=(12, 6))
    for col in table.columns:
        if col == "Year":
            continue
        plt.plot(table["Year"], table[col], marker="o", label=col)
    plt.title(title)
    plt.xlabel("Year")
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(png_path, dpi=300)
    plt.close()

# ---------------- Plot: opinion strength (mean strong_share) ----------------
plt.figure(figsize=(12, 5))
plt.plot(strong_share_year["Year"], strong_share_year["mean_strong_share"], marker="o")
plt.title("Mean share of strong-polarity sentences by year (|compound| ≥ 0.5)")
plt.xlabel("Year")
plt.ylabel("Mean strong sentence share")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(OUT_PNG_STRONG, dpi=300)
plt.close()

# Report where each figure was written.
print("PNGs saved ->")
for caption, png_path in (("  Counts:", OUT_PNG_COUNTS),
                          ("  Shares:", OUT_PNG_SHARES),
                          ("  Strong-share:", OUT_PNG_STRONG)):
    print(caption, png_path)


In [None]:
# Sentiment Sensitivity Analysis
"""
Sentiment Sensitivity Check (dual-rule)
--------------------------------------
Purpose:
- Given article-level sentiment metrics (sent_mean, strong_share),
  sweep neutral-band & strong-max parameters and report label shares
  and change rate vs a baseline. Optionally recompute metrics from text.

Inputs (choose one):
A) Preferred: Excel with an "article_level" sheet (from a prior run)
   containing columns: Year, sent_mean, strong_share
B) If A is not available: provide a text column and Year, and this script
   will recompute sent_mean & strong_share using VADER sentence scoring.

Outputs (in OUT_DIR):
- overall_label_shares_by_grid.xlsx / .csv
- label_change_rate_vs_baseline.xlsx / .csv
- shares_by_year_all_grids.xlsx  (one sheet per parameter combo for year-by-year robustness checks)
- heatmap_overall_neutral_share.png
- heatmap_changed_vs_baseline.png
"""

# --- Requirements: pandas, numpy, matplotlib, openpyxl (Excel), nltk (vader_lexicon, punkt) ---

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ====================== CONFIG ======================
# Where results go; the directory is created on first use.
OUT_DIR = "sentiment_sensitivity_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# CSV files are always written; Excel copies are written only when this is True.
SAVE_EXCEL = True

# Preferred input: the article-level sheet exported by an earlier run, which
# is expected to carry the sent_mean / strong_share columns.
INPUT_XLSX = "outputs_wired_text_dualmetric/stance_dual_by_year.xlsx"
ARTICLE_SHEET_NAME = "article_level"   # change to your sheet name

# Fallback input: when the metrics have to be recomputed from text, these
# column names are tried in order (matching is case-insensitive).
TEXT_CANDIDATES = ["text_cleaned_final", "text", "content", "article_text", "body"]
YEAR_CANDIDATES = ["Year", "year", "pub_year", "year_pub"]

# Sentence-level cut-off for "strong polarity" (VADER convention).
STRONG_SENT_THRESH = 0.5

# Parameter grids swept by the sensitivity analysis (modifiable).
NEUTRAL_BAND_GRID = [0.10, 0.15, 0.20]
STRONG_MAX_GRID = [0.15, 0.20, 0.25]

# Baseline combination against which label changes are measured.
BASE_NEUTRAL_BAND = 0.15
BASE_STRONG_MAX = 0.20
# ===================================================


def pick_col(df, candidates, must=True, desc=""):
    """Return the first column of *df* matching one of *candidates*.

    Matching ignores case, and surrounding whitespace in the column labels;
    the original column label is returned. Candidates are tried in order.

    When nothing matches, a KeyError is raised if *must* is true (with *desc*
    naming what was being looked for); otherwise None is returned.
    """
    normalized = {}
    for col in df.columns:
        normalized[str(col).strip().lower()] = col

    for wanted in candidates:
        key = wanted.lower()
        if key in normalized:
            return normalized[key]

    if not must:
        return None
    raise KeyError(
        f"Missing required column for {desc}. "
        f"Tried {candidates}. Available: {list(df.columns)}"
    )


def try_load_article_level(path, sheet):
    """Load previously exported article-level metrics, or return None.

    None is returned when *path* does not exist, when the sheet cannot be
    read, or when the sheet lacks the sent_mean / strong_share columns.
    Otherwise the year column is located via YEAR_CANDIDATES, coerced to
    numeric, rows without a usable year are dropped, and a frame with exactly
    the columns Year, sent_mean and strong_share is returned.
    """
    if not os.path.exists(path):
        return None

    # Any read failure just means "not available"; the caller then falls back
    # to recomputing the metrics.
    try:
        sheet_df = pd.read_excel(path, sheet_name=sheet)
    except Exception:
        return None

    available = {str(c) for c in sheet_df.columns}
    if not ({"sent_mean", "strong_share"} <= available):
        return None

    # Normalize Year
    year_col = pick_col(sheet_df, YEAR_CANDIDATES, must=True, desc="Year")
    sheet_df[year_col] = pd.to_numeric(sheet_df[year_col], errors="coerce")
    sheet_df = sheet_df.dropna(subset=[year_col]).copy()
    sheet_df = sheet_df.rename(columns={year_col: "Year"})
    return sheet_df[["Year", "sent_mean", "strong_share"]].copy()


def recompute_from_text(path, sheet_or_none=None):
    """Recompute article-level sent_mean / strong_share from text with VADER.

    Used when the article-level sheet lacks those columns.

    Parameters:
        path: Excel file to read.
        sheet_or_none: sheet name to read; None reads the default sheet.

    Returns:
        DataFrame with the columns Year, sent_mean and strong_share, one row
        per article that has a numeric year. sent_mean is the plain
        (unweighted) mean of the sentence compound scores; strong_share is the
        fraction of sentences with |compound| >= STRONG_SENT_THRESH. Articles
        with missing or empty text get NaN for both metrics.

    Raises:
        KeyError: if no text or year column can be found (via pick_col).
    """
    import nltk
    from nltk.tokenize import sent_tokenize
    from nltk.sentiment import SentimentIntensityAnalyzer

    # "punkt_tab" is ensured as well as "punkt": newer NLTK releases load the
    # sentence tokenizer from punkt_tab, so checking only for punkt can leave
    # sent_tokenize() failing with a LookupError on a fresh install.
    for res, location in [("vader_lexicon", "sentiment/vader_lexicon"),
                          ("punkt", "tokenizers/punkt"),
                          ("punkt_tab", "tokenizers/punkt_tab")]:
        try:
            nltk.data.find(location)
        except LookupError:
            nltk.download(res)

    if sheet_or_none is None:
        df0 = pd.read_excel(path)
    else:
        df0 = pd.read_excel(path, sheet_name=sheet_or_none)

    text_col = pick_col(df0, TEXT_CANDIDATES, must=True, desc="text")
    year_col = pick_col(df0, YEAR_CANDIDATES, must=True, desc="year")

    # Keep only rows whose year parses as a number.
    df0[year_col] = pd.to_numeric(df0[year_col], errors="coerce")
    df0 = df0.dropna(subset=[year_col]).copy()
    df0.rename(columns={year_col: "Year"}, inplace=True)

    sia = SentimentIntensityAnalyzer()

    def sent_metrics(txt):
        # Per-article metrics; NaN for both when there is nothing to score.
        no_score = pd.Series({"sent_mean": np.nan, "strong_share": np.nan})
        # A missing cell must not be stringified: str(nan) would be scored
        # as the one-word sentence "nan".
        if pd.isna(txt):
            return no_score
        t = str(txt).strip()
        if not t:
            return no_score
        sents = [s.strip() for s in sent_tokenize(t) if s.strip()]
        if not sents:
            return no_score
        scores = [sia.polarity_scores(s)["compound"] for s in sents]
        sent_mean = float(np.mean(scores))
        strong_share = float(np.mean([abs(v) >= STRONG_SENT_THRESH for v in scores]))
        return pd.Series({"sent_mean": sent_mean, "strong_share": strong_share})

    # Apply to the raw column (no astype(str)) so missing cells stay detectable.
    agg = df0[text_col].apply(sent_metrics)
    df = pd.concat([df0[["Year"]].reset_index(drop=True), agg.reset_index(drop=True)], axis=1)
    return df


def dual_label(m, s, neutral_band, strong_max):
    """Label one article from its mean score *m* and strong-sentence share *s*.

    Returns "unknown" when either metric is NaN. Returns "neutral" when *m*
    lies within [-neutral_band, neutral_band] and *s* is below *strong_max*.
    Otherwise the sign of *m* decides: "positive" for m > 0, "negative" for
    anything else (including m == 0).
    """
    for metric in (m, s):
        if np.isnan(metric):
            return "unknown"

    inside_band = (m >= -neutral_band) and (m <= neutral_band)
    few_strong_sentences = s < strong_max
    if inside_band and few_strong_sentences:
        return "neutral"

    if m > 0:
        return "positive"
    return "negative"


def compute_tables(df, neutral_grid, strong_grid, base_nb, base_sm):
    """Sweep the (neutral_band, strong_max) grid and summarize the labels.

    *df* must provide the columns Year, sent_mean and strong_share.

    Returns a 5-tuple:
      - overall_table: one row per parameter combo with the percentage of
        positive / neutral / negative / unknown labels (rounded to 2 decimals)
      - change_table: one row per combo with the percentage of articles whose
        label differs from the (base_nb, base_sm) baseline
      - by_year_tables: dict (nb, sm) -> DataFrame of label shares (%) per year
      - neutral_share_map: array [neutral_grid index, strong_grid index] of the
        unrounded neutral share (%)
      - change_rate_map: array of the same shape with the unrounded change rate (%)
    """
    label_order = ["positive", "neutral", "negative", "unknown"]

    def label_all(nb, sm):
        # Apply the dual rule to every article for one parameter combination.
        return df.apply(
            lambda r: dual_label(r["sent_mean"], r["strong_share"], nb, sm),
            axis=1,
        )

    # Reference labelling used for the change-rate comparison.
    base_labels = label_all(base_nb, base_sm)

    overall_rows, change_rows = [], []
    by_year_tables = {}  # (nb, sm) -> DataFrame(year x label shares%)

    # Matrices backing the heatmaps: rows follow neutral_grid, columns strong_grid.
    grid_shape = (len(neutral_grid), len(strong_grid))
    neutral_share_map = np.zeros(grid_shape)
    change_rate_map = np.zeros(grid_shape)

    for i, nb in enumerate(neutral_grid):
        for j, sm in enumerate(strong_grid):
            labels = label_all(nb, sm)

            # Overall label shares in percent; labels that never occur count as 0.
            share = labels.value_counts(normalize=True) * 100
            share = share.reindex(label_order).fillna(0)
            overall_rows.append({
                "neutral_band": nb,
                "strong_max": sm,
                "positive_%": round(share["positive"], 2),
                "neutral_%": round(share["neutral"], 2),
                "negative_%": round(share["negative"], 2),
                "unknown_%": round(share["unknown"], 2),
            })

            # Percentage of articles labelled differently from the baseline.
            differs = labels != base_labels
            change_rate = float(differs.mean()) * 100.0
            change_rows.append({
                "neutral_band": nb,
                "strong_max": sm,
                "changed_vs_baseline_%": round(change_rate, 2)
            })

            # Year x label counts, then row-normalized to percentages.
            per_article = pd.DataFrame({"Year": df["Year"].values, "label": labels.values})
            sizes = per_article.groupby(["Year", "label"]).size().reset_index(name="count")
            by_year = sizes.pivot(index="Year", columns="label", values="count")
            by_year = by_year.fillna(0).sort_index()
            year_totals = by_year.sum(axis=1).replace(0, 1)
            by_year_shares = by_year.div(year_totals, axis=0) * 100
            by_year_tables[(nb, sm)] = by_year_shares.reindex(columns=label_order).fillna(0)

            neutral_share_map[i, j] = share["neutral"]
            change_rate_map[i, j] = change_rate

    sort_keys = ["neutral_band", "strong_max"]
    overall_table = pd.DataFrame(overall_rows).sort_values(sort_keys).reset_index(drop=True)
    change_table = pd.DataFrame(change_rows).sort_values(sort_keys).reset_index(drop=True)
    return overall_table, change_table, by_year_tables, neutral_share_map, change_rate_map


def plot_heatmap(matrix, x_labels, y_labels, title, out_png):
    """Draw *matrix* as a heatmap with a colorbar and save it to *out_png*.

    *x_labels* label the columns (axis titled STRONG_MAX) and *y_labels* the
    rows (axis titled NEUTRAL_BAND). The figure is closed after saving.
    """
    fig, ax = plt.subplots(figsize=(6, 4.5))
    image = ax.imshow(matrix, aspect="auto")
    fig.colorbar(image, ax=ax, label="value")

    # One tick per grid value, labelled with the value itself.
    ax.set_xticks(range(len(x_labels)))
    ax.set_xticklabels(x_labels)
    ax.set_yticks(range(len(y_labels)))
    ax.set_yticklabels(y_labels)

    ax.set_xlabel("STRONG_MAX")
    ax.set_ylabel("NEUTRAL_BAND")
    ax.set_title(title)

    fig.tight_layout()
    fig.savefig(out_png, dpi=300)
    plt.close(fig)


def main():
    """Run the sensitivity sweep and write tables, heatmaps and a console preview."""

    def out_path(filename):
        # Every output lands in OUT_DIR.
        return os.path.join(OUT_DIR, filename)

    # 1) Prefer the exported article-level metrics; 2) otherwise recompute
    #    them from text.
    df = try_load_article_level(INPUT_XLSX, ARTICLE_SHEET_NAME)
    if df is None:
        df = recompute_from_text(INPUT_XLSX, ARTICLE_SHEET_NAME)

    # Clean Year: numeric only, rows without a year are dropped.
    df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
    df = df.dropna(subset=["Year"]).copy()

    # 3) Compute sensitivity tables
    shares, changes, by_year, neutral_map, change_map = compute_tables(
        df, NEUTRAL_BAND_GRID, STRONG_MAX_GRID, BASE_NEUTRAL_BAND, BASE_STRONG_MAX
    )

    # 4) Save tables (CSV always, Excel on request)
    shares.to_csv(out_path("overall_label_shares_by_grid.csv"), index=False)
    changes.to_csv(out_path("label_change_rate_vs_baseline.csv"), index=False)

    if SAVE_EXCEL:
        single_sheet_books = [
            (shares, "overall_label_shares_by_grid.xlsx", "overall_shares"),
            (changes, "label_change_rate_vs_baseline.xlsx", "changes_vs_baseline"),
        ]
        for table, filename, sheet_name in single_sheet_books:
            with pd.ExcelWriter(out_path(filename)) as w:
                table.to_excel(w, sheet_name=sheet_name, index=False)

        # One "by year" sheet per parameter combo; sheet names are cut to 31
        # characters.
        with pd.ExcelWriter(out_path("shares_by_year_all_grids.xlsx")) as w:
            for (nb, sm), tab in by_year.items():
                nb_tag = str(nb).replace('.', '_')
                sm_tag = str(sm).replace('.', '_')
                sheet = f"nb_{nb_tag}_sm_{sm_tag}"
                tab.reset_index().to_excel(w, sheet_name=sheet[:31], index=False)

    # 5) Heatmaps (overview)
    heatmaps = [
        (neutral_map, "Neutral share (%) by parameter grid",
         "heatmap_overall_neutral_share.png"),
        (change_map, "Label change rate (%) vs baseline",
         "heatmap_changed_vs_baseline.png"),
    ]
    for matrix, title, filename in heatmaps:
        plot_heatmap(matrix, STRONG_MAX_GRID, NEUTRAL_BAND_GRID, title, out_path(filename))

    # 6) Console preview
    print("\n=== Overall shares by parameter grid ===")
    print(shares.to_string(index=False))
    print("\n=== Change rate vs baseline (%) ===")
    print(changes.to_string(index=False))
    print(f"\nOutputs saved in: {OUT_DIR}")


# Run the analysis only when executed as a script, not on import.
if __name__ == "__main__":
    main()
