# Framing Analysis for Debates and Media Outlets

## 1. Introduction and Config

In [1]:
# === LIBRARIES ===

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import tqdm

In [4]:
# === PATHS & CONSTANTS ===

REPO_DIR   = Path(".").resolve().parents[0]
DATA_DIR   = REPO_DIR / "data"

DEBATES_FILE = DATA_DIR / "debates_df_themes.csv"
MEDIA_FILE  = DATA_DIR / "important" / "media_chunks_pred_balanced.csv"

# confirm setup
print("Repository Path:", REPO_DIR)
print("Data Directory:", DATA_DIR)
print("Debates Dataset:", DEBATES_FILE)
print("Media Dataset:", MEDIA_FILE)

# reproducibility
RANDOM_SEED = 42

Repository Path: /Users/emmamora/Documents/GitHub/thesis
Data Directory: /Users/emmamora/Documents/GitHub/thesis/data
Debates Dataset: /Users/emmamora/Documents/GitHub/thesis/data/debates_df_themes.csv
Media Dataset: /Users/emmamora/Documents/GitHub/thesis/data/important/media_chunks_pred_balanced.csv


## 2. Load Data

In [5]:
# === DEBATES DATASET (whole data) ===

# load debates dataset and inspect schema
df_debates = pd.read_csv(DEBATES_FILE)

print(f"[INFO] Debates dataset (full) loaded: {len(df_debates):,} rows")
print(df_debates.head(3))
print("\n[INFO] Columns:", list(df_debates.columns))

[INFO] Debates dataset (full) loaded: 6,316 rows
                                                text speaker_normalized  \
0  good evening. the television and radio station...          Moderator   
1  mr. smith, mr. nixon. in the election of 1860,...        Candidate_D   
2  mr. smith, senator kennedy. the things that se...        Candidate_R   

     speaker       party   winner winner_party  year   debate_type  \
0  Moderator         NaN  Kennedy     Democrat  1960  presidential   
1    Kennedy    Democrat  Kennedy     Democrat  1960  presidential   
2      Nixon  Republican  Kennedy     Democrat  1960  presidential   

                           debate_id                           utterance_id  \
0  1960_1_Presidential_Nixon_Kennedy  1960_1_Presidential_Nixon_Kennedy_001   
1  1960_1_Presidential_Nixon_Kennedy  1960_1_Presidential_Nixon_Kennedy_002   
2  1960_1_Presidential_Nixon_Kennedy  1960_1_Presidential_Nixon_Kennedy_004   

                                     lemmatized_text

In [6]:
# === MEDIA DATASET (balanced per outlet) ===

# load media dataset and inspect schema
df_media = pd.read_csv(MEDIA_FILE)

print(f"[INFO] Media dataset (balanced) loaded: {len(df_media):,} rows")
print(df_media.head(3))
print("\n[INFO] Columns:", list(df_media.columns))

[INFO] Media dataset (balanced) loaded: 675 rows
   __media_id  year outlet                      source_theme  chunk_index  \
0         440  2016    nyp          healthcare_public_health            2   
1         444  2016    nyp          healthcare_public_health            1   
2         278  2024    nyp  foreign_policy_national_security            7   

                                          chunk_text  \
0  stone said. jonathan gruber, the mit professor...   
1  "we're thinking of having him in the spin room...   
2  iowa caucuses. trump was right when he accused...   

                   pred_theme  pred_sim                second_theme  \
0  healthcare_social_security  0.724106     public_health_pandemics   
1  healthcare_social_security  0.717676     public_health_pandemics   
2     judiciary_supreme_court  0.747351  healthcare_social_security   

   second_sim    margin                                      chunk_preview  \
0    0.448441  0.275665  stone said. jonathan gruber, 

## 3. Frame Taxonomy & Model Setup

In [7]:
# === FRAME TAXONOMY (CANONICAL LABELS + DEFINITIONS) ===
# comments: use short noun-phrase labels; keep a single hypothesis template for consistency

FRAME_LABELS = [
    "economic consequences",
    "legality and constitutionality",
    "security and safety",
    "morality and ethics",
    "fairness and social justice",
]

# definitions for documentation / exports (not fed to the model)
FRAME_DEFINITIONS = {
    "economic consequences": "costs/benefits, jobs, taxes, budgets, growth, trade, material prosperity.",
    "legality and constitutionality": "law, rights, courts, the Constitution, legality/illegality of actions.",
    "security and safety": "national security, defense, crime, policing, terrorism, public health safety.",
    "morality and ethics": "religious/ethical values, right vs wrong, virtue/vice, cultural norms.",
    "fairness and social justice": "equality, discrimination, civil rights, representation, group justice.",
}

# single, consistent hypothesis template (MNLI-style)
HYPOTHESIS_TEMPLATE = "This text frames the issue in terms of {}."

print("[INFO] Frames:", FRAME_LABELS)
print("[INFO] Hypothesis template:", HYPOTHESIS_TEMPLATE)

[INFO] Frames: ['economic consequences', 'legality and constitutionality', 'security and safety', 'morality and ethics', 'fairness and social justice']
[INFO] Hypothesis template: This text frames the issue in terms of {}.


In [9]:
# === DOMAIN-TUNED VERBALIZERS (US POLITICS CONTEXT) ===
# comments: add high-signal synonyms per frame, tailored for debates/media

FRAME_VERBALIZERS = {
    "economic consequences": [
        "economic consequences", "jobs and growth", "inflation", "taxation", 
        "deficits and budgets", "trade deals", "manufacturing", "economic prosperity"
    ],
    "legality and constitutionality": [
        "legality and constitutionality", "Supreme Court", "constitutional rights", 
        "rule of law", "legal compliance", "due process", "2nd Amendment", "judicial oversight"
    ],
    "security and safety": [
        "security and safety", "national security", "border security", 
        "law and order", "crime and policing", "terrorism threats", "military defense", "public health safety"
    ],
    "morality and ethics": [
        "morality and ethics", "religious and moral values", "faith", "family values", 
        "moral duty", "virtue and vice", "pro-life and pro-choice", "ethical principles"
    ],
    "fairness and social justice": [
        "fairness and social justice", "civil rights", "racial justice", "gender equality", 
        "equity and inclusion", "discrimination", "minority justice", "equal opportunity"
    ],
}

VERBALIZER_CANDIDATES = [v for vs in FRAME_VERBALIZERS.values() for v in vs]
print(f"[INFO] Updated verbalizer candidates: {len(VERBALIZER_CANDIDATES)} strings across {len(FRAME_VERBALIZERS)} frames")

[INFO] Updated verbalizer candidates: 40 strings across 5 frames


In [10]:
# === MODEL SETUP ===
# bart-large-mnli is fast; deberta-v3-large-zeroshot-v2 is stronger but slower

import torch
from transformers import pipeline

MODEL_NAME = "facebook/bart-large-mnli"  # alt: "MoritzLaurer/deberta-v3-large-zeroshot-v2"

classifier = pipeline(
    task="zero-shot-classification",
    model=MODEL_NAME,
    device=0 if torch.cuda.is_available() else -1,
)

print(f"[INFO] Zero-shot classifier loaded: {MODEL_NAME}")

  from .autonotebook import tqdm as notebook_tqdm


[INFO] Zero-shot classifier loaded: facebook/bart-large-mnli


In [11]:
# === SIMPLE CLASSIFICATION (CANONICAL LABELS) ===
# single pass using canonical labels + one hypothesis template for consistency

from typing import List, Dict
import pandas as pd
from tqdm.auto import tqdm

def zeroshot_classify_simple(
    texts: List[str],
    labels: List[str] = FRAME_LABELS,
    hypothesis_template: str = HYPOTHESIS_TEMPLATE,
    batch_size: int = 32,
) -> pd.DataFrame:
    rows = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Classifying (simple)"):
        batch = texts[i:i+batch_size]
        res = classifier(
            batch,
            candidate_labels=labels,
            hypothesis_template=hypothesis_template,
            multi_label=False,
        )
        if isinstance(res, dict):  # edge case: single text
            res = [res]
        for r in res:
            # sort by score desc
            pairs = list(zip(r["labels"], r["scores"]))
            pairs.sort(key=lambda x: x[1], reverse=True)
            best, best_score = pairs[0]
            second, second_score = pairs[1]
            rows.append({
                "best_frame": best,
                "best_score": float(best_score),
                "second_frame": second,
                "second_score": float(second_score),
            })
    return pd.DataFrame(rows)

In [12]:
# === VERBALIZER-AGGREGATED CLASSIFICATION (ROBUST) ===
# one pass over all verbalizer strings; aggregate per-frame by max or mean

from collections import defaultdict

def _aggregate_scores_by_frame(
    labels: List[str],
    scores: List[float],
    frame_verbalizers: Dict[str, List[str]],
    agg: str = "max",  # or "mean"
) -> Dict[str, float]:
    # map label->score
    score_map = {lab: sc for lab, sc in zip(labels, scores)}
    frame_scores = {}
    for frame, vers in frame_verbalizers.items():
        vals = [score_map.get(v, 0.0) for v in vers]
        if agg == "mean":
            frame_scores[frame] = float(sum(vals) / max(len(vals), 1))
        else:  # max
            frame_scores[frame] = float(max(vals) if vals else 0.0)
    return frame_scores

def zeroshot_classify_verbalizers(
    texts: List[str],
    frame_verbalizers: Dict[str, List[str]] = FRAME_VERBALIZERS,
    hypothesis_template: str = HYPOTHESIS_TEMPLATE,
    batch_size: int = 32,
    agg: str = "max",
) -> pd.DataFrame:
    candidates = [v for vs in frame_verbalizers.values() for v in vs]
    rows = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Classifying (verbalizers)"):
        batch = texts[i:i+batch_size]
        res = classifier(
            batch,
            candidate_labels=candidates,
            hypothesis_template=hypothesis_template,
            multi_label=False,
        )
        if isinstance(res, dict):
            res = [res]
        for r in res:
            frame_scores = _aggregate_scores_by_frame(r["labels"], r["scores"], frame_verbalizers, agg=agg)
            # rank frames
            ranked = sorted(frame_scores.items(), key=lambda x: x[1], reverse=True)
            (best, best_score), (second, second_score) = ranked[0], ranked[1]
            rows.append({
                "best_frame": best,
                "best_score": float(best_score),
                "second_frame": second,
                "second_score": float(second_score),
            })
    return pd.DataFrame(rows)

## 4. Apply to Datasets

In [13]:
# === DRY RUN SAMPLE ===
# comments: test pipeline on a small random sample to check outputs and distribution

# sample a few debate utterances and media chunks
sample_debates = df_debates["text"].dropna().sample(30, random_state=RANDOM_SEED).tolist()
sample_media = df_media["chunk_text"].dropna().sample(15, random_state=RANDOM_SEED).tolist()

# run classification on sample
sample_results = zeroshot_classify_verbalizers(sample_debates[:15])
print("[INFO] Sample debate frames:\n", sample_results["best_frame"].value_counts(), "\n")

media_results = zeroshot_classify_verbalizers(sample_media[:10])
print("[INFO] Sample media frames:\n", media_results["best_frame"].value_counts())

Classifying (verbalizers): 100%|██████████| 1/1 [04:59<00:00, 299.62s/it]


[INFO] Sample debate frames:
 best_frame
morality and ethics            7
economic consequences          4
security and safety            2
fairness and social justice    2
Name: count, dtype: int64 



Classifying (verbalizers): 100%|██████████| 1/1 [04:10<00:00, 250.50s/it]

[INFO] Sample media frames:
 best_frame
economic consequences    4
morality and ethics      4
security and safety      2
Name: count, dtype: int64





In [14]:
print("CUDA available:", torch.cuda.is_available())
print("MPS available:", torch.backends.mps.is_available())
print("Using device:", classifier.device)

CUDA available: False
MPS available: True
Using device: cpu


In [16]:
classification_device = "GPU" if torch.cuda.is_available() else "MPS" if torch.backends.mps.is_available() else "CPU"
print(f"[INFO] Classification will run on {classification_device}")

[INFO] Classification will run on MPS


### 4.1. Debates

In [17]:
# === CLASSIFY FULL DEBATES DATASET (WITH LIVE DISTRIBUTIONS) ===
# classify debates in chunks, save incrementally, and print frame counts per batch

FRAMES_DIR = DATA_DIR / "frames"
FRAMES_DIR.mkdir(parents=True, exist_ok=True)

debate_texts = df_debates["text"].dropna().tolist()

all_batches = []
BATCH_SIZE = 32
SAVE_CHUNK = 1000  # utterances per partial save

for i in tqdm(range(0, len(debate_texts), SAVE_CHUNK), desc="Debates (full run)"):
    batch_texts = debate_texts[i:i+SAVE_CHUNK]
    
    # classify this chunk
    batch_frames = zeroshot_classify_verbalizers(batch_texts, batch_size=BATCH_SIZE)
    
    # print distribution of predicted frames for this batch
    print(f"\n[INFO] Batch {i//SAVE_CHUNK} ({len(batch_frames)} utterances)")
    print(batch_frames["best_frame"].value_counts())
    
    # save temporary chunk file
    temp_path = DATA_DIR / f"debates_frames_part_{i//SAVE_CHUNK}.csv"
    batch_frames.to_csv(temp_path, index=False)
    print(f"[INFO] Saved partial results to {temp_path}")
    
    all_batches.append(batch_frames)

# merge all chunks into one dataframe
debate_frames_full = pd.concat(all_batches, axis=0).reset_index(drop=True)

# merge with original debates df
df_debates_frames = pd.concat([df_debates.reset_index(drop=True), debate_frames_full], axis=1)

print("\n[INFO] Debate framing classification complete:", df_debates_frames.shape)
print(df_debates_frames[["text", "best_frame", "best_score"]].head())

Classifying (verbalizers):   0%|          | 0/32 [09:18<?, ?it/s]
Debates (full run):   0%|          | 0/7 [09:18<?, ?it/s]


KeyboardInterrupt: 

### 4.2. Media Outlets

## 5. Quality Control & Filtering

## 6. Exports