In [1]:
!pip install transformers torch sentencepiece beautifulsoup4 requests




In [88]:
# =========================================================
# 1. INSTALL DEPENDENCIES (run once)
# =========================================================
!pip install transformers torch sentencepiece beautifulsoup4 requests

# =========================================================
# 2. IMPORTS
# =========================================================
import json
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import copy

# =========================================================
# 3. LOAD PLURIVERSE JSON SCHEMA
# =========================================================
# Empty Pluriverse record; populate_pluriverse_json() deep-copies it and
# fills in the fields for each analysed article.
SCHEMA_PATH = "Pluriverse_news_data.json"

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode or reject non-ASCII JSON.
with open(SCHEMA_PATH, "r", encoding="utf-8") as f:
    PLURIVERSE_SCHEMA = json.load(f)

# =========================================================
# 4. LOAD TRANSFORMER MODELS
# =========================================================
# Zero-shot classifier: infer_label() uses it to pick the best-matching label.
classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

# Summarization model shared by summarize_text() and prompted_summary().
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

# =========================================================
# 5. SCRAPE ARTICLE FROM URL
# =========================================================
def scrape_article(url):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The text of every non-empty ``<p>`` element, whitespace-normalised,
        with one paragraph per line (paragraphs are separated by newlines).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=10)
    # Fail loudly on error responses instead of analysing the body of a
    # 404/403 page as if it were the article.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Collapse whitespace inside each paragraph so a paragraph never spans
    # more than one line.
    paragraphs = (" ".join(p.get_text().split()) for p in soup.find_all("p"))

    # One paragraph per line: clean_article_text() filters line by line, so a
    # space-joined blob would be kept or discarded as a single unit (one
    # "enable javascript" notice would throw away the whole article).
    return "\n".join(paragraph for paragraph in paragraphs if paragraph)

# =========================================================
# 6. NLP HELPER FUNCTIONS
# =========================================================
def clean_article_text(text):
    """Drop short and boilerplate lines from scraped text.

    The text is split on newlines and each line is stripped. A line is kept
    only if it has at least 40 characters and contains none of the known
    paywall / ad-blocker / cookie notices (matched case-insensitively).

    Args:
        text: Raw scraped text.

    Returns:
        The surviving lines joined with single spaces (``""`` if none survive).
    """
    boilerplate_markers = (
        "enable javascript",
        "disable any ad blocker",
        "we noticed you are using",
        "subscribe to continue",
        "sign up to read",
        "cookies are required",
    )

    def _is_content(candidate):
        # Very short lines are treated as navigation/caption noise.
        if len(candidate) < 40:
            return False
        lowered = candidate.lower()
        return not any(marker in lowered for marker in boilerplate_markers)

    stripped_lines = (raw_line.strip() for raw_line in text.split("\n"))
    return " ".join(filter(_is_content, stripped_lines))


def summarize_text(text, max_len=60):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: Text to summarize; only its first 3000 characters are used.
        max_len: Passed to the summarizer as ``max_length``.

    Returns:
        The generated summary, ``"insufficient textual signal"`` when the
        text is empty or has fewer than 40 whitespace-separated words, or
        ``"summary unavailable"`` when the summarizer raises.
    """
    word_count = len(text.split()) if text else 0
    if word_count < 40:
        return "insufficient textual signal"

    try:
        generated = summarizer(
            text[:3000],
            max_length=max_len,
            min_length=20,
            do_sample=False,
        )
        outcome = generated[0]["summary_text"]
    except Exception:
        # Best effort: a failed summary must not abort the caller.
        outcome = "summary unavailable"
    return outcome


def prompted_summary(prompt, text, max_len=60):
    """Summarize ``text`` with ``prompt`` prepended as a steering instruction.

    The prompt and the text are concatenated and the first 3000 characters
    of the result are passed to the module-level ``summarizer``.

    Args:
        prompt: Instruction placed before the article text.
        text: Article text to summarize.
        max_len: Passed to the summarizer as ``max_length``.

    Returns:
        The generated summary, ``"insufficient textual signal"`` when
        ``text`` is empty or blank, or ``"summary unavailable"`` when the
        summarizer raises.
    """
    # Without article text the model would just summarize the prompt itself.
    if not text or not text.strip():
        return "insufficient textual signal"

    # NOTE(review): in the recorded run the "expression" and "future"
    # summaries start identically despite different prompts, so the prompt
    # may have little steering effect on this model -- confirm.
    combined = f"{prompt}\n\n{text}"

    try:
        summary = summarizer(
            combined[:3000],
            max_length=max_len,
            min_length=20,
            do_sample=False
        )
        return summary[0]["summary_text"]
    except Exception:
        # Same best-effort contract as summarize_text() and infer_label():
        # one failed model call must not abort populating the whole record.
        return "summary unavailable"

def infer_label(text, candidate_labels, fallback="unspecified"):
    """Pick the best-matching label for ``text`` via zero-shot classification.

    Args:
        text: Text to classify; only its first 2000 characters are used.
        candidate_labels: Labels handed to the module-level ``classifier``.
        fallback: Value returned when no label can be inferred.

    Returns:
        The first entry of the classifier's ``"labels"`` result, or
        ``fallback`` when the text is empty, has fewer than 20 characters
        after stripping, or the classifier raises.
    """
    if not text:
        return fallback
    if len(text.strip()) < 20:
        return fallback

    try:
        ranking = classifier(text[:2000], candidate_labels)
        top_label = ranking["labels"][0]
    except Exception:
        # Best effort: fall back rather than abort the caller.
        top_label = fallback
    return top_label


# =========================================================
# 7. POPULATE PLURIVERSE JSON
# =========================================================
def populate_pluriverse_json(article_text):
    """Fill a copy of the Pluriverse schema with values inferred from an article.

    Works on a deep copy of ``PLURIVERSE_SCHEMA``, so the loaded schema is
    never mutated. Categorical fields are filled by ``infer_label`` from a
    fixed list of candidate labels; narrative fields are filled by
    ``prompted_summary`` from a fixed prompt and length limit.

    Args:
        article_text: Cleaned article text.

    Returns:
        The populated copy of the schema.
    """
    output = copy.deepcopy(PLURIVERSE_SCHEMA)

    # section -> ordered (field, spec) pairs.
    #   spec is a list  -> candidate labels for infer_label()
    #   spec is a tuple -> (prompt, max_len) for prompted_summary()
    plan = {
        "identity": [
            ("name", ["citizens", "indigenous peoples", "workers", "scientists", "governments", "businesses"]),
            ("organization", ["government", "NGO", "corporation", "community group", "international body"]),
        ],
        "intersection": [
            ("expression", (
                "Describe how the situation is currently framed or expressed in the article.",
                60,
            )),
            ("body_experience", ["health impact", "environmental exposure", "labor stress", "mental strain"]),
            ("intention", ["adaptation", "resistance", "innovation", "reform"]),
        ],
        "demographic": [
            ("race", ["indigenous", "minority", "global south", "global north", "global east", "global west", "unspecified"]),
            ("gender", ["women", "men", "all genders", "unspecified"]),
            ("education", ["formal education", "technical expertise", "traditional knowledge"]),
            ("occupation", ["farmers", "workers", "scientists", "policy makers", "community leaders"]),
            ("ability", ["disabled", "unspecified", "able"]),
            ("wealth", ["low income", "middle income", "elite", "mixed"]),
            ("health", ["vulnerable", "at risk", "stable", "protected"]),
        ],
        "intention": [
            ("autonomy", ["self-determined", "externally imposed"]),
            ("design", ["top-down", "bottom-up", "co-created"]),
            ("future", (
                "Summarize what future outcomes, scenarios, or trajectories are suggested.",
                100,
            )),
            ("new", ["technological innovation", "social innovation", "policy reform"]),
            ("symbol", ["progress", "resistance", "healing", "regeneration"]),
            ("impact", ["systemic change", "localized change", "symbolic change"]),
            ("support", ["public support", "institutional support", "grassroots support"]),
        ],
        "policy_critique": [
            ("origin", ["colonial", "neoliberal", "state-led", "community-led"]),
            ("leak", ["regulatory failure", "corruption", "resource extraction"]),
            ("depletion", ["ecological depletion", "social depletion", "economic depletion"]),
            ("obstacles", ["political resistance", "economic constraints", "cultural barriers"]),
        ],
        "future": [
            ("ritual", ["restoration", "remembrance", "adaptation", "innovation"]),
            ("hope", (
                "Summarize what the article implies people hope for, value, or aspire to.",
                60,
            )),
            ("depict", ["utopian", "dystopian", "transitional", "resilient"]),
        ],
    }

    for section, fields in plan.items():
        for field, spec in fields:
            if isinstance(spec, tuple):
                prompt, limit = spec
                value = prompted_summary(prompt, article_text, max_len=limit)
            else:
                value = infer_label(article_text, spec)
            output[section][field] = value

    return output

# =========================================================
# 8. END-TO-END FUNCTION (DASHBOARD INPUT → JSON OUTPUT)
# =========================================================
def analyze_article_to_pluriverse_json(article_url):
    """Scrape an article, clean its text and return the populated record.

    Args:
        article_url: URL of the article to analyse.

    Returns:
        The result of ``populate_pluriverse_json`` for the cleaned text.

    Raises:
        ValueError: If the cleaned text is empty or shorter than 100
            characters.
    """
    article_text = clean_article_text(scrape_article(article_url))

    extraction_ok = bool(article_text) and len(article_text) >= 100
    if not extraction_ok:
        raise ValueError("Article text could not be reliably extracted.")

    return populate_pluriverse_json(article_text)


# =========================================================
# 9. RUN EXAMPLE
# =========================================================
# Article analysed in this example run.
ARTICLE_URL = "https://spectator.org/how-did-summers-vetting-repeatedly-miss-epstein-ties/"

# Scrape, clean and populate, then pretty-print the resulting record.
result = analyze_article_to_pluriverse_json(article_url=ARTICLE_URL)
print(json.dumps(result, indent=2))




Device set to use mps:0
Device set to use mps:0


{
  "identity": {
    "name": "governments",
    "organization": "community group"
  },
  "intersection": {
    "expression": "Larry Summers\u2019s involvement with Jeffrey Epstein goes back to at least 1998. He maintained his contact with Epstein until July 5, 2019, the day before his arrest. The question is: Did everyone miss the connection and its seriousness, or did they choose to ignore it?",
    "body_experience": "health impact",
    "intention": "adaptation"
  },
  "demographic": {
    "race": "minority",
    "gender": "unspecified",
    "education": "traditional knowledge",
    "occupation": "policy makers",
    "ability": "able",
    "wealth": "mixed",
    "health": "at risk"
  },
  "intention": {
    "autonomy": "externally imposed",
    "design": "co-created",
    "future": "Larry Summers\u2019s involvement with Jeffrey Epstein goes back to at least 1998. He maintained his contact with Epstein until July 5, 2019, the day before his arrest. The question is: Did everyone miss

In [94]:
# =========================================================
# 1. INSTALL DEPENDENCIES (run once)
# =========================================================
!pip install transformers torch sentencepiece

# =========================================================
# 2. IMPORTS
# =========================================================
import json
from statistics import mean
from transformers import pipeline
from collections import Counter

# =========================================================
# 3. FILE PATHS
# =========================================================
# Per-article Pluriverse records to merge, and where the merged record goes.
INPUT_PATH = "Epstein_Pluriverse_news_data.json"
OUTPUT_PATH = "Epstein_Pluriverse_avg_data.json"

# =========================================================
# 4. LOAD DATA
# =========================================================
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode or reject non-ASCII JSON.
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    articles = json.load(f)

print(f"Loaded {len(articles)} Pluriverse article entries")

# =========================================================
# 5. LOAD SUMMARIZER
# =========================================================
# Summarization model used by synthesize_text() to merge narrative fields.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

# =========================================================
# 6. HELPER FUNCTIONS
# =========================================================
def most_common(values, fallback="unspecified"):
    """Return the most frequent non-blank string in ``values``.

    Non-string items and empty or whitespace-only strings are ignored.

    Args:
        values: Iterable of candidate values.
        fallback: Returned when no usable string is present.

    Returns:
        The most frequent usable string, or ``fallback`` if there is none.
    """
    tally = Counter(
        item for item in values if isinstance(item, str) and item.strip()
    )
    if not tally:
        return fallback

    winner, _occurrences = tally.most_common(1)[0]
    return winner


def synthesize_text(texts, max_len=60):
    """Merge several narrative snippets into one summary.

    Only strings longer than 30 characters after stripping are used. They
    are joined with spaces, cut to 3500 characters and passed to the
    module-level ``summarizer``.

    Args:
        texts: Iterable of candidate snippets (non-strings are ignored).
        max_len: Passed to the summarizer as ``max_length``.

    Returns:
        The generated summary, ``"insufficient signal"`` when no snippet is
        usable, or the first usable snippet when the summarizer raises.
    """
    def _is_substantial(item):
        return isinstance(item, str) and len(item.strip()) > 30

    usable = list(filter(_is_substantial, texts))
    if not usable:
        return "insufficient signal"

    merged = " ".join(usable)[:3500]

    try:
        generated = summarizer(
            merged,
            max_length=max_len,
            min_length=25,
            do_sample=False,
        )
        return generated[0]["summary_text"]
    except Exception:
        # Best effort: fall back to the first usable snippet verbatim.
        return usable[0]

# =========================================================
# 7. AGGREGATE PLURIVERSE ENTRIES (MATCHING YOUR SCHEMA)
# =========================================================
def _field_values(entries, section, field):
    """Collect ``entry[section][field]`` from every entry, tolerating gaps.

    An entry that is not a dict, or whose section is missing, null or not a
    dict, contributes ``None`` instead of raising, so one malformed record
    cannot abort the whole aggregation. (``dict.get(key, {})`` alone does not
    help when the key is present with a null value.)
    """
    collected = []
    for entry in entries:
        section_data = entry.get(section) if isinstance(entry, dict) else None
        collected.append(
            section_data.get(field) if isinstance(section_data, dict) else None
        )
    return collected


def aggregate_pluriverse(entries):
    """Merge many per-article Pluriverse records into one unified record.

    Categorical fields take the most frequent value across entries (via
    ``most_common``); the narrative fields ``intersection.expression``,
    ``intention.future`` and ``future.hope`` are condensed with
    ``synthesize_text``.

    Args:
        entries: Sequence of per-article records (dicts keyed by section).

    Returns:
        A dict with the sections ``identity``, ``intersection``,
        ``demographic``, ``intention``, ``policy_critique`` and ``future``,
        plus a ``meta`` section holding the article count and the method.
    """
    # (section, field) -> max_len for fields that are summarized rather
    # than majority-voted.
    narrative_limits = {
        ("intersection", "expression"): 45,
        ("intention", "future"): 40,
        ("future", "hope"): 35,
    }

    # Section and field order define the key order of the output.
    layout = (
        ("identity", ("source", "name", "organization")),
        ("intersection", ("expression", "body_experience", "intention")),
        ("demographic", (
            "race", "gender", "education", "occupation",
            "ability", "wealth", "health",
        )),
        ("intention", (
            "autonomy", "design", "future", "new",
            "symbol", "impact", "support",
        )),
        ("policy_critique", ("origin", "leak", "depletion", "obstacles")),
        ("future", ("ritual", "hope", "depict")),
    )

    unified = {}
    for section, fields in layout:
        # The merged record has no single URL; identity carries a fixed marker.
        merged = {"url": "multiple sources"} if section == "identity" else {}
        for field in fields:
            values = _field_values(entries, section, field)
            limit = narrative_limits.get((section, field))
            if limit is None:
                merged[field] = most_common(values)
            else:
                merged[field] = synthesize_text(values, max_len=limit)
        unified[section] = merged

    unified["meta"] = {
        "country": "",
        "article_count": len(entries),
        "method": "most-common categorical + synthesized narrative",
    }
    return unified

# =========================================================
# 8. RUN AGGREGATION
# =========================================================
unified_pluriverse = aggregate_pluriverse(entries=articles)

# =========================================================
# 9. SAVE OUTPUT
# =========================================================
with open(OUTPUT_PATH, "w") as f:
    json.dump(unified_pluriverse, f, indent=2)

# Message and path on separate lines, as two consecutive prints would give.
print("Unified Pluriverse JSON saved to:", OUTPUT_PATH, sep="\n")

# =========================================================
# 10. DISPLAY RESULT
# =========================================================
print(json.dumps(unified_pluriverse, indent=2))


Loaded 15 Pluriverse article entries


Device set to use mps:0


Unified Pluriverse JSON saved to:
Epstein_Pluriverse_avg_data.json
{
  "identity": {
    "url": "multiple sources",
    "source": "left",
    "name": "citizens",
    "organization": "community group"
  },
  "intersection": {
    "expression": "New York Times columnist David Brooks attended a dinner with Jeffrey Epstein in 2011. TikTok officials say they are looking into why many users have been unable to send the word \"Epstein\" in direct messages. California",
    "body_experience": "health impact",
    "intention": "resistance"
  },
  "demographic": {
    "race": "minority",
    "gender": "unspecified",
    "education": "traditional knowledge",
    "occupation": "policy makers",
    "ability": "able",
    "wealth": "mixed",
    "health": "at risk"
  },
  "intention": {
    "autonomy": "externally imposed",
    "design": "co-created",
    "future": "New York Times columnist David Brooks attended a dinner with Jeffrey Epstein in 2011. The day before the fatal shooting of Alex Pretti, 