In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np

# =============================
# STEP 1: Load Model & Reference Data
# =============================

# Load reference knowledge base
with open("reference_data.json", "r") as f:
    reference_data = json.load(f)

# Load sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode reference sentences
ref_sentences = [item["sentence"] for item in reference_data]
ref_embeddings = model.encode(ref_sentences)

# =============================
# STEP 2: Define Helper Functions
# =============================

# Filter out abstract or non-visual nouns
IRRELEVANT_NOUNS = {"person", "expert", "year", "training", "hope", "dream", "mind", "way", "step", "top"}

def filter_irrelevant(items):
    return [item for item in items if item not in IRRELEVANT_NOUNS]

# Build global context from entire script
def build_global_context(script_data):
    global_chars = set()
    global_locs = set()
    global_actions = set()
    global_nouns = set()

    for item in script_data:
        global_chars.update(item.get("characters", []))
        global_locs.update(item.get("locations", []))
        global_actions.update(item.get("actions", []))
        global_nouns.update(item.get("nouns", []))

    return {
        "characters": list(global_chars),
        "locations": list(global_locs),
        "actions": list(global_actions),
        "nouns": list(global_nouns)
    }

# Suggest missing entities — semantic first, global context as fallback
def suggest_missing_with_context(input_sentence, global_context, top_k=2):
    # Step 1: Semantic suggestions from similar reference sentences
    input_embedding = model.encode([input_sentence])
    similarities = cosine_similarity(input_embedding, ref_embeddings)[0]
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    semantic_suggestions = {
        "characters": set(),
        "locations": set(),
        "actions": set(),
        "nouns": set()
    }

    for idx in top_indices:
        match = reference_data[idx]
        for key in semantic_suggestions:
            semantic_suggestions[key].update(match[key])

    # Step 2: Only use global context if semantic gave nothing AND field is empty
    final_suggestions = {}
    for key in ["characters", "locations", "actions", "nouns"]:
        suggested = set(semantic_suggestions[key])
        if len(suggested) == 0:
            # Fallback to global context only if original was empty
            suggested.update(global_context[key])
        final_suggestions[key] = list(suggested)

    return final_suggestions

# Enrich annotation without mutating original
def enrich_annotation_with_context(annotation, global_context):
    # Safe copy
    enriched = {
        "sentence": annotation["sentence"],
        "characters": annotation["characters"].copy(),
        "locations": annotation["locations"].copy(),
        "actions": annotation["actions"].copy(),
        "nouns": annotation["nouns"].copy()
    }

    suggestions = suggest_missing_with_context(enriched["sentence"], global_context)

    for key in ["characters", "locations", "actions", "nouns"]:
        existing = set(enriched[key])
        new_items = [item for item in suggestions[key] if item not in existing]
        enriched[key].extend(new_items)

    return enriched

# Build clean, short, image-friendly Pixabay query
def build_pixabay_query_contextual(annotation, global_context):
    parts = []

    # Characters: max 2 (prefer local, fallback to global)
    chars = annotation["characters"][:2]
    if not chars and global_context["characters"]:
        chars = global_context["characters"][:1]
    if chars:
        parts.append(" ".join(chars))

    # Actions: max 2 from local
    actions = annotation["actions"][:2]
    if actions:
        parts.append(" ".join(actions))

    # Locations + Nouns: combine, filter, dedupe, max 4
    loc_nouns = filter_irrelevant(list(set(annotation["locations"] + annotation["nouns"])))[:4]
    if not loc_nouns:
        fallback = filter_irrelevant(list(set(global_context["locations"] + global_context["nouns"])))[:3]
        loc_nouns = fallback

    if loc_nouns:
        parts.append("in " + " ".join(loc_nouns))

    return " ".join(parts).strip()

# =============================
# STEP 3: Your Script Data (Input)
# =============================

script_data = [
    {
        "sentence": "Alex had always dreamed of reaching the top of Eagle's Peak, a mountain so tall many said it can't be climbed without years of training.",
        "characters": ["Alex"],
        "locations": [],
        "actions": ["dream", "reach", "say", "climb"],
        "nouns": ["top", "mountain", "year", "training"]
    },
    {
        "sentence": "But Alex wasn't an expert.",
        "characters": ["Alex"],
        "locations": [],
        "actions": [],
        "nouns": ["expert"]
    },
    {
        "sentence": "Just a person with a dream and a backpack full of hope.",
        "characters": [],
        "locations": [],
        "actions": [],
        "nouns": ["person", "dream", "backpack", "hope"]
    },
    {
        "sentence": "The first steps were easy.",
        "characters": [],
        "locations": [],
        "actions": [],
        "nouns": ["step"]
    },
    {
        "sentence": "The path was clear.",
        "characters": [],
        "locations": [],
        "actions": [],
        "nouns": ["path"]
    },
    {
        "sentence": "But soon the trail grew steeper.",
        "characters": [],
        "locations": [],
        "actions": ["grow"],
        "nouns": ["trail"]
    },
    {
        "sentence": "Rocks blocked the way.",
        "characters": [],
        "locations": [],
        "actions": ["block"],
        "nouns": ["rock", "way"]
    },
    {
        "sentence": "The wind howled.",
        "characters": [],
        "locations": [],
        "actions": ["howl"],
        "nouns": ["wind"]
    },
    {
        "sentence": "Doubt crept in.",
        "characters": [],
        "locations": [],
        "actions": ["creep"],
        "nouns": []
    }
]

# =============================
# STEP 4: Run Enrichment
# =============================

if __name__ == "__main__":
    print("🌍 Building global context from script...")
    global_context = build_global_context(script_data)
    print(f"Characters: {global_context['characters']}")
    print(f"Locations:  {global_context['locations']}")
    print(f"Actions:    {global_context['actions']}")
    print(f"Nouns:      {global_context['nouns']}\n")
    print("="*70 + "\n")

    enriched_script = []

    for item in script_data:
        enriched = enrich_annotation_with_context(item, global_context)
        query = build_pixabay_query_contextual(enriched, global_context)
        enriched["pixabay_query"] = query
        enriched_script.append(enriched)

        print(f"📝 Sentence: {item['sentence']}")
        print(f"→ Characters: {enriched['characters']}")
        print(f"→ Locations:  {enriched['locations']}")
        print(f"→ Actions:    {enriched['actions']}")
        print(f"→ Nouns:      {enriched['nouns']}")
        print(f"→ ✅ Pixabay Query: \"{query}\"")
        print("-" * 60)

    # Optional: Save output
    with open("enriched_script_clean.json", "w") as f:
        json.dump(enriched_script, f, indent=2)

    print("\n✅ Saved clean enriched script to 'enriched_script_clean.json'")

🌍 Building global context from script...
Characters: ['Alex']
Locations:  []
Actions:    ['block', 'climb', 'reach', 'creep', 'dream', 'grow', 'say', 'howl']
Nouns:      ['path', 'expert', 'wind', 'trail', 'hope', 'top', 'dream', 'rock', 'way', 'mountain', 'training', 'backpack', 'step', 'year', 'person']


📝 Sentence: Alex had always dreamed of reaching the top of Eagle's Peak, a mountain so tall many said it can't be climbed without years of training.
→ Characters: ['Alex']
→ Locations:  []
→ Actions:    ['dream', 'reach', 'say', 'climb']
→ Nouns:      ['top', 'mountain', 'year', 'training', 'expert']
→ ✅ Pixabay Query: "Alex dream reach in mountain"
------------------------------------------------------------
📝 Sentence: But Alex wasn't an expert.
→ Characters: ['Alex']
→ Locations:  []
→ Actions:    ['climb', 'dream', 'reach', 'say']
→ Nouns:      ['expert', 'top', 'mountain', 'training', 'year']
→ ✅ Pixabay Query: "Alex climb dream in mountain"
------------------------------------