In [2]:
import operator
import uuid
import hashlib
import json
from enum import Enum
from typing import List, Dict, Optional, Any

from pydantic import BaseModel, Field
from typing_extensions import Annotated

from langgraph.graph import StateGraph, END
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# --- 1. Pydantic State Models ---
# Core state definitions for the forensic analysis graph.

# Verdict labels a case file can carry. Mixing in ``str`` makes every member a
# real string equal to its value (e.g. ``Verdict.BENIGN == "Benign"``), so
# plain strings such as "Suspicious" validate into the matching member.
class Verdict(str, Enum):
    PRESUMED_INNOCENT = "Presumed_Innocent"
    SUSPICIOUS = "Suspicious"
    MALICIOUS = "Malicious"
    BENIGN = "Benign"

# Named stages of an analysis, stored on the case file's ``phase`` field.
# String-valued (``str`` mixin), so the raw value "Interrogation" validates
# into ``AnalysisPhase.INTERROGATION``.
class AnalysisPhase(str, Enum):
    TRIAGE = "Triage"
    INTERROGATION = "Interrogation"
    FINALIZING = "Finalizing"

# One queued follow-up item: which PDF object to examine, how urgent it is,
# and why. All three fields are required (``Field(...)`` has no default).
# NOTE(review): the Field descriptions end up in the model's JSON schema; this
# model is nested inside the structured-output schema, so presumably the LLM
# reads them -- confirm before rewording.
class InvestigationTask(BaseModel):
    object_id: int = Field(..., description="The PDF object number to investigate. Use 0 if unknown.")
    priority: int = Field(..., description="Priority of the task (1=Highest, 10=Lowest).")
    reason: str = Field(..., description="Why this object is being investigated (e.g., 'Contains /JS keyword').")

# Supporting models used by the case file's coherence and evidence fields.
# Running assessment of how coherent the file's "story" is.
# ``score`` starts at 1.0 (coherent); no range validation is enforced here,
# the 0.0-1.0 range is stated only in the description.
# ``notes`` gets a fresh empty list per instance via ``default_factory``.
class NarrativeCoherence(BaseModel):
    score: float = Field(1.0, description="Coherence score from 0.0 (deceptive) to 1.0 (coherent).")
    notes: List[str] = Field(default_factory=list, description="Observations that affect coherence.")

# A single directed relationship between two PDF objects
# (source --action--> target) plus a human-readable summary.
# All four fields are required.
class AttackChainLink(BaseModel):
    source_object: int = Field(..., description="The PDF object that initiates the action.")
    action: str = Field(..., description="The relationship (e.g., 'Executes', 'References', 'Decodes').")
    target_object: int = Field(..., description="The PDF object that is the target of the action.")
    description: str = Field(..., description="Human-readable summary of the link.")

# Decoded content pulled out of one PDF object, with free-form analyst notes.
# ``analysis_notes`` defaults to a new empty list per instance.
class ExtractedArtifact(BaseModel):
    source_object_id: int = Field(..., description="The PDF object from which this was extracted.")
    content_decoded: str = Field(..., description="The decoded/deobfuscated content.")
    analysis_notes: List[str] = Field(default_factory=list, description="Notes from the analysis of this artifact.")

# One indicator of compromise: its value, the PDF object it came from, and the
# surrounding snippet it was found in. All fields are required.
class IndicatorOfCompromise(BaseModel):
    value: str = Field(..., description="The value of the indicator (e.g., the URL).")
    source_object_id: int = Field(..., description="The PDF object where this IoC was discovered.")
    context: str = Field(..., description="The line or code snippet where the IoC was found.")

# Container for everything collected during an investigation. Every field has
# an empty default, so ``EvidenceLocker()`` is a valid, empty locker.
class EvidenceLocker(BaseModel):
    structural_summary: Dict[str, Any] = Field(default_factory=dict, description="Raw, parsed output from the initial triage tool (e.g., pdfid).")
    attack_chain: List[AttackChainLink] = Field(default_factory=list)
    # Keyed by int -- presumably the PDF object id the artifact came from;
    # TODO confirm against the code that populates it.
    extracted_artifacts: Dict[int, ExtractedArtifact] = Field(default_factory=dict)
    indicators_of_compromise: List[IndicatorOfCompromise] = Field(default_factory=list)

# Minimal input payload for starting an analysis: just the path of the PDF.
class ForensicCaseFileInput(BaseModel):
    file_path: str = Field(..., description="The local path to the PDF file to be analyzed.")

# Complete state of one analysis; used as the StateGraph state schema.
# Only ``file_path`` is required -- every other field has a default.
class ForensicCaseFile(BaseModel):
    file_path: str
    # Populated by the triage node; None until then.
    file_hash_sha256: Optional[str] = None
    # A fresh random UUID4 string is generated for each new instance.
    analysis_session_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    verdict: Verdict = Field(Verdict.PRESUMED_INNOCENT)
    phase: AnalysisPhase = Field(AnalysisPhase.TRIAGE)
    narrative_coherence: NarrativeCoherence = Field(default_factory=NarrativeCoherence)
    investigation_queue: List[InvestigationTask] = Field(default_factory=list)
    evidence: EvidenceLocker = Field(default_factory=EvidenceLocker)
    # ``Annotated[..., operator.add]`` registers list concatenation as the
    # LangGraph reducer for these two keys: a list returned by a node is
    # appended to the existing value instead of replacing it.
    analysis_trail: Annotated[List[str], operator.add] = Field(default_factory=list)
    errors: Annotated[List[str], operator.add] = Field(default_factory=list)
    # Not set by any code visible in this cell.
    final_report: Optional[str] = None


# --- 2. LLM Interaction Setup ---

# Schema handed to ``llm.with_structured_output(...)`` and also used to
# validate the mocked response in the triage node.
# NOTE(review): pydantic includes the class docstring and the Field
# descriptions in the generated JSON schema, so this text is presumably part of
# what the model is shown -- treat it as prompt text, not as plain comments.
class TriageAnalysis(BaseModel):
    """The required JSON structure for the LLM's triage analysis."""
    verdict: Verdict = Field(..., description="Your expert verdict on the file based on the triage data.")
    phase: AnalysisPhase = Field(..., description="The next analysis phase based on your verdict.")
    # Drives whether the node copies the queue and lowers the coherence score.
    is_suspicious: bool = Field(..., description="A simple boolean flag indicating if the file warrants further investigation.")
    investigation_queue: List[InvestigationTask] = Field(..., description="A priority-ordered list of tasks for the next phase.")
    # A single string here; the node wraps it in a list before returning it.
    analysis_trail: str = Field(..., description="A single, concise log entry summarizing your findings and decision, written in your persona's voice.")
    narrative_coherence_notes: List[str] = Field(..., description="Notes on why the file's 'character' seems suspicious or benign.")


# Prompt template that casts the LLM as "Dr. Evelyn Reed" for the triage step.
# ``{pdfid_output}` is the only placeholder; it is filled with the raw text of
# the pdfid report when the chain is invoked. The text below is sent to the
# model verbatim, so any edit to it changes model behaviour.
TRIAGE_PROMPT_TEMPLATE = """
You are Dr. Evelyn Reed, a Digital Pathologist. Your default state is dismissal. A file is innocent until it gives you a compelling reason to doubt its character. Your task is to perform an initial triage based on a structural analysis from the 'pdfid' tool.

Analyze the following pdfid output. Based *only* on this information, determine if the file has betrayed itself. Look for signs of deception: unnecessary complexity or capabilities that are incoherent with a simple document's purpose (e.g., automatic actions, JavaScript, launch actions).

**PDFID Output:**
```
{pdfid_output}
```

Based on your analysis, provide your expert judgment in the required JSON format.
- If suspicious, set the verdict to SUSPICIOUS, the phase to INTERROGATION, and create a prioritized queue of investigation tasks for the most suspicious keywords.
- If it appears benign and 'boring', set the verdict to BENIGN, the phase to FINALIZING, and leave the queue empty.
Your analysis trail entry should be a concise summary of your decision.
"""

# --- 3. Tool Simulation & Helper Functions ---

def run_pdfid_simulation(file_path: str) -> str:
    """Return a canned pdfid-style report; no external tool is executed.

    ``file_path`` is only echoed in the progress message. The returned text
    is identical for every input.
    """
    print(f"[*] SIMULATING 'pdfid' on: {file_path}")
    # Fixed sample report: one header line followed by "<name> <count>" rows.
    canned_report = """
 PDF Header: %PDF-1.4
 obj                  10
 endobj                9
 stream                3
 endstream             3
 xref                  1
 trailer               1
 startxref             1
 /Page                 1
 /Encrypt              0
 /JS                   2
 /JavaScript           2
 /AA                   1
 /OpenAction           1
 /AcroForm             0
 /JBIG2Decode          0
 /RichMedia            0
 /Launch               1
 /EmbeddedFile         0
 /XFA                  0
 /Colors > 2^24        0
    """
    return canned_report

def get_file_hash(file_path: str) -> str:
    """Return a stand-in SHA-256 hex digest for ``file_path``.

    Simulation only: the digest is computed over the UTF-8 bytes of the path
    string itself; the file is never opened or read.
    """
    print(f"[*] SIMULATING hash calculation for: {file_path}")
    path_bytes = file_path.encode('utf-8')
    return hashlib.sha256(path_bytes).hexdigest()

# --- 4. LLM-Powered Graph Node ---

# Coherence score recorded when triage flags the file as suspicious.
_SUSPICIOUS_COHERENCE_SCORE = 0.5


def _parse_pdfid_counts(pdfid_output: str) -> Dict[str, int]:
    """Extract ``{keyword: count}`` pairs from a pdfid text report.

    Only rows whose name starts with ``/`` and whose last token is a plain
    decimal number are kept; header and structural rows are skipped.
    """
    counts: Dict[str, int] = {}
    for raw_line in pdfid_output.splitlines():
        # The count is the LAST whitespace-separated token. Splitting from the
        # right keeps multi-word names such as "/Colors > 2^24" intact, and a
        # row with a single token simply fails the length check instead of
        # raising IndexError.
        fields = raw_line.rsplit(None, 1)
        if len(fields) != 2:
            continue
        name, count = fields
        name = " ".join(name.split())  # drop indent, collapse inner spacing
        if name.startswith("/") and count.isdecimal():
            counts[name] = int(count)
    return counts


def _mock_triage_response() -> TriageAnalysis:
    """Return the canned triage result used when no real LLM call is made."""
    mock_response_json = {
        "verdict": "Suspicious",
        "phase": "Interrogation",
        "is_suspicious": True,
        "investigation_queue": [
            {"object_id": 0, "priority": 1, "reason": "Contains /JS and /JavaScript keywords, indicating active content."},
            {"object_id": 0, "priority": 2, "reason": "Contains /OpenAction and /AA keywords, suggesting automatic execution."},
            {"object_id": 0, "priority": 3, "reason": "Contains /Launch keyword, a highly suspicious capability."}
        ],
        "analysis_trail": "Initial scan reveals high-potential for action via JavaScript and auto-execution triggers. The file's character is suspect. Suspending innocence and proceeding to interrogation.",
        "narrative_coherence_notes": ["File contains multiple high-risk keywords (/JS, /OpenAction, /Launch) that are incoherent with a simple document's purpose."]
    }
    return TriageAnalysis(**mock_response_json)


def _ask_llm_for_triage(pdfid_output: str) -> TriageAnalysis:
    """Run the real prompt -> gpt-4o chain and return its structured answer.

    Needs OPENAI_API_KEY in the environment and the ``langchain-openai``
    package (`pip install langchain-openai`).
    """
    prompt = ChatPromptTemplate.from_template(TRIAGE_PROMPT_TEMPLATE)
    llm = ChatOpenAI(model="gpt-4o", temperature=0)
    chain = prompt | llm.with_structured_output(TriageAnalysis)
    return chain.invoke({"pdfid_output": pdfid_output})


def triage_node_llm(state: ForensicCaseFile, *, use_mock_llm: bool = True) -> Dict[str, Any]:
    """Graph node: triage a PDF using an LLM verdict instead of manual rules.

    Hashes the file path, obtains the (simulated) pdfid report, gets a
    ``TriageAnalysis`` and converts it into a partial state update.

    Args:
        state: Current case file; ``file_path``, ``evidence`` and
            ``narrative_coherence`` are read. It is not mutated -- nested
            models are deep-copied before being changed.
        use_mock_llm: When True (default) a canned response is used and no
            OpenAI client is created, so the example runs without an API key.
            Pass False to perform the real LLM call.

    Returns:
        Dict of state updates: ``file_hash_sha256``, ``analysis_trail`` (two
        entries), ``verdict``, ``phase``, ``evidence``,
        ``narrative_coherence`` and, only when the analysis is flagged
        suspicious, ``investigation_queue``.
    """
    file_path = state.file_path
    print("\n--- Running LLM-Powered Triage Node ---")

    # 1. Perform initial actions
    file_hash = get_file_hash(file_path)
    pdfid_output = run_pdfid_simulation(file_path)

    # 2. Obtain the structured analysis. The OpenAI client is only built on
    #    the real path, because constructing it requires credentials.
    print("[*] Asking LLM (Dr. Reed) for analysis...")
    if use_mock_llm:
        llm_response = _mock_triage_response()
    else:
        llm_response = _ask_llm_for_triage(pdfid_output)

    # ``.value`` keeps the message stable: formatting a str-mixin Enum member
    # directly yields different text depending on the Python version.
    print(f"[*] LLM analysis received. Verdict: {llm_response.verdict.value}")

    # 3. Prepare state updates from the structured response
    updates: Dict[str, Any] = {
        "file_hash_sha256": file_hash,
        "analysis_trail": [
            f"Triage started for {file_path} (SHA256: {file_hash[:12]}...).",
            llm_response.analysis_trail,  # the LLM's own log entry
        ],
        "verdict": llm_response.verdict,
        "phase": llm_response.phase,
    }

    if llm_response.is_suspicious:
        updates["investigation_queue"] = llm_response.investigation_queue

    # Work on deep copies so the incoming state object is left untouched.
    evidence = state.evidence.model_copy(deep=True)
    evidence.structural_summary = _parse_pdfid_counts(pdfid_output)

    coherence = state.narrative_coherence.model_copy(deep=True)
    if llm_response.is_suspicious:
        coherence.score = _SUSPICIOUS_COHERENCE_SCORE
        coherence.notes.extend(llm_response.narrative_coherence_notes)

    updates["evidence"] = evidence
    updates["narrative_coherence"] = coherence

    return updates

# --- 5. Graph Definition and Execution ---

# Script entry: build a one-node graph (triage -> END), run it on a sample
# file name, and print the resulting state as JSON.
if __name__ == "__main__":
    print("--- Setting up Forensic Analysis Graph ---")
    
    # ForensicCaseFile is the state schema; "triage" is both the entry point
    # and the only node, wired straight to END.
    workflow = StateGraph(ForensicCaseFile)
    workflow.add_node("triage", triage_node_llm)
    workflow.set_entry_point("triage")
    workflow.add_edge("triage", END)
    app = workflow.compile()

    # Only a file name is needed here: the hash and pdfid steps are simulated
    # and never open the file.
    pdf_to_analyze = "87c740d2b7c22f9ccabbdef412443d166733d4d925da0e8d6e5b310ccfc89e13.pdf"
    inputs = ForensicCaseFileInput(file_path=pdf_to_analyze)
    
    # The graph is invoked with a plain dict; its mapping result is validated
    # back into a ForensicCaseFile so it can be dumped as JSON.
    final_state_dict = app.invoke(inputs.model_dump())
    final_state = ForensicCaseFile(**final_state_dict)

    print("\n\n--- Final Graph State ---")
    print(final_state.model_dump_json(indent=2))


--- Setting up Forensic Analysis Graph ---

--- Running LLM-Powered Triage Node ---
[*] SIMULATING hash calculation for: 87c740d2b7c22f9ccabbdef412443d166733d4d925da0e8d6e5b310ccfc89e13.pdf
[*] SIMULATING 'pdfid' on: 87c740d2b7c22f9ccabbdef412443d166733d4d925da0e8d6e5b310ccfc89e13.pdf
[*] Asking LLM (Dr. Reed) for analysis...
[*] LLM analysis received. Verdict: Verdict.SUSPICIOUS


--- Final Graph State ---
{
  "file_path": "87c740d2b7c22f9ccabbdef412443d166733d4d925da0e8d6e5b310ccfc89e13.pdf",
  "file_hash_sha256": "71d0e76d873dc946a4c40397f4c0b3c53c36943d9c9af8087b60fd229df6cccb",
  "analysis_session_id": "0f2738dd-cc0f-4326-b196-498a4518073a",
  "verdict": "Suspicious",
  "phase": "Interrogation",
  "narrative_coherence": {
    "score": 0.5,
    "notes": [
      "File contains multiple high-risk keywords (/JS, /OpenAction, /Launch) that are incoherent with a simple document's purpose."
    ]
  },
  "investigation_queue": [
    {
      "object_id": 0,
      "priority": 1,
      "reas