In [None]:
# RUN THIS CELL TO SYNC IMPROVEMENTS FROM GITHUB/JULES
import subprocess

def pull_from_github():
    """Pull ``main`` from the ``origin`` remote and print the outcome.

    Runs ``git pull origin main`` in the current working directory with
    stderr merged into stdout. On success the command output is printed;
    if git exits non-zero or cannot be started, a failure message is
    printed instead. Nothing is returned.
    """
    command = ["git", "pull", "origin", "main"]
    try:
        # This brings the GitHub 'Source of Truth' into your Jove environment
        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # git ran but exited with a non-zero status; its combined output is on e.output.
        details = (e.output or b"").decode(errors="replace")
        print(f"‚ùå Sync failed. Make sure you've merged Jules's PR on GitHub first.\nError: {details}")
    except OSError as e:
        # git could not be launched at all (e.g. the executable is not on PATH).
        print(f"‚ùå Sync failed. Could not run git.\nError: {e}")
    else:
        # errors="replace" keeps odd bytes in git's output from raising here.
        print(f"‚úÖ Jove is now synced with GitHub:\n{output.decode(errors='replace')}")

# Run the sync immediately when this cell is executed.
pull_from_github()

In [None]:
import json
import time
import os
import pandas as pd
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig
import PyPDF2
import concurrent.futures
import random

# --- 1. CONFIGURATION ---
# Vertex AI project and location passed to vertexai.init(), and the name of
# the generative model instantiated by the pipeline.
PROJECT_ID = "revolut-dev-apps"
LOCATION = "us-central1"
MODEL_NAME = "gemini-2.5-pro"

# Output Files
# FILE_PAYLOAD / FILE_ANALYSIS receive one JSON object per line (the cleaned
# input row and the model's analysis record); FILE_PROPOSAL receives the
# final report text returned by the model.
FILE_PAYLOAD = "cm_risk_payload.jsonl"
FILE_ANALYSIS = "cm_structural_rca.jsonl"
FILE_PROPOSAL = "2026_technical_analysis.md"

# Reference Docs
# PDFs whose extracted text is embedded in the synthesis prompt.
POLICY_PDF = "CM-pol.pdf"
SWITCHBOARD_PDF = "CMS.pdf"

# Minimum length, in characters, of the combined summary / description /
# root_cause_details text; process_row skips rows below this length.
TOTAL_NARRATIVE_THRESHOLD = 25

# --- 2. HELPERS ---
def extract_pdf_text(file_path):
    """Return the text of every page of a PDF, each page followed by a newline.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts. An empty string is returned when the
        path does not exist or when the file cannot be opened or parsed.
    """
    if not os.path.exists(file_path):
        return ""
    try:
        with open(file_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # Guard against extract_text() yielding None for a page, so a
            # single text-less page does not discard the whole document.
            return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
    except Exception as e:
        # Best-effort: still return "" as before, but say why instead of
        # failing silently. `except Exception` (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        print(f" -> Could not extract text from {file_path}: {e}")
        return ""

def process_row(row, model, instruction, *, max_retries=3):
    """Analyse one incident row with the model, retrying on rate-limit errors.

    Args:
        row: Mapping of column name to value for a single incident.
        model: Object exposing ``generate_content(contents, generation_config=...)``
            whose result has a ``text`` attribute containing JSON.
        instruction: Prompt text sent ahead of the JSON-encoded row.
        max_retries: Maximum number of model calls attempted for this row.

    Returns:
        ``(full_payload, record)`` on success. ``full_payload`` is the row with
        every value converted to ``str`` (missing values become empty strings);
        ``record`` holds the row id, selected metadata and the parsed model
        response. ``None`` when the narrative is shorter than
        ``TOTAL_NARRATIVE_THRESHOLD`` or when the model call fails.
    """
    # 1. Payload Prep: stringify every value, mapping NaN/None to "".
    full_payload = {k: ("" if pd.isna(v) else str(v)) for k, v in row.items()}
    row_id = full_payload.get('id', 'unknown')

    # 2. Narrative Check, done on the cleaned values so that missing fields
    #    do not count as the literal text "nan" / "None" towards the length.
    narrative = " ".join(
        full_payload.get(field, '')
        for field in ('summary', 'description', 'root_cause_details')
    )
    if len(narrative) < TOTAL_NARRATIVE_THRESHOLD:
        return None

    # The request body is identical on every attempt, so encode it once.
    request_body = json.dumps(full_payload)

    # 3. AI Analysis with Retry Backoff
    for attempt in range(max_retries):
        try:
            response = model.generate_content(
                [instruction, request_body],
                generation_config=GenerationConfig(response_mime_type="application/json", temperature=0.0)
            )
            res = json.loads(response.text)
        except Exception as e:
            # Any failure (API error, unparsable JSON, ...) lands here; only
            # rate-limit errors are retried, detected from the error text.
            error_str = str(e)
            rate_limited = "429" in error_str or "Resource exhausted" in error_str
            if rate_limited and attempt < max_retries - 1:
                sleep_time = (2 ** attempt) + random.uniform(0, 1)  # Exponential backoff with jitter
                print(f" -> ‚ö†Ô∏è 429 on {row_id}, retrying in {sleep_time:.1f}s...")
                time.sleep(sleep_time)
                continue

            # If not 429, or out of retries, log and fail
            print(f" -> ‚ùå Error {row_id}: {e}")
            return None

        record = {
            "id": str(row_id),
            "meta": {
                "dept": str(full_payload.get('responsible_department', '')),
                "severity": str(full_payload.get('severity', '')),
                "impacted": str(full_payload.get('impacted_entities', ''))
            },
            "analysis": res
        }
        return (full_payload, record)

    # Only reached when max_retries <= 0 (no attempt was made).
    return None

# --- 3. SYSTEM INSTRUCTIONS ---

# Per-incident prompt: run_production_pipeline passes this to process_row,
# which sends it to the model together with the JSON-encoded incident row.
# It asks for a JSON object with five keys (failure_mode, technical_root_cause,
# policy_gap_classification, policy_gap_logic, preventative_control).
# The template below is prompt text, not strict JSON: the
# "policy_gap_classification" value spans several lines.
RCA_INSTRUCTION = """
Analyze the incident using the full metadata.
RETURN JSON ONLY:
{
  "failure_mode": "[Validation Oversight | Tooling/Automation Failure | Human Error | Process Complexity | Legacy/Backward Compatibility | Guidance Gap]",
  "technical_root_cause": "A concise, technical 5-Why summary.",
  "policy_gap_classification": "Classify using this strict logic:
   - IGNORED: A mandate exists in the Policy/Playbook (even high-level), but was not executed (e.g., 'Local variations must be documented' was skipped).
   - INSUFFICIENT: The policy was followed, but the incident still occurred because the rule was too vague (e.g., 'Risk Assessment' exists but doesn't mention 'Feature Flags').
   - MISSING: No rule exists for this domain at all.",
  "policy_gap_logic": "Explain the gap. If IGNORED, mention what requirement was missed.",
  "preventative_control": "Specific technical or procedural control to prevent recurrence."
}
"""

# Report prompt: run_production_pipeline places this ahead of the extracted
# policy / switchboard PDF text and the aggregated per-incident analysis
# records, and writes the model's reply to FILE_PROPOSAL.
SYNTHESIS_INSTRUCTION = """
Perform a Technical Risk Audit of the aggregated RCA data against the provided Policy Library.

OUTPUT FORMAT: Strict Technical Report (Markdown). No conversational text.

REQUIRED SECTIONS:
1. PARETO ANALYSIS: Top 2 Failure Modes by frequency & risk.
2. COMPLIANCE AUDIT: Table of [ID | Failure Mode | Gap Type | Policy Reference].
   - If IGNORED: Cite the specific Policy/Switchboard section violated.
   - If INSUFFICIENT: Explain the specific deficiency in the current text.
3. SWITCHBOARD LOGIC GAPS: Identify specific criteria in the Switchboard that are failing.
4. DOCUMENT REDLINES: Proposed text amendments ("Current Text" -> "Proposed Text").
"""

# --- 4. SQL QUERY ---
# Selects incident rows from risk.risk_issues whose primary risk type is 186
# or whose additional_risk_type_ids text contains "186", excluding the
# 'invalid' and 'deleted' states, reported on or after 2024-01-01 UTC,
# newest first.
# NOTE(review): LIKE '%186%' also matches any id that merely contains the
# digits 186 (e.g. 1186 or 1860) -- confirm how additional_risk_type_ids is
# encoded and tighten the pattern if that is not intended.
RCA_QUERY = """
SELECT
    id, report_id, summary, description, root_cause_details,
    responsible_department, assignee_entity, impacted_entities,
    operating_type, brm_department_type, risk_type_id,
    additional_risk_type_ids, cause_category_l1, cause_category_l2,
    cause_category_l3, severity, impact_customer, impact_regulator,
    issue_state, issue_state_category, reporting_date
FROM risk.risk_issues
WHERE (risk_type_id = 186 OR additional_risk_type_ids LIKE '%186%')
AND issue_state NOT IN ('invalid', 'deleted')
AND reporting_date >= TIMESTAMP '2024-01-01 00:00:00 UTC'
ORDER BY reporting_date DESC
"""

def run_production_pipeline():
    """Run the RCA pipeline end to end: extract, analyse each row, synthesise a report.

    Steps:
        1. Run ``RCA_QUERY`` to load the incident rows.
        2. Call ``process_row`` on every row in a small thread pool and stream
           the cleaned payloads and analysis records to ``FILE_PAYLOAD`` and
           ``FILE_ANALYSIS`` (one JSON object per line).
        3. Send the aggregated analysis records, together with the text of the
           policy and switchboard PDFs, to the model and write its reply to
           ``FILE_PROPOSAL``.
        4. Print the size of each output file that exists.

    Returns early, after printing the error, if data extraction fails. A
    failure during synthesis is printed and does not raise.
    """
    # Reduced workers to 5 to stay under quota
    max_workers = 5

    print(f"üöÄ Starting Production Pipeline ({LOCATION})...")
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    model = GenerativeModel(MODEL_NAME)

    # A. DATA EXTRACTION
    print("Step 1: Extracting Full Dataset...")
    try:
        # NOTE(review): zeus / execute_sql are not defined in this file --
        # presumably provided by the notebook environment; confirm.
        with zeus() as cur:
            df = execute_sql(cur, RCA_QUERY, None)
        print(f" -> Records found: {len(df)}")
    except Exception as e:
        print(f"ERROR: {e}")
        return

    # B. STRUCTURAL ANALYSIS (SAFE PARALLEL)
    print(f"Step 2: Processing RCAs (Max Workers={max_workers})...")
    analysis_results = []
    records = df.to_dict('records')
    total = len(records)

    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    with open(FILE_PAYLOAD, 'w', encoding='utf-8') as f_pay, \
            open(FILE_ANALYSIS, 'w', encoding='utf-8') as f_ana:
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # map() yields results in input order; rows that were skipped or
            # failed come back as None and are not written.
            results = executor.map(lambda r: process_row(r, model, RCA_INSTRUCTION), records)

            count = 0
            for res in results:
                if not res:
                    continue
                payload, record = res
                f_pay.write(json.dumps(payload) + '\n')
                f_ana.write(json.dumps(record) + '\n')
                analysis_results.append(record)

                count += 1
                if count % 20 == 0:
                    print(f" -> Processed {count}/{total}...")

    # C. TECHNICAL SYNTHESIS
    print("Step 3: Generating Technical Report...")
    policy_text = extract_pdf_text(POLICY_PDF)
    switchboard_text = extract_pdf_text(SWITCHBOARD_PDF)

    # The reference texts are truncated to a fixed number of characters
    # before being embedded in the prompt.
    synthesis_payload = f"""
    {SYNTHESIS_INSTRUCTION}
    
    --- REFERENCE LIBRARY ---
    [POLICY v1.3]: {policy_text[:50000]} 
    [SWITCHBOARD]: {switchboard_text[:30000]}
    
    --- RCA DATASET ---
    {json.dumps(analysis_results)}
    """

    try:
        final_report = model.generate_content(
            synthesis_payload,
            generation_config=GenerationConfig(temperature=0.1)
        )
        # The Markdown reply may contain non-ASCII characters; write it as
        # UTF-8 so the write cannot fail under a non-UTF-8 default encoding.
        with open(FILE_PROPOSAL, 'w', encoding='utf-8') as f_rep:
            f_rep.write(final_report.text)
    except Exception as e:
        print(f"Synthesis Error: {e}")

    # D. VERIFICATION
    print("\n‚úÖ PIPELINE COMPLETE. OUTPUT FILES:")
    for fname in [FILE_PAYLOAD, FILE_ANALYSIS, FILE_PROPOSAL]:
        if os.path.exists(fname):
            size = os.path.getsize(fname)
            print(f"  [CREATED] {fname:<30} Size: {size/1024:.1f} KB")

# Run the full pipeline only when this code is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    run_production_pipeline()