# Agentic Pipeline


### Setup and Imports

In [0]:
# Install the third-party client libraries used by this notebook, then restart
# the Python process (Databricks utility) so the new packages can be imported.
# NOTE(review): versions are unpinned — consider pinning them for reproducibility.
!pip install instructor
!pip install azure.identity openai
dbutils.library.restartPython()

In [0]:
import os
import shutil
import sys
import tempfile
from datetime import datetime
from typing import Any, Dict, List, Optional

import pandas as pd
from pyspark.sql import SparkSession

from core.pipeline import AgenticPipeline
from core.bottleneck_registry import BottleneckRegistry
from core.models import EvidenceSpan, FinalEvidence
from agents.pre_filter import PreFilterAgent
from agents.classifier import MultiBottleneckClassifier, ComparativeClassifier
from agents.causality_validator import CausalityValidator
from agents.arbitrator import EvidenceArbitrator
from utils.llm_service import AzureOpenAIService
import config

### Initialize Services and Pipeline

In [0]:
# Single LLM client shared by every agent below.
llm_service = AzureOpenAIService()

# One agent per pipeline stage, all backed by the same LLM service.
pre_filter = PreFilterAgent(llm_service)
classifier = MultiBottleneckClassifier(llm_service)
comparative_classifier = ComparativeClassifier(llm_service)
causality_validator = CausalityValidator(llm_service)
arbitrator = EvidenceArbitrator(llm_service)

# Source of the bottleneck definitions used throughout the notebook.
registry = BottleneckRegistry()

# Wire the agents and the registry into the end-to-end pipeline object.
pipeline_components = dict(
    pre_filter=pre_filter,
    classifier=classifier,
    causality_validator=causality_validator,
    arbitrator=arbitrator,
    bottleneck_registry=registry,
    comparative_classifier=comparative_classifier,
)
pipeline = AgenticPipeline(**pipeline_components)

print("Agentic pipeline initialized successfully")
# Echo the configuration values that drive the run.
for setting_label, setting_name in (
    ("Using model", "LLM_MODEL"),
    ("Temperature", "TEMPERATURE"),
    ("Classification threshold", "CLASSIFICATION_THRESHOLD"),
):
    print(f"{setting_label}: {getattr(config, setting_name)}")

### Load Input Data

In [0]:
# Input parameters for the chunk query
COUNTRY_CODES = ['MLI', 'BFA']
MIN_CHUNK_LENGTH = 1000

# Render the country codes as a comma-separated list of quoted SQL literals.
country_list_sql = ','.join(f"'{cc}'" for cc in COUNTRY_CODES)

# Chunks joined to their document metadata, restricted to the selected
# countries, a minimum text length and the PER/PFR document types.
query = f"""
SELECT 
    c.document_id,
    c.chunk_id,
    c.chunk_text,
    c.page_number,
    d.document_name,
    d.country_code,
    d.document_type,
    d.fiscal_year
FROM prd_mega.sboost4.per_pfr_chunks c
JOIN prd_corpdata.dm_reference_gold.v_dim_imagebank_document d
    ON c.document_id = d.document_id
WHERE d.country_code IN ({country_list_sql})
    AND LENGTH(c.chunk_text) >= {MIN_CHUNK_LENGTH}
    AND d.document_type IN ('PER', 'PFR')
ORDER BY c.document_id, c.chunk_id
LIMIT 5000  -- Limit for testing
"""

# Run the query on Spark and pull the result into a pandas DataFrame.
df_chunks = spark.sql(query).toPandas()

document_count = df_chunks['document_id'].nunique()
print(f"Loaded {len(df_chunks)} chunks from {document_count} documents")

countries_seen = df_chunks['country_code'].unique()
print(f"Countries: {countries_seen}")

document_types_seen = df_chunks['document_type'].unique()
print(f"Document types: {document_types_seen}")

In [0]:
# Preview the first rows of the loaded chunk table.
df_chunks.head()

### View Available Bottlenecks

In [0]:
# Fetch every bottleneck definition known to the registry.
all_bottlenecks = registry.get_all_bottlenecks()

print("Registered Bottlenecks:")

# Print a short card per bottleneck: id, name, category, truncated description
# and, when present, the first example (also truncated).
for entry in all_bottlenecks:
    print(f"\nBottleneck {entry['id']}: {entry['name']}")
    print(f"  Category: {entry['category']}")
    print(f"  Description: {entry['description'][:150]}...")
    examples = entry.get('examples')
    if examples:
        print(f"  Example: {examples[0][:100]}...")

## Stage 1: Pre-Filtering

Fast filtering to identify potentially relevant text spans

In [0]:
BATCH_SIZE = 100

# Take the first BATCH_SIZE rows and convert them to a list of per-row dicts.
chunks_sample = df_chunks.iloc[:BATCH_SIZE].to_dict(orient='records')

print(f"Pre-filtering {len(chunks_sample)} chunks...")
print("This stage quickly identifies potentially relevant text spans.\n")

In [0]:
# Run the pre-filter agent over the sampled chunks.
evidence_spans = pre_filter.filter_batch(chunks_sample)

n_input_chunks = len(chunks_sample)
n_evidence_spans = len(evidence_spans)

print(f"\nPre-filtering Results:")
print(f"  Input chunks: {n_input_chunks}")
print(f"  Evidence spans found: {n_evidence_spans}")
# Guard against an empty sample (e.g. the query returned no rows); the ratio
# is undefined in that case and would otherwise raise ZeroDivisionError.
if n_input_chunks:
    print(f"  Reduction rate: {(1 - n_evidence_spans/n_input_chunks)*100:.1f}%")
else:
    print("  Reduction rate: n/a (no input chunks)")

# Number of distinct documents that produced at least one evidence span.
docs_with_evidence = len({span.document_id for span in evidence_spans})
print(f"  Documents with evidence: {docs_with_evidence}")

In [0]:
# Show up to three spans so the pre-filter output can be eyeballed.
if evidence_spans:
    print("\nSample Evidence Spans:")
    print("="*50)
    for span_number, span in enumerate(evidence_spans[:3], start=1):
        print(f"\n[Span {span_number}]")
        print(f"  Document: {span.document_name[:50]}...")
        print(f"  Relevance: {span.relevance_score:.2f}")
        print(f"  Text: {span.text[:200]}...")
        print(f"  Potential bottlenecks: {span.potential_bottlenecks}")

## Stage 2: Multi-Bottleneck Classification

Classify each evidence span against ALL bottlenecks simultaneously

In [0]:
# Classify evidence spans; each entry pairs a span with its classification.
classification_results = []

print("Classifying evidence spans against all bottlenecks...")
print("This prevents misclassification by comparing all options.\n")

demo_spans = evidence_spans[:10]  # Process first 10 for demo
for span_number, span in enumerate(demo_spans, start=1):
    print(f"\nClassifying span {span_number}/{len(demo_spans)}...")

    # Score the span against every registered bottleneck in one call.
    result = classifier.classify(span, all_bottlenecks)
    classification_results.append({'span': span, 'classification': result})

    matches = result.top_matches
    if not matches:
        # No candidate bottleneck: nothing to report for this span.
        continue

    best = matches[0]
    print(f"  Top match: Bottleneck {best.bottleneck_id} "
          f"(score: {best.score:.2f})")
    if len(matches) > 1:
        runner_up = matches[1]
        print(f"  Second match: Bottleneck {runner_up.bottleneck_id} "
              f"(score: {runner_up.score:.2f})")
    print(f"  Ambiguous: {result.is_ambiguous}")
    print(f"  Recommendation: {result.recommendation}")

In [0]:
# Tally classification outcomes: the top match per span, spans with no match
# at all, and spans flagged as ambiguous.
bottleneck_counts = {}
ambiguous_count = 0
rejected_count = 0

for entry in classification_results:
    outcome = entry['classification']

    if outcome.top_matches:
        winning_id = outcome.top_matches[0].bottleneck_id
        bottleneck_counts.setdefault(winning_id, 0)
        bottleneck_counts[winning_id] += 1
    else:
        rejected_count += 1

    # Ambiguity is counted independently of whether a match exists.
    ambiguous_count += 1 if outcome.is_ambiguous else 0

print("\nClassification Summary:")
print("="*50)
print(f"Total classified: {len(classification_results)}")
print(f"Rejected (no match): {rejected_count}")
print(f"Ambiguous: {ambiguous_count}")
print("\nBottleneck distribution:")
for bid in sorted(bottleneck_counts):
    print(f"  Bottleneck {bid}: {bottleneck_counts[bid]} matches")

## Stage 3: Handle Ambiguous Classifications

Use comparative classifier for ambiguous cases

In [0]:
# Keep only the classifications flagged ambiguous that have at least two
# candidate bottlenecks to compare.
ambiguous_cases = [
    entry for entry in classification_results
    if entry['classification'].is_ambiguous and len(entry['classification'].top_matches) >= 2
]

if not ambiguous_cases:
    print("No ambiguous cases found in this batch.")
else:
    print(f"Found {len(ambiguous_cases)} ambiguous cases. Resolving with comparative classifier...\n")

    # Process first 3 for demo
    for case_number, case in enumerate(ambiguous_cases[:3], start=1):
        span = case['span']
        first, second = case['classification'].top_matches[:2]

        print(f"\nCase {case_number}: Comparing bottlenecks {first.bottleneck_id} vs {second.bottleneck_id}")
        print(f"  Original scores: {first.score:.2f} vs {second.score:.2f}")

        # Look up the full definitions of the two competing bottlenecks.
        first_definition = registry.get_bottleneck(first.bottleneck_id)
        second_definition = registry.get_bottleneck(second.bottleneck_id)

        # Head-to-head comparison of the two candidates.
        # NOTE(review): the winner is only printed; the causality-validation
        # stage still reads top_matches[0] — confirm whether that is intended.
        winner = comparative_classifier.compare_top_matches(span, first_definition, second_definition)
        print(f"  Winner: Bottleneck {winner}")

## Stage 4: Causality Validation

Validate that evidence shows actual causation, not just correlation

In [0]:
# Validate causality for top matches (first 5 classifications, demo-sized).
validated_results = []
validation_attempts = 0  # spans actually sent to the validator

for result in classification_results[:5]:
    if not result['classification'].top_matches:
        # The classifier found no candidate bottleneck, so there is nothing
        # to validate; such spans must not count towards the statistics.
        continue

    span = result['span']
    top_match = result['classification'].top_matches[0]

    print(f"\nValidating evidence for bottleneck {top_match.bottleneck_id}...")

    # Surrounding context handed to the validator.
    # NOTE(review): these values are hard-coded placeholders (empty
    # before/after text, fixed document type and section) rather than being
    # derived from the span — confirm whether real context should be used.
    context = {
        'before': "[]",
        'after': "[]",
        'document_type': 'PER',
        'section': 'Executive Summary'
    }

    # Validate causality (claims)
    causality_result = causality_validator.validate(span, context)
    validation_attempts += 1

    print(f"  Stated causation: {causality_result.is_stated_causation}")
    print(f"  Inferred causation: {causality_result.is_inferred_causation}")
    print(f"  Confidence: {causality_result.confidence:.2f}")
    print(f"  Validity: {causality_result.is_valid}")

    if causality_result.is_valid:
        validated_results.append({
            'span': span,
            'bottleneck_id': top_match.bottleneck_id,
            'classification': result['classification'],
            'causality': causality_result
        })

print(f"\n\nCausality Validation Summary:")
# Report against the spans that were really validated, not the slice size:
# skipped spans would otherwise inflate the total and the rejection rate.
print(f"  Total validated: {validation_attempts}")
print(f"  Passed validation: {len(validated_results)}")
if validation_attempts:
    rejection_rate = (1 - len(validated_results)/validation_attempts)*100
    print(f"  Rejection rate: {rejection_rate:.1f}%")
else:
    # Avoids ZeroDivisionError when no span reached the validator.
    print("  Rejection rate: n/a (nothing was validated)")

## Stage 5: Evidence Arbitration

Final quality control and scoring

In [0]:
# Arbitrate validated evidence; accepted items are collected as FinalEvidence.
final_evidence = []

print("Running final evidence arbitration...")
print("This ensures only high-quality, specific evidence passes.\n")

for validated in validated_results:
    span, bottleneck_id = validated['span'], validated['bottleneck_id']
    bottleneck = registry.get_bottleneck(bottleneck_id)
    causality = validated['causality']

    print(f"\nArbitrating evidence for bottleneck {bottleneck_id}...")

    # The arbitrator sees the span together with both earlier verdicts.
    arbitration_result = arbitrator.arbitrate(
        span=span,
        bottleneck=bottleneck,
        classification_result=validated['classification'],
        causality_result=causality,
    )

    print(f"  Final decision: {arbitration_result.final_decision}")
    print(f"  Quality score: {arbitration_result.quality_score:.2f}")
    print(f"  Specificity: {arbitration_result.specificity_score:.2f}")

    if arbitration_result.final_decision != 'ACCEPT':
        # Only accepted evidence becomes a final record.
        continue

    # NOTE(review): `confidence` is filled with the arbitration quality score,
    # not the causality confidence — confirm this is intended.
    final_evidence.append(FinalEvidence(
        document_id=span.document_id,
        document_name=span.document_name,
        chunk_id=span.chunk_id,
        page_number=span.page_number,
        bottleneck_id=bottleneck_id,
        bottleneck_name=bottleneck['name'],
        evidence_text=span.text,
        summary=arbitration_result.evidence_summary,
        key_points=arbitration_result.key_points,
        quality_score=arbitration_result.quality_score,
        confidence=arbitration_result.quality_score,
        is_causal=causality.is_stated_causation,
        specificity_score=arbitration_result.specificity_score,
    ))

In [0]:
# Summary of final evidence
banner = "="*60
print("\n" + banner)
print("FINAL EVIDENCE SUMMARY")
print(banner)
print(f"\nTotal final evidence pieces: {len(final_evidence)}")

if final_evidence:
    # Bucket the evidence by the bottleneck it supports.
    evidence_by_bottleneck = {}
    for ev in final_evidence:
        evidence_by_bottleneck.setdefault(ev.bottleneck_id, []).append(ev)

    print("\nEvidence by bottleneck:")
    for bid in sorted(evidence_by_bottleneck):
        evidences = evidence_by_bottleneck[bid]
        quality_total = sum(e.quality_score for e in evidences)
        avg_quality = quality_total / len(evidences)
        print(f"  Bottleneck {bid}: {len(evidences)} pieces (avg quality: {avg_quality:.2f})")

    # Highest-quality items first; show at most three.
    print("\nTop quality evidence:")
    sorted_evidence = sorted(final_evidence, key=lambda x: x.quality_score, reverse=True)
    for rank, ev in enumerate(sorted_evidence[:3], start=1):
        print(f"\n  [{rank}] Bottleneck {ev.bottleneck_id} (Quality: {ev.quality_score:.2f})")
        print(f"      Summary: {ev.summary[:150]}...")
        print(f"      Document: {ev.document_name[:50]}...")


## Full Pipeline Execution

Run the complete agentic pipeline end-to-end

In [0]:
# Full pipeline execution function
def run_agentic_pipeline(chunks: List[Dict], 
                         bottleneck_ids: Optional[List[str]] = None,
                         save_to_db: bool = False) -> List[FinalEvidence]:
    """
    Run the complete agentic pipeline
    
    Relies on the notebook-level ``registry``, ``pipeline`` and ``spark``
    objects.
    
    Args:
        chunks: List of document chunks
        bottleneck_ids: Optional list of bottleneck IDs to process
            (None or an empty list = all registered bottlenecks)
        save_to_db: Whether to save results to database. When True and the
            pipeline produced evidence, the results are written to a new,
            timestamp-suffixed table.
    
    Returns:
        List of final evidence pieces
    
    Raises:
        ValueError: If the registry returns None for any requested ID.
    """
    print("\n" + "#"*70)
    print("RUNNING FULL AGENTIC PIPELINE")
    print("#"*70)
    
    # Get bottlenecks to process
    if bottleneck_ids:
        bottlenecks = []
        unknown_ids = []
        for bid in bottleneck_ids:
            bottleneck = registry.get_bottleneck(bid)
            # Assumes the registry signals "not found" with None (TODO confirm);
            # if it raises instead, that exception simply propagates.
            if bottleneck is None:
                unknown_ids.append(bid)
            else:
                bottlenecks.append(bottleneck)
        if unknown_ids:
            # Fail with a clear message instead of a TypeError on b['id'] below.
            raise ValueError(f"Unknown bottleneck IDs: {unknown_ids}")
    else:
        bottlenecks = registry.get_all_bottlenecks()
    
    target_ids = [b['id'] for b in bottlenecks]
    
    print(f"\nProcessing {len(chunks)} chunks")
    print(f"Target bottlenecks: {target_ids}")
    
    # Run pipeline
    final_evidence = pipeline.process_documents(
        chunks=chunks,
        bottleneck_ids=target_ids
    )
    
    print(f"\nPipeline complete: {len(final_evidence)} final evidence pieces")
    
    # Save to database if requested
    if save_to_db and final_evidence:
        # Convert to DataFrame. Prefer model_dump() (pydantic v2) and fall back
        # to dict() so older model classes keep working.
        df_evidence = pd.DataFrame([
            ev.model_dump() if hasattr(ev, 'model_dump') else ev.dict()
            for ev in final_evidence
        ])
        
        # Save to Spark table; the timestamp suffix gives each run its own table.
        spark_df = spark.createDataFrame(df_evidence)
        table_name = f"prd_mega.sboost4.agentic_evidence_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        spark_df.write.mode('overwrite').saveAsTable(table_name)
        print(f"Saved to table: {table_name}")
    
    return final_evidence

In [0]:
# Run full pipeline on sample data
PIPELINE_SAMPLE_SIZE = 200

# First PIPELINE_SAMPLE_SIZE rows of the loaded chunks, as per-row dicts.
sample_chunks = df_chunks.iloc[:PIPELINE_SAMPLE_SIZE].to_dict(orient='records')

# Process specific bottlenecks or all
TARGET_BOTTLENECKS = ["1.1", "2.1", "3.1", "6.1"]  # Or None for all

pipeline_run_options = dict(
    chunks=sample_chunks,
    bottleneck_ids=TARGET_BOTTLENECKS,
    save_to_db=False,  # Set to True for production
)
final_results = run_agentic_pipeline(**pipeline_run_options)

## Performance Analysis

In [0]:
# Analyze pipeline performance metrics
def analyze_pipeline_metrics(chunks_input: int, final_evidence: List[FinalEvidence]):
    """
    Analyze performance metrics of the agentic pipeline
    
    Prints overall throughput and, when there is any evidence, quality,
    causality and per-bottleneck distribution figures.
    
    Args:
        chunks_input: Number of chunks that were fed into the pipeline
        final_evidence: Evidence pieces the pipeline produced
    
    Returns:
        None. All output goes to stdout.
    """
    print("\nPIPELINE PERFORMANCE METRICS")
    print("="*50)
    
    n_evidence = len(final_evidence)
    
    # Overall metrics
    print(f"\nOverall Performance:")
    print(f"  Input chunks: {chunks_input}")
    print(f"  Final evidence: {n_evidence}")
    # The printed rate is evidence pieces per input chunk. It is undefined for
    # an empty input, so report n/a instead of raising ZeroDivisionError.
    if chunks_input:
        print(f"  Overall precision rate: {n_evidence/chunks_input*100:.2f}%")
    else:
        print("  Overall precision rate: n/a (no input chunks)")
    
    if not final_evidence:
        # Nothing further to summarise.
        return
    
    # Quality metrics
    quality_scores = [ev.quality_score for ev in final_evidence]
    specificity_scores = [ev.specificity_score for ev in final_evidence]
    
    print(f"\nQuality Metrics:")
    print(f"  Average quality score: {sum(quality_scores)/len(quality_scores):.2f}")
    print(f"  Min quality score: {min(quality_scores):.2f}")
    print(f"  Max quality score: {max(quality_scores):.2f}")
    print(f"  Average specificity: {sum(specificity_scores)/len(specificity_scores):.2f}")
    
    # Causality metrics: count evidence whose is_causal flag is truthy
    causal_count = sum(1 for ev in final_evidence if ev.is_causal)
    print(f"\nCausality Analysis:")
    print(f"  Evidence with stated causation: {causal_count}")
    print(f"  Evidence with correlation only: {n_evidence - causal_count}")
    print(f"  Causal percentage: {causal_count/n_evidence*100:.1f}%")
    
    # Bottleneck distribution
    bottleneck_dist = {}
    for ev in final_evidence:
        bottleneck_dist[ev.bottleneck_id] = bottleneck_dist.get(ev.bottleneck_id, 0) + 1
    
    print(f"\nBottleneck Distribution:")
    for bid, count in sorted(bottleneck_dist.items()):
        print(f"  {bid}: {count} evidence pieces ({count/n_evidence*100:.1f}%)")

# Run analysis: compare the number of sampled chunks fed to the full pipeline
# with the evidence it returned.
analyze_pipeline_metrics(len(sample_chunks), final_results)

## Export Results

In [0]:
# Export results to Excel for review
if final_results:
    # Convert to DataFrame. Prefer model_dump() (pydantic v2) and fall back to
    # dict() so older model classes keep working.
    df_results = pd.DataFrame([
        ev.model_dump() if hasattr(ev, 'model_dump') else ev.dict()
        for ev in final_results
    ])
    
    # openpyxl cannot write list/dict cell values. Flatten any such values to
    # text; all other values are left untouched.
    # NOTE(review): presumably key_points is a list — verify against the model.
    def _excel_safe(value):
        if isinstance(value, (list, tuple, set)):
            return "; ".join(str(item) for item in value)
        if isinstance(value, dict):
            return str(value)
        return value
    
    for column in df_results.select_dtypes(include='object').columns:
        df_results[column] = df_results[column].map(_excel_safe)
    
    # Add metadata columns
    df_results['pipeline_type'] = 'agentic'
    df_results['processing_date'] = datetime.now()
    
    # Export to Excel
    output_dir = "/Volumes/prd_mega/sboost4/vboost4/Documents/input/Bottleneck/"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"{output_dir}agentic_results_{timestamp}.xlsx"
    
    # Make sure the destination folder exists before copying into it.
    os.makedirs(output_dir, exist_ok=True)
    
    # Unity Catalog volumes do not support the non-sequential writes that
    # Excel (zip) writers perform, so build the workbook on local disk first
    # and then copy the finished file to the volume.
    with tempfile.TemporaryDirectory() as local_dir:
        local_file = os.path.join(local_dir, os.path.basename(output_file))
        
        with pd.ExcelWriter(local_file, engine='openpyxl') as writer:
            # Main results
            df_results.to_excel(writer, sheet_name='Final Evidence', index=False)
            
            # Summary by bottleneck
            summary_df = df_results.groupby('bottleneck_id').agg({
                'evidence_text': 'count',
                'quality_score': 'mean',
                'specificity_score': 'mean',
                'is_causal': 'sum'
            }).round(2)
            summary_df.columns = ['count', 'avg_quality', 'avg_specificity', 'causal_count']
            summary_df.to_excel(writer, sheet_name='Summary')
            
            # Top evidence by quality
            top_evidence = df_results.nlargest(20, 'quality_score')[[
                'bottleneck_id', 'summary', 'quality_score', 'specificity_score', 'document_name'
            ]]
            top_evidence.to_excel(writer, sheet_name='Top Evidence', index=False)
        
        # copyfile copies contents only (no permission bits).
        shutil.copyfile(local_file, output_file)
    
    print(f"\nResults exported to: {output_file}")
    print(f"  Total evidence: {len(df_results)} pieces")
    print(f"  Bottlenecks covered: {df_results['bottleneck_id'].nunique()}")
else:
    print("No results to export")