# Structured Pipeline - Main Notebook
## PFM Bottleneck Analysis Pipeline

This notebook demonstrates the structured pipeline approach for bottleneck identification and validation.

### Setup and Imports

In [0]:
# Install required packages
!pip install instructor
!pip install azure.identity openai
dbutils.library.restartPython()

In [0]:
# Imports
import os
import shutil
import sys
import tempfile
from datetime import datetime

import pandas as pd
from pyspark.sql import SparkSession

# Add the structured_pipeline directory to path
sys.path.append('/Workspace/structured_pipeline')

# Import pipeline components
from core.pipeline import BottleneckPipeline
from core.services import AzureOpenAIService
from core.config import LLM_MODEL, TEMPERATURE
from core.database import DatabaseManager

### Initialize Services

In [0]:
# Obtain the Spark session (getOrCreate returns the existing one if present)
spark = (
    SparkSession.builder
    .appName("BottleneckAnalysis")
    .getOrCreate()
)

# Build the two services and hand them to the pipeline
llm_service = AzureOpenAIService()
db_manager = DatabaseManager(spark)
pipeline = BottleneckPipeline(llm_service, db_manager)

# Report the configuration values imported from core.config
print(
    "Pipeline initialized successfully",
    f"Using model: {LLM_MODEL}",
    f"Temperature: {TEMPERATURE}",
    sep="\n",
)

### Load Input Data

In [0]:
# Filters applied when selecting chunks
COUNTRY_CODES = ['MLI', 'BFA']  # Mali and Burkina Faso
MIN_CHUNK_LENGTH = 1000

# Render the country codes as a quoted, comma-separated list for the IN clause
country_code_list = ','.join(f"'{cc}'" for cc in COUNTRY_CODES)

# Select PER/PFR chunks for the chosen countries, joined to document metadata
query = f"""
SELECT 
    c.document_id,
    c.chunk_id,
    c.chunk_text,
    c.page_number,
    d.document_name,
    d.country_code
FROM prd_mega.sboost4.per_pfr_chunks c
JOIN prd_corpdata.dm_reference_gold.v_dim_imagebank_document d
    ON c.document_id = d.document_id
WHERE d.country_code IN ({country_code_list})
    AND LENGTH(c.chunk_text) >= {MIN_CHUNK_LENGTH}
    AND d.document_type IN ('PER', 'PFR')
ORDER BY c.document_id, c.chunk_id
"""

# Run the query on Spark and collect the result into a pandas DataFrame
chunks_sdf = spark.sql(query)
df_chunks = chunks_sdf.toPandas()

document_count = df_chunks['document_id'].nunique()
print(f"Loaded {len(df_chunks)} chunks from {document_count} documents")
print(f"Countries: {df_chunks['country_code'].unique()}")

In [0]:
# Show the first five loaded chunks as the cell output
df_chunks.head(n=5)

### Select Bottleneck to Process

In [0]:
# Ask the pipeline which bottlenecks it knows about and print a short catalogue
available_bottlenecks = pipeline.list_bottlenecks()

print("Available bottlenecks:")
for bottleneck_id, info in available_bottlenecks.items():
    print(f"  {bottleneck_id}: {info['name']}")
    # Only the first 100 characters of the description are shown
    description_preview = info['description'][:100]
    print(f"    Description: {description_preview}...", end="\n\n")

In [0]:
# Bottleneck used by the stage-by-stage cells below
BOTTLENECK_ID = "1.1"  # Change this to process different bottlenecks

print(f"Selected bottleneck: {BOTTLENECK_ID}")
selected_info = available_bottlenecks[BOTTLENECK_ID]
print(f"Name: {selected_info['name']}")

## Stage 1: Extraction

Extract potential evidence from document chunks

In [0]:
# Option 1: work on a small random subset while testing
SAMPLE_SIZE = 10  # Adjust as needed

# Never request more rows than exist; the fixed seed keeps the draw repeatable
sample_row_count = min(SAMPLE_SIZE, len(df_chunks))
df_sample = df_chunks.sample(n=sample_row_count, random_state=42)

print(f"Processing {len(df_sample)} chunks for bottleneck {BOTTLENECK_ID}")

In [0]:
# Run extraction on the sampled chunks (passed as a list of row dicts)
df_extracted = pipeline.run_extraction(
    bottleneck_id=BOTTLENECK_ID,
    chunks=df_sample.to_dict('records'),
    save_to_db=False  # Set to True to save to database
)

# Guard the rate calculation: if the query returned no chunks the sample is
# empty and dividing by its length would raise ZeroDivisionError.
chunks_processed = len(df_sample)
if chunks_processed > 0:
    extraction_rate = len(df_extracted) / chunks_processed * 100
else:
    extraction_rate = 0.0

print(f"\nExtraction complete:")
print(f"  Total chunks processed: {chunks_processed}")
print(f"  Chunks with evidence: {len(df_extracted)}")
print(f"  Extraction rate: {extraction_rate:.1f}%")

In [0]:
# Print a truncated preview of the first three extracted rows (if any)
if len(df_extracted) > 0:
    print("Sample of extracted evidence:")
    evidence_preview = df_extracted.head(3)
    for idx, row in evidence_preview.iterrows():
        print(f"\n--- Evidence {idx+1} ---")
        document_name = row['document_name'][:50]
        print(f"Document: {document_name}...")
        evidence_text = row['evidence_text'][:200]
        print(f"Evidence: {evidence_text}...")
        relevance = row['relevance_score']
        print(f"Relevance: {relevance:.2f}")
        reasoning = row['reasoning'][:150]
        print(f"Reasoning: {reasoning}...")

## Stage 2: Validation

Validate extracted evidence using detailed criteria

In [0]:
# Validate the extracted evidence; with nothing extracted, fall back to an
# empty DataFrame so later cells still have a df_validated to inspect
if len(df_extracted) == 0:
    print("No evidence to validate")
    df_validated = pd.DataFrame()
else:
    df_validated = pipeline.run_validation(
        bottleneck_id=BOTTLENECK_ID,
        df_extracted=df_extracted,
        save_to_db=False  # Set to True to save to database
    )

    print(f"\nValidation complete:")
    print(f"  Evidence validated: {len(df_validated)}")
    valid_flags = df_validated['is_valid']
    print(f"  Validated as relevant: {valid_flags.sum()}")
    print(f"  Validation rate: {valid_flags.mean()*100:.1f}%")

In [0]:
# Print the key validation columns for the first few rows (if any)
if len(df_validated) > 0:
    print("\nValidation summary:")
    summary_columns = [
        'document_name',
        'evidence_text',
        'is_valid',
        'confidence',
        'validation_reasoning',
    ]
    print(df_validated[summary_columns].head())

## Stage 3: Formatting

Format validated evidence for output

In [0]:
# Run formatting on validated evidence.
# When Stage 2 had nothing to validate, df_validated is a column-less empty
# DataFrame, so indexing it with ['is_valid'] would raise KeyError. Guard the
# lookup and treat that case as "no valid evidence".
if 'is_valid' in df_validated.columns:
    df_valid_evidence = df_validated[df_validated['is_valid']]
else:
    df_valid_evidence = df_validated.iloc[0:0]

if len(df_valid_evidence) > 0:
    df_formatted = pipeline.run_formatting(
        bottleneck_id=BOTTLENECK_ID,
        df_validated=df_valid_evidence,
        save_to_db=False  # Set to True to save to database
    )

    print(f"\nFormatting complete:")
    print(f"  Evidence formatted: {len(df_formatted)}")
else:
    print("No valid evidence to format")
    df_formatted = pd.DataFrame()

In [0]:
# Print the main fields of the first two formatted rows (if any)
if len(df_formatted) > 0:
    print("\nFormatted evidence summary:")
    # (label shown to the reader, column read from the row), in display order
    displayed_fields = (
        ("Summary", 'summary'),
        ("Key Points", 'key_points'),
        ("Document", 'document_name'),
        ("Page", 'page_number'),
    )
    for idx, row in df_formatted.head(2).iterrows():
        print(f"\n--- Final Evidence {idx+1} ---")
        for label, column in displayed_fields:
            print(f"{label}: {row[column]}")

## Full Pipeline Execution

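Run all three stages (extraction, validation, formatting) sequentially. The helper persists each stage to the database by default (`save_to_db=True`); the example call below passes `save_to_db=False`, so nothing is saved while testing.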

In [0]:
# Run full pipeline for a bottleneck
def run_full_pipeline(bottleneck_id, chunks_df, save_to_db=True):
    """
    Execute extraction, validation and formatting for one bottleneck.

    Uses the notebook-level ``pipeline`` object and stops early as soon as a
    stage produces nothing to pass on.

    Args:
        bottleneck_id: Identifier forwarded to every pipeline stage.
        chunks_df: Chunks to process; converted with ``to_dict('records')``
            before extraction (presumably a pandas DataFrame).
        save_to_db: Forwarded unchanged to each stage's ``save_to_db`` argument.

    Returns:
        A 3-tuple ``(extracted, validated, formatted)``:
        ``(None, None, None)`` when extraction yields no rows,
        ``(extracted, validated, None)`` when no row has a truthy ``is_valid``,
        otherwise the outputs of all three stages.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Running full pipeline for bottleneck {bottleneck_id}")
    print(f"{rule}")

    # --- Stage 1: extraction -------------------------------------------
    print("\n[Stage 1] Extraction...")
    chunk_records = chunks_df.to_dict('records')
    extracted = pipeline.run_extraction(
        bottleneck_id=bottleneck_id,
        chunks=chunk_records,
        save_to_db=save_to_db,
    )
    print(f"  Extracted {len(extracted)} pieces of evidence")

    if not len(extracted):
        print("  No evidence found, stopping pipeline")
        return None, None, None

    # --- Stage 2: validation -------------------------------------------
    print("\n[Stage 2] Validation...")
    validated = pipeline.run_validation(
        bottleneck_id=bottleneck_id,
        df_extracted=extracted,
        save_to_db=save_to_db,
    )
    valid_count = validated['is_valid'].sum()
    print(f"  Validated {valid_count}/{len(validated)} as relevant")

    if valid_count == 0:
        print("  No valid evidence, stopping pipeline")
        return extracted, validated, None

    # --- Stage 3: formatting (only rows flagged is_valid) --------------
    print("\n[Stage 3] Formatting...")
    relevant_rows = validated[validated['is_valid']]
    formatted = pipeline.run_formatting(
        bottleneck_id=bottleneck_id,
        df_validated=relevant_rows,
        save_to_db=save_to_db,
    )
    print(f"  Formatted {len(formatted)} final evidence pieces")

    return extracted, validated, formatted

In [0]:
# Exercise the full pipeline on a reproducible random subset of the chunks
FULL_PIPELINE_SAMPLE = 50  # Adjust size as needed

full_sample_rows = min(FULL_PIPELINE_SAMPLE, len(df_chunks))
df_full_sample = df_chunks.sample(n=full_sample_rows, random_state=42)

results = run_full_pipeline(
    BOTTLENECK_ID,
    df_full_sample,
    save_to_db=False,  # Set to True for production
)

# Unpack the per-stage outputs (any of them may be None on early stop)
df_ext, df_val, df_fmt = results

## Batch Processing Multiple Bottlenecks

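Process multiple bottlenecks sequentially. Every bottleneck is run against the same fixed-seed random sample of chunks, and results are kept in memory only (`save_to_db=False`).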

In [0]:
# Process multiple bottlenecks
BOTTLENECKS_TO_PROCESS = ["1.1", "2.1", "3.1", "6.1"]
BATCH_SIZE = 100  # Number of chunks per bottleneck

# The sample is drawn with a fixed seed, so it is identical for every
# bottleneck. Draw it once up front instead of re-sampling the same rows on
# each loop iteration.
df_batch = df_chunks.sample(n=min(BATCH_SIZE, len(df_chunks)), random_state=42)

# Maps bottleneck id -> (df_extracted, df_validated, df_formatted); entries may
# contain None when the pipeline stopped early for that bottleneck.
all_results = {}

for bottleneck_id in BOTTLENECKS_TO_PROCESS:
    print(f"\n\n{'#'*70}")
    print(f"Processing Bottleneck {bottleneck_id}")
    print(f"{'#'*70}")

    # Run pipeline (results are kept in memory only)
    results = run_full_pipeline(
        bottleneck_id=bottleneck_id,
        chunks_df=df_batch,
        save_to_db=False
    )

    all_results[bottleneck_id] = results

print("\n\n" + "="*70)
print("BATCH PROCESSING COMPLETE")
print("="*70)

In [0]:
# Per-bottleneck counts for each stage that actually produced output
print("\nBatch Processing Summary:", "-" * 50, sep="\n")

for bottleneck_id in all_results:
    df_ext, df_val, df_fmt = all_results[bottleneck_id]
    print(f"\nBottleneck {bottleneck_id}:")
    # A stage output of None means the pipeline stopped before that stage
    if df_ext is not None:
        print(f"  Extracted: {len(df_ext)} evidence pieces")
    if df_val is not None:
        relevant_count = df_val['is_valid'].sum()
        print(f"  Validated: {relevant_count}/{len(df_val)} as relevant")
    if df_fmt is not None:
        print(f"  Formatted: {len(df_fmt)} final pieces")

## Export Results to Excel

Export formatted results for review

In [0]:
# Export results to Excel
output_dir = "/Volumes/prd_mega/sboost4/vboost4/Documents/input/Bottleneck/"
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Excel workbooks are zip archives that the writer does not produce as one
# sequential stream, and Databricks documents that Unity Catalog volumes do not
# support non-sequential writes (Excel and Zip files are named explicitly).
# Build each workbook on local disk first, then copy the finished file to the
# volume in a single sequential write.
with tempfile.TemporaryDirectory() as local_dir:
    for bottleneck_id, (df_ext, df_val, df_fmt) in all_results.items():
        # Only bottlenecks that reached the formatting stage are exported;
        # for those, the extraction and validation frames are also present.
        if df_fmt is None or len(df_fmt) == 0:
            continue

        file_name = f"bottleneck_{bottleneck_id}_{timestamp}.xlsx"
        local_file = os.path.join(local_dir, file_name)
        output_file = f"{output_dir}{file_name}"

        with pd.ExcelWriter(local_file, engine='openpyxl') as writer:
            df_fmt.to_excel(writer, sheet_name='Final Evidence', index=False)
            df_val.to_excel(writer, sheet_name='Validation Details', index=False)
            df_ext.to_excel(writer, sheet_name='Extraction Details', index=False)

        # copyfile transfers content only (no permission/metadata calls)
        shutil.copyfile(local_file, output_file)

        print(f"Exported bottleneck {bottleneck_id} results to: {output_file}")

## Monitoring and Analysis

In [0]:
# Analyze pipeline performance
def analyze_pipeline_performance(results_dict, batch_size=None):
    """
    Analyze performance metrics across bottlenecks.

    Args:
        results_dict: Mapping of bottleneck id to a
            ``(df_extracted, df_validated, df_formatted)`` tuple. Any element
            may be ``None`` when the pipeline stopped before that stage.
        batch_size: Number of chunks processed per bottleneck, used as the
            denominator of the extraction and final rates. Defaults to the
            notebook-level ``BATCH_SIZE``; pass the real sample size when fewer
            chunks than ``BATCH_SIZE`` were available.

    Returns:
        pandas.DataFrame with one row per bottleneck and the columns
        ``bottleneck_id``, ``extraction_rate``, ``validation_rate``,
        ``final_rate``, ``extracted_count``, ``validated_count`` and
        ``final_count``. Bottlenecks with no extracted evidence are reported
        with zeros rather than dropped, and the columns are present even when
        ``results_dict`` is empty so downstream column lookups keep working.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    columns = [
        'bottleneck_id',
        'extraction_rate',
        'validation_rate',
        'final_rate',
        'extracted_count',
        'validated_count',
        'final_count',
    ]
    metrics = []

    for bottleneck_id, (df_ext, df_val, df_fmt) in results_dict.items():
        extracted_count = len(df_ext) if df_ext is not None else 0
        final_count = len(df_fmt) if df_fmt is not None else 0

        # Only read 'is_valid' when there are validation rows: a missing or
        # empty validation frame may not have that column at all.
        if df_val is not None and len(df_val) > 0:
            validated_count = df_val['is_valid'].sum()
            validation_rate = df_val['is_valid'].mean()
        else:
            validated_count = 0
            validation_rate = 0

        extraction_rate = extracted_count / batch_size if batch_size > 0 else 0
        final_rate = final_count / batch_size if batch_size > 0 else 0

        metrics.append({
            'bottleneck_id': bottleneck_id,
            'extraction_rate': extraction_rate,
            'validation_rate': validation_rate,
            'final_rate': final_rate,
            'extracted_count': extracted_count,
            'validated_count': validated_count,
            'final_count': final_count
        })

    # Passing columns explicitly keeps the schema stable for an empty result
    return pd.DataFrame(metrics, columns=columns)

# Build the metrics table for the batch run and print it in full as plain text
df_performance = analyze_pipeline_performance(results_dict=all_results)
print(
    "Pipeline Performance Analysis:",
    df_performance.to_string(),
    sep="\n",
)

In [0]:
# Visualize performance (if matplotlib is available)
try:
    import matplotlib.pyplot as plt
    
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    
    # Extraction rates
    axes[0].bar(df_performance['bottleneck_id'], df_performance['extraction_rate'])
    axes[0].set_title('Extraction Rate by Bottleneck')
    axes[0].set_ylabel('Rate')
    axes[0].set_ylim(0, 1)
    
    # Validation rates
    axes[1].bar(df_performance['bottleneck_id'], df_performance['validation_rate'])
    axes[1].set_title('Validation Rate by Bottleneck')
    axes[1].set_ylabel('Rate')
    axes[1].set_ylim(0, 1)
    
    # Final counts
    axes[2].bar(df_performance['bottleneck_id'], df_performance['final_count'])
    axes[2].set_title('Final Evidence Count by Bottleneck')
    axes[2].set_ylabel('Count')
    
    plt.tight_layout()
    plt.show()
    
except ImportError:
    print("Matplotlib not available for visualization")

## Clean Up

In [0]:
# Clean up resources: stop the Spark session used by this notebook.
# NOTE(review): `spark` was obtained with getOrCreate(), so it may be a session
# that already existed and is shared with other notebooks on the same cluster.
# Confirm that stopping it here is intended in the target environment.
spark.stop()
print("Pipeline execution complete. Spark session closed.")