# Complete End-to-End Example

This notebook demonstrates the complete hybrid pipeline with sample data.

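**Purpose**: Show how the three stages — fuzzy candidate matching, parent validation (mocked here, so no API calls are made), and hierarchy graph resolution — work together to identify company hierarchies.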


In [None]:
# Setup
import sys
sys.path.append('/Workspace/Repos/phmsa-company-hierarchy/')

import pandas as pd
from phmsa_hierarchy import ParentCandidateFinder, LLMValidator, HierarchyGraphBuilder

# For demonstration, we'll mock the LLM to avoid API calls
class MockLLM:
    """Stand-in for a chat model so the demo never makes an API call.

    ``invoke`` ignores its prompt and always hands back the same canned reply.
    """

    def invoke(self, prompt):
        """Return an object whose ``content`` attribute is a fixed JSON string.

        Args:
            prompt: Accepted for interface compatibility; not inspected.

        Returns:
            A ``Response`` instance exposing the canned text via ``.content``.
        """
        canned_reply = '{"parent": "ENBRIDGE", "confidence": 9, "reasoning": "Web search confirms this is a subsidiary"}'

        class Response:
            # Class-level attribute, so every instance exposes the same text.
            content = canned_reply

        return Response()

class MockSearch:
    """Stand-in for a web-search tool; yields placeholder text for any query."""

    def run(self, query):
        """Ignore ``query`` and return the fixed placeholder string."""
        placeholder_results = "Mock search results..."
        return placeholder_results

# Initialize components
# Mocks come first because the validator below is constructed from them.
llm, search_tool = MockLLM(), MockSearch()

candidate_finder = ParentCandidateFinder()      # used by the Stage 1 cell
llm_validator = LLMValidator(llm, search_tool)  # wired to the mocks; the Stage 2 cell uses hand-written mappings instead
graph_builder = HierarchyGraphBuilder()         # used by the Stage 3 and statistics cells

print("✓ All components initialized")


In [None]:
# Sample PHMSA Data
print("=== Sample PHMSA Data ===\n")

# Sample company names as a flat list; parent/child links are assigned in a later cell.
all_company_names = [
    "ENBRIDGE",
    "ENBRIDGE ENERGY, LIMITED PARTNERSHIP",
    "ENBRIDGE ENERGY PARTNERS LP",
    "WILLIAMS",
    "WILLIAMS FIELD SERVICES COMPANY",
    "WILLIAMS PIPELINE COMPANY",
    "COLONIAL PIPELINE COMPANY",
    "PLAINS PIPELINE, L.P.",
    "PLAINS ALL AMERICAN PIPELINE, L.P.",
    "PLAINS",
]

# Records carry a 1-based id in list order.
companies = [
    {"id": company_id, "name": company_name}
    for company_id, company_name in enumerate(all_company_names, start=1)
]

# Register the full name list with the candidate finder before any lookups.
candidate_finder.set_companies(all_company_names)

print(f"Loaded {len(companies)} companies:")
for company in companies:
    company_id, company_name = company["id"], company["name"]
    print(f"  {company_id:2d}. {company_name}")

print("\n✓ Data loaded")


In [None]:
# Stage 1: Find Candidates for Each Company
print("=== STAGE 1: Fuzzy Candidate Matching ===\n")

# One summary record per company: its name, how many candidates came back,
# and the raw candidate list returned by the finder.
stage1_results = []

for company in companies:
    company_name = company["name"]
    candidates = candidate_finder.find_candidates(company_name)

    stage1_results.append(
        {
            "company": company_name,
            "num_candidates": len(candidates),
            "candidates": candidates,
        }
    )

    print(f"{company_name}")
    if not candidates:
        print("  No candidates found → Likely ultimate parent")
    else:
        # Only the first three candidates are displayed; the full list is kept above.
        for i, candidate in enumerate(candidates[:3], 1):
            print(f"  {i}. {candidate['name']} (conf: {candidate['confidence']}, type: {candidate['match_type']})")
    print()

companies_with_candidates = sum(1 for result in stage1_results if result["num_candidates"] > 0)
print(f"✓ Stage 1 complete - found candidates for {companies_with_candidates} companies")


In [None]:
# Stage 2: Manually set parent relationships for demonstration
# (In production, this would use LLM validation)
print("=== STAGE 2: Parent Validation (Mock) ===\n")

# Hand-written (child, parent) pairs for this demo.
# NOTE(review): "ULTIMATE" looks like the marker for a company with no parent —
# confirm against what HierarchyGraphBuilder expects.
known_pairs = [
    ("ENBRIDGE ENERGY, LIMITED PARTNERSHIP", "ENBRIDGE"),
    ("ENBRIDGE ENERGY PARTNERS LP", "ENBRIDGE"),
    ("ENBRIDGE", "ULTIMATE"),
    ("WILLIAMS FIELD SERVICES COMPANY", "WILLIAMS"),
    ("WILLIAMS PIPELINE COMPANY", "WILLIAMS"),
    ("WILLIAMS", "ULTIMATE"),
    ("COLONIAL PIPELINE COMPANY", "ULTIMATE"),
    ("PLAINS PIPELINE, L.P.", "PLAINS ALL AMERICAN PIPELINE, L.P."),
    ("PLAINS ALL AMERICAN PIPELINE, L.P.", "PLAINS"),
    ("PLAINS", "ULTIMATE"),
]
parent_mappings = [
    {"child": child_name, "parent": parent_name}
    for child_name, parent_name in known_pairs
]

# Two string columns: 'child' and 'parent', one row per relationship.
parent_mappings_df = pd.DataFrame(parent_mappings)

print("Parent relationships identified:")
for child, parent in zip(parent_mappings_df["child"], parent_mappings_df["parent"]):
    # Child line, indented arrow line, then a blank separator line.
    print(f"  {child}\n    → {parent}\n")

print(f"✓ Stage 2 complete - {len(parent_mappings)} relationships identified")


In [None]:
# Stage 3: Build Hierarchy Graph
print("=== STAGE 3: Hierarchy Graph Resolution ===\n")

# Resolve the child → parent table from Stage 2 into one row per company.
hierarchy_df = graph_builder.build(parent_mappings_df)

# Columns shown in the printed table, in display order.
display_columns = ['company', 'ultimate_parent', 'hierarchy_path', 'hierarchy_depth']

print("Complete Hierarchy:")
hierarchy_table = hierarchy_df[display_columns].to_string(index=False)
print(hierarchy_table)

resolved_count = len(hierarchy_df)
print(f"\n✓ Stage 3 complete - computed ultimate parents for {resolved_count} companies")


In [None]:
# Analysis: Ultimate Parents and Their Subsidiaries
print("=== ANALYSIS: Corporate Families ===\n")

# Group by ultimate parent: {ultimate_parent: [company, ...]}
families = hierarchy_df.groupby('ultimate_parent')['company'].apply(list).to_dict()

# Build per-company lookups once, instead of re-filtering hierarchy_df with a
# boolean mask twice for every subsidiary. keep='first' takes the first matching
# row, the same row the earlier `.values[0]` lookup selected when a company name
# appears more than once.
company_details = hierarchy_df.drop_duplicates(subset='company', keep='first').set_index('company')
path_by_company = company_details['hierarchy_path'].to_dict()
depth_by_company = company_details['hierarchy_depth'].to_dict()

for parent, subsidiaries in families.items():
    print(f"{parent} ({len(subsidiaries)} companies)")
    for subsidiary in subsidiaries:
        # Skip the entry whose company name equals its own ultimate parent.
        if subsidiary == parent:
            continue
        path = path_by_company[subsidiary]
        depth = depth_by_company[subsidiary]
        print(f"  └─ {subsidiary} (depth: {depth})")
        # The full path is only printed for entries deeper than one level.
        if depth > 1:
            print(f"     Path: {path}")
    print()

print("✓ Analysis complete")


In [None]:
# Statistics
print("=== STATISTICS ===\n")

# Whatever key/value pairs the graph builder reports are printed as-is.
stats = graph_builder.get_statistics()

print("Graph Statistics:")
for stat_name, stat_value in stats.items():
    print(f"  {stat_name}: {stat_value}")

print("\nHierarchy Depth Distribution:")
# Count companies per depth, then order the rows by depth value.
depth_counts = hierarchy_df['hierarchy_depth'].value_counts()
depth_dist = depth_counts.sort_index()
for depth, count in depth_dist.items():
    print(f"  Depth {depth}: {count} companies")

banner = "=" * 50
print("\n✓ Statistics generated")
print("\n" + banner)
print("PIPELINE COMPLETE ✓")
print(banner)
