# PHMSA Company Hierarchy Analysis - Hybrid Approach

This notebook implements a hybrid approach combining:
1. **Fuzzy Matching**: Find candidate parents within PHMSA dataset
2. **LLM Validation**: Use LLM + web search to validate candidates
3. **Graph Resolution**: Compute ultimate parents from immediate relationships

## Benefits
- **Higher Accuracy**: LLM sees real PHMSA candidates → better decisions
- **Better Performance**: Fewer dead-end searches
- **Extensible**: Easy to add new matching rules or swap LLM providers


In [None]:
# Cell 1: Install dependencies
# Install (or upgrade, because of -U) the third-party packages this notebook imports.
%pip install -U langgraph langchain-community duckduckgo-search pandas
# Restart the notebook's Python process so the packages installed above are the
# versions picked up by the imports in the following cells.
# NOTE(review): a restart presumably discards any Python state defined earlier,
# so this cell should be run before everything else — confirm for your runtime.
dbutils.library.restartPython()


In [None]:
# Cell 2: Initialize LLM and Search Tool
import os
import json
from langchain_community.chat_models import ChatDatabricks
from langchain_community.tools import DuckDuckGoSearchResults

# Chat model backed by the "databricks-claude-sonnet-4-5" serving endpoint.
# Temperature 0 and a 1000-token cap are passed along as extra parameters.
llm = ChatDatabricks(
    extra_params=dict(temperature=0, max_tokens=1000),
    endpoint="databricks-claude-sonnet-4-5",
)

# DuckDuckGo-based web search tool (default configuration).
search_tool = DuckDuckGoSearchResults()

print("✓ LLM and search tool initialized")


In [None]:
# Cell 3: Import Hybrid Modules
import sys
# Put the project repo on the import path; this has to run before the
# `phmsa_hierarchy` import below.
sys.path.append('/Workspace/Repos/phmsa-company-hierarchy/')  # Update with your repo path
# NOTE(review): this only extends sys.path of the current Python process. The
# objects created below are later used inside a pandas UDF — confirm that the
# `phmsa_hierarchy` package is also importable on the Spark workers.

from phmsa_hierarchy import ParentCandidateFinder, LLMValidator, HierarchyGraphBuilder

# Initialize the three stages
# Stage 1: fuzzy candidate lookup (company list is supplied in Cell 4).
candidate_finder = ParentCandidateFinder()
# Stage 2: validation using the LLM and the web search tool created in Cell 2.
llm_validator = LLMValidator(llm, search_tool)
# Stage 3: hierarchy graph resolution (used in Cells 8 and 9).
graph_builder = HierarchyGraphBuilder()

print("✓ Hybrid modules imported and initialized")


In [None]:
# Cell 4: Load PHMSA Data
import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import *

# Load from Unity Catalog
source_table = "gshen_catalog.enbridge_sr_workshop.annual_hazardous_liquid_2024_updated"

# One row per distinct (operator id, name, street, city, state) combination.
# Note that the same company name can therefore appear on several rows.
companies_df = (
    spark.read.table(source_table)
    .select("OPERATOR_ID", "PARTA2NAMEOFCOMP", "PARTA4STREET", "PARTA4CITY", "PARTA4STATE")
    .distinct()
)

# For testing, limit to subset
# companies_df = companies_df.limit(50)

# Collect once and derive both the row count and the name list from the same
# result. Running count() and collect() as two separate Spark actions scans the
# table twice and, with the limit() above enabled, may not even see the same rows.
company_rows = companies_df.collect()
print(f"✓ Loaded {len(company_rows)} unique companies")

# Names used as the fuzzy-matching universe. Missing/empty names cannot be
# matched against, and duplicates (same name under several operator ids or
# addresses) would only yield repeated candidates, so both are dropped.
# dict.fromkeys() de-duplicates while keeping first-seen order.
all_company_names = list(
    dict.fromkeys(
        row.PARTA2NAMEOFCOMP for row in company_rows if row.PARTA2NAMEOFCOMP
    )
)
print(f"✓ Collected {len(all_company_names)} company names for matching")

# Set companies in candidate finder
candidate_finder.set_companies(all_company_names)


In [None]:
# Cell 5: Define Output Schema
# Struct type returned by the hybrid UDF: one nullable field per output column,
# assembled field by field with the StructType builder.
schema = (
    StructType()
    .add("OPERATOR_ID", LongType(), True)
    .add("ORIGINAL_NAME", StringType(), True)
    .add("IMMEDIATE_PARENT", StringType(), True)
    .add("CANDIDATES_FOUND", IntegerType(), True)
    .add("TOP_CANDIDATE", StringType(), True)
    .add("CONFIDENCE", IntegerType(), True)
    .add("REASONING", StringType(), True)
    .add("ACQUISITION_DATE", StringType(), True)
    .add("RECENT_CHANGE", BooleanType(), True)
)

print("✓ Schema defined")


In [None]:
# Cell 6: Define Hybrid Processing UDF
@pandas_udf(schema)
def find_parent_hybrid(ids, names, streets, cities, states):
    """
    Hybrid approach: fuzzy matching + LLM validation, one output row per input row.

    Stage 1: Find candidate parents using fuzzy matching
    Stage 2: Validate with LLM using web search

    Args:
        ids: Series of operator ids.
        names: Series of company names.
        streets: Series of street addresses.
        cities: Series of cities.
        states: Series of states.

    Returns:
        pandas DataFrame whose columns are exactly the fields of ``schema``,
        in schema order. A row that fails processing is not dropped: it is
        emitted with IMMEDIATE_PARENT set to "ERROR", CONFIDENCE 0 and the
        exception text in REASONING.
    """
    results = []

    for op_id, name, street, city, state in zip(ids, names, streets, cities, states):
        try:
            # Stage 1: Find candidates
            candidates = candidate_finder.find_candidates(name)

            # Build the address from the parts that are actually present, so a
            # missing street/city/state does not end up in the validator's
            # input as the literal text "None" or "nan".
            address_parts = (
                str(part).strip()
                for part in (street, city, state)
                if pd.notna(part)
            )
            address = ", ".join(part for part in address_parts if part)

            # Stage 2: Validate with LLM
            parent_info = llm_validator.validate(
                company_name=name,
                candidates=candidates,
                operator_id=op_id,
                address=address
            )

            results.append({
                "OPERATOR_ID": op_id,
                "ORIGINAL_NAME": name,
                "IMMEDIATE_PARENT": parent_info["parent"],
                "CANDIDATES_FOUND": len(candidates),
                "TOP_CANDIDATE": candidates[0]["name"] if candidates else None,
                "CONFIDENCE": parent_info["confidence"],
                "REASONING": parent_info["reasoning"],
                "ACQUISITION_DATE": parent_info.get("acquisition_date"),
                "RECENT_CHANGE": parent_info.get("recent_change", False)
            })

        except Exception as e:
            # Deliberately broad: a failure for one company (matching, search
            # or LLM call) must not fail the whole batch. The row is kept and
            # flagged so it can be found and retried later.
            results.append({
                "OPERATOR_ID": op_id,
                "ORIGINAL_NAME": name,
                "IMMEDIATE_PARENT": "ERROR",
                "CANDIDATES_FOUND": 0,
                "TOP_CANDIDATE": None,
                "CONFIDENCE": 0,
                "REASONING": f"Processing failed: {str(e)}",
                "ACQUISITION_DATE": None,
                "RECENT_CHANGE": False
            })

    # Pin the columns to the declared schema: an empty batch still yields a
    # frame with every expected column, and the column order never depends on
    # dict insertion order.
    return pd.DataFrame(results, columns=schema.fieldNames())

print("✓ Hybrid UDF defined")


In [None]:
# Cell 7: Process Companies (Stages 1 & 2)
print("Starting hybrid processing...")

# Apply hybrid UDF
# The UDF returns a single struct column ("result"); the second select expands
# that struct into one top-level column per schema field.
# This only defines the transformation — Spark evaluates it lazily.
parent_mappings_df = companies_df.select(
    find_parent_hybrid(
        "OPERATOR_ID", 
        "PARTA2NAMEOFCOMP", 
        "PARTA4STREET", 
        "PARTA4CITY", 
        "PARTA4STATE"
    ).alias("result")
).select("result.*")

# Cache for performance
# Marking the frame for caching before the first action lets later actions on
# parent_mappings_df reuse the computed rows instead of re-running the UDF.
# NOTE(review): if cached data is evicted, a later action would presumably
# re-invoke the UDF (and its LLM/search calls) — confirm whether the results
# should be persisted to a table instead.
parent_mappings_df.cache()

# count() is the first action on the frame, so this is where the UDF executes.
print(f"✓ Processed {parent_mappings_df.count()} companies")

# Show sample results
display(parent_mappings_df.limit(10))


In [None]:
# Cell 8: Build Hierarchy Graph (Stage 3)
print("Building hierarchy graph...")

# Convert to pandas for graph building
parent_mappings_pd = parent_mappings_df.toPandas()

# Rename columns for graph builder.
# The renamed frame is kept separate on purpose: parent_mappings_pd must keep
# its ORIGINAL_NAME / IMMEDIATE_PARENT columns, because the ownership-change
# review in Cell 10 reads them by those names and would otherwise raise KeyError.
graph_input_pd = parent_mappings_pd.rename(columns={
    "ORIGINAL_NAME": "child",
    "IMMEDIATE_PARENT": "parent"
})

# Build hierarchy graph
# NOTE(review): rows that failed in the UDF carry the placeholder parent
# "ERROR" and are passed through unchanged — confirm how the graph builder
# treats that value.
hierarchy_df = graph_builder.build(graph_input_pd)

print(f"✓ Built hierarchy graph with {len(hierarchy_df)} companies")

# Show sample hierarchy results
print("\nSample hierarchy paths:")
print(hierarchy_df[['company', 'ultimate_parent', 'hierarchy_path', 'hierarchy_depth']].head(10))


In [None]:
# Cell 9: Get Statistics
# Print the graph builder's statistics as "name: value" lines.
print("=== Graph Statistics ===")
stats = graph_builder.get_statistics()
for stat_name, stat_value in stats.items():
    print(f"{stat_name}: {stat_value}")

# How many companies fall on each confidence value, ordered by confidence.
print("", "=== Confidence Distribution ===", sep="\n")
confidence_counts = parent_mappings_pd.loc[:, 'CONFIDENCE'].value_counts()
confidence_dist = confidence_counts.sort_index()
print(confidence_dist)

# How many companies had 0, 1, 2, ... candidates, ordered by candidate count.
print("", "=== Candidates Found Distribution ===", sep="\n")
candidate_counts = parent_mappings_pd.loc[:, 'CANDIDATES_FOUND'].value_counts()
candidates_dist = candidate_counts.sort_index()
print(candidates_dist)


In [None]:
# Cell 10: Review Recent Ownership Changes
print("=== Recent Ownership Changes (2024+) ===\n")

# The company/parent columns of parent_mappings_pd are named either
# ORIGINAL_NAME / IMMEDIATE_PARENT (as produced by the UDF) or child / parent
# (after being renamed for the graph builder). Resolve the names once so the
# report works with either form instead of raising KeyError.
available_columns = set(parent_mappings_pd.columns)
name_col = "ORIGINAL_NAME" if "ORIGINAL_NAME" in available_columns else "child"
parent_col = "IMMEDIATE_PARENT" if "IMMEDIATE_PARENT" in available_columns else "parent"

# Keep only rows explicitly flagged True; missing flags count as "no change".
recent_changes = parent_mappings_pd[parent_mappings_pd['RECENT_CHANGE'].eq(True)]

if not recent_changes.empty:
    print(f"Found {len(recent_changes)} companies with recent ownership changes:\n")
    for _, row in recent_changes.iterrows():
        print(f"Company: {row[name_col]}")
        print(f"  New Parent: {row[parent_col]}")
        print(f"  Acquisition Year: {row['ACQUISITION_DATE']}")
        print(f"  Reasoning: {row['REASONING']}")
        print()

    print("⚠️  RECOMMENDATION: Manually verify these recent changes")
else:
    print("No recent ownership changes detected (2024+)")

print("\n✓ Recent changes review complete")


In [None]:
# Cell 11: Save Results to Unity Catalog
output_table = "gshen_catalog.enbridge_sr_workshop.operator_hierarchy_hybrid"

# Convert hierarchy results back to Spark DataFrame
hierarchy_spark_df = spark.createDataFrame(hierarchy_df)

# Spark resolves column names case-insensitively by default, so an unqualified
# "immediate_parent" matches both the hierarchy frame's column (if it has one)
# and the mappings frame's IMMEDIATE_PARENT, which makes the reference
# ambiguous. Pick the side explicitly: prefer the hierarchy frame's value and
# fall back to the mapping's when the hierarchy frame has no such column.
hierarchy_has_parent = any(
    column.lower() == "immediate_parent" for column in hierarchy_spark_df.columns
)
immediate_parent_col = (
    hierarchy_spark_df["immediate_parent"]
    if hierarchy_has_parent
    else parent_mappings_df["IMMEDIATE_PARENT"]
).alias("immediate_parent")

# Join with original parent mappings to get all fields.
# Left join: every company in the hierarchy is kept, even without a mapping row.
# NOTE(review): the join key is the company name; if several operator ids share
# a name, rows will be multiplied — confirm whether that is intended.
final_df = hierarchy_spark_df.join(
    parent_mappings_df,
    hierarchy_spark_df.company == parent_mappings_df.ORIGINAL_NAME,
    "left"
).select(
    "OPERATOR_ID",
    parent_mappings_df.ORIGINAL_NAME,
    immediate_parent_col,
    "ultimate_parent",
    "hierarchy_path",
    "hierarchy_depth",
    "has_cycle",
    "CANDIDATES_FOUND",
    "TOP_CANDIDATE",
    "CONFIDENCE",
    "REASONING",
    "ACQUISITION_DATE",
    "RECENT_CHANGE"
)

# Save to Unity Catalog
final_df.write.mode("overwrite").saveAsTable(output_table)

# Report on what was actually written: counting the saved table avoids
# recomputing the whole join (and anything upstream of it) a second time.
saved_df = spark.table(output_table)

print(f"✓ Results saved to: {output_table}")
print(f"  Total records: {saved_df.count()}")

# Display final results
display(saved_df.limit(20))
