# Stage 1 Test: Candidate Matching

This notebook exercises `ParentCandidateFinder`, the fuzzy-matching candidate finder, in isolation against a small hand-picked list of sample company names.

**Purpose**: Validate that fuzzy matching correctly identifies potential parent companies.


In [None]:
# Setup: make the project package importable, then build the finder under test.
import sys

# Directory added to the module search path so `phmsa_hierarchy` can be imported
# (presumably the repo checkout in the workspace — confirm for your environment).
REPO_PATH = '/Workspace/Repos/phmsa-company-hierarchy/'

# Guard the append so that re-running this cell does not keep piling duplicate
# entries onto sys.path.
if REPO_PATH not in sys.path:
    sys.path.append(REPO_PATH)

from phmsa_hierarchy import ParentCandidateFinder

# Sample PHMSA companies for testing.
# The list includes two short names (ENBRIDGE, WILLIAMS) alongside longer names
# that contain them; there is no bare EXXONMOBIL, PLAINS or CHEVRON entry.
test_companies = [
    "ENBRIDGE",
    "ENBRIDGE ENERGY, LIMITED PARTNERSHIP",
    "ENBRIDGE ENERGY PARTNERS LP",
    "WILLIAMS",
    "WILLIAMS FIELD SERVICES - GULF COAST COMPANY, LP",
    "WILLIAMS PIPELINE COMPANY",
    "EXXONMOBIL PIPELINE COMPANY LLC",
    "EXXONMOBIL OIL CORPORATION",
    "EXXONMOBIL PRODUCTION COMPANY, A DIVISION OF EXXON MOBIL CORPORATION",
    "PLAINS PIPELINE, L.P.",
    "CHEVRON PIPE LINE CO",
    "CHEVRON PRODUCTS COMPANY"
]

# The finder is constructed once here and reused by every test cell below.
candidate_finder = ParentCandidateFinder(test_companies)
print(f"✓ Loaded {len(test_companies)} test companies")


In [None]:
# Test 1: Name Containment
# Looks up candidates for one Enbridge entity, prints each candidate's fields,
# then asserts that 'ENBRIDGE' is among the returned names.
print("=== Test 1: Name Containment ===")
print("Company: ENBRIDGE ENERGY, LIMITED PARTNERSHIP")
print("Expected: Should find 'ENBRIDGE' as candidate\n")

enbridge_subsidiary = "ENBRIDGE ENERGY, LIMITED PARTNERSHIP"
candidates = candidate_finder.find_candidates(enbridge_subsidiary)

# Number the candidates from 1 in the order the finder returned them.
rank = 0
for match in candidates:
    rank += 1
    print(f"{rank}. {match['name']}")
    print(f"   Confidence: {match['confidence']}")
    print(f"   Reason: {match['reason']}")
    print(f"   Match Type: {match['match_type']}\n")

# Validation
assert len(candidates) >= 1, "Should find at least one candidate"
assert "ENBRIDGE" in (entry['name'] for entry in candidates), "Should find ENBRIDGE as parent"
print("✓ Test passed!")


In [None]:
# Test 2: Multiple Subsidiaries
# Runs the finder on each Williams subsidiary name, reports the first returned
# candidate, and verifies that 'WILLIAMS' appears among the candidates — the
# same kind of check Test 1 applies to ENBRIDGE. Previously this cell had no
# assertions and reported completion regardless of what the finder returned.
print("=== Test 2: Multiple Subsidiaries ===")
print("Testing multiple Williams subsidiaries\n")

williams_subsidiaries = [
    "WILLIAMS FIELD SERVICES - GULF COAST COMPANY, LP",
    "WILLIAMS PIPELINE COMPANY"
]

# Subsidiaries for which 'WILLIAMS' was not returned. Checked after the loop so
# that every subsidiary's result is printed before any failure is raised.
missing_parent = []

for subsidiary in williams_subsidiaries:
    print(f"Testing: {subsidiary}")
    candidates = candidate_finder.find_candidates(subsidiary)

    if candidates:
        # assumes the finder returns candidates best-first — TODO confirm
        top_candidate = candidates[0]
        print(f"  Top candidate: {top_candidate['name']} ({top_candidate['confidence']})")
    else:
        print("  No candidates found")

    # `or []` keeps the check safe if the finder returns None for "no matches".
    if not any(c['name'] == "WILLIAMS" for c in candidates or []):
        missing_parent.append(subsidiary)

    print()

# Validation
assert not missing_parent, f"Should find WILLIAMS as parent for: {missing_parent}"
print("✓ Test completed")


In [None]:
# Test 3: Statistics
# Fetches the statistics dict for one company and prints its fields. This cell
# makes no assertions; it only fails if a key is missing or formatting raises.
print("=== Test 3: Matching Statistics ===\n")

test_company = "EXXONMOBIL PIPELINE COMPANY LLC"
stats = candidate_finder.get_statistics(test_company)

print(f"Company: {test_company}")
print(f"Total candidates: {stats['total_candidates']}")
print(f"Match types: {stats['match_types']}")
print(f"Average confidence: {stats['avg_confidence']:.2f}")

# The top-candidate entry may be falsy (nothing to report); print it only if set.
best_match = stats['top_candidate']
if best_match:
    print(f"\nTop candidate: {best_match['name']}")
    print(f"Confidence: {best_match['confidence']}")

print("\n✓ Statistics generated successfully")
