# Universal Billing Code Extraction

**Step 2 of the pipeline:**
1. Crosswalk notebook finds applicable codes → exports results
2. **This notebook** extracts the full 190-column profile for each code

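**Output format:** Wide table (1 row per code, 190 columns). Note: the column lists defined in Section 2 currently total 178 names (6 identifier + 172 field columns), so the printed column count will read 178 until the lists and this figure are reconciled.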

**Value System (7 values):**
| Value | Meaning |
|-------|---------|
| `YES` | Explicitly stated as allowed/applicable |
| `NO` | Explicitly stated as prohibited/not applicable |
| `DEFAULT_YES` | Not stated, but logically follows from code type |
| `DEFAULT_NO` | Not stated, but logically incompatible with code type |
| `N/A` | Field category doesn't apply to this service type |
| `CONDITIONAL:{note}` | Depends on circumstances |
| `UNKNOWN` | Genuinely can't determine — needs human review |

**Goal:** Minimize UNKNOWN by using logical inference

## 1. Setup

In [None]:
# Install the notebook's third-party dependencies quietly (-q). The leading
# "!" hands the line to a shell, so this cell only runs inside
# IPython/Jupyter/Colab, not as a plain Python script.
!pip install openai pandas pdfplumber openpyxl tqdm -q
print('Ready')

## 2. Define Column Structure (190 columns)

In [None]:
# Fixed identifier columns (first 6).
# extract_code_profile() populates these directly from its arguments and
# constants before any model output is merged in.
ID_COLUMNS = [
    'billing_code',
    'province',
    'billing_name',
    'base_rate',
    'modality',
    'source_document'
]

# Field columns organized by category. Names follow `<category>__<field>`,
# except for the four un-prefixed metadata columns at the end.
# NOTE(review): this list holds 172 names (178 together with ID_COLUMNS),
# whereas the notebook text speaks of 184 field / 190 total columns —
# reconcile the documentation or the list.
FIELD_COLUMNS = [
    # Core identification
    'core__billing_code_variant',
    'core__fee_schedule_name',
    'core__fee_schedule_version',
    'core__section_code',
    'core__category_code',
    'core__effective_date',
    
    # Code structure
    'structure__code_formula',
    'structure__code_prefix_meaning',
    'structure__code_suffix_meaning',
    'structure__code_section_range',
    'structure__modifier_approach',
    'structure__code_family',
    'structure__in_office_variant',
    'structure__out_of_office_variant',
    'structure__virtual_variant',
    'structure__provincial_code_format',
    
    # Specialty
    'specialty__specialty_code',
    'specialty__specialty_name',
    'specialty__specialty_section',
    'specialty__specialty_subsection',
    'specialty__specialty_is_gp',
    'specialty__specialty_modifier_required',
    'specialty__specialty_rate_tier',
    'specialty__specialty_certification_required',
    
    # Care setting (boolean fields)
    'setting__office',
    'setting__out_of_office',
    'setting__home',
    'setting__hospital_inpatient',
    'setting__hospital_outpatient',
    'setting__emergency_department',
    'setting__urgent_care',
    'setting__nursing_home',
    'setting__long_term_care',
    'setting__telehealth_synchronous',
    'setting__telehealth_asynchronous',
    'setting__virtual_care',
    'setting__ambulatory_surgical_center',
    'setting__community_health_center',
    'setting__correctional_facility',
    'setting__mobile_clinic',
    
    # Jurisdiction
    'jurisdiction__in_province',
    'jurisdiction__out_of_province_canadian',
    'jurisdiction__out_of_country',
    'jurisdiction__reciprocal_billing',
    
    # Age modifiers
    'age__age_0_1',
    'age__age_under_2',
    'age__age_2_12',
    'age__age_under_13',
    'age__age_13_17',
    'age__age_18_49',
    'age__age_50_59',
    'age__age_60_64',
    'age__age_65_69',
    'age__age_70_79',
    'age__age_80_plus',
    'age__age_not_applicable',
    'age__age_band_type',
    'age__age_modifier_rate',
    'age__age_modifier_code',
    
    # Time premiums
    'time_premium__after_hours_evening',
    'time_premium__after_hours_night',
    'time_premium__after_hours_weekend',
    'time_premium__after_hours_holiday',
    'time_premium__emergency_premium',
    'time_premium__on_call_premium',
    
    # Callback
    'callback__callback_evening',
    'callback__callback_night',
    'callback__callback_weekend',
    'callback__callback_holiday',
    'callback__unscheduled_service',
    
    # Telehealth
    'telehealth__telehealth_video',
    'telehealth__telehealth_audio',
    'telehealth__telehealth_store_forward',
    'telehealth__virtual_care_standard',
    'telehealth__virtual_care_comprehensive',
    'telehealth__e_consult',
    
    # Complexity
    'complexity__complexity_low',
    'complexity__complexity_moderate',
    'complexity__complexity_high',
    'complexity__complexity_extended',
    'complexity__acuity_modifier',
    
    # Location modifiers
    'location__hospital_modifier',
    'location__facility_fee',
    'location__rural_modifier',
    'location__remote_modifier',
    'location__northern_modifier',
    
    # Procedure modifiers
    'procedure__bilateral_modifier',
    'procedure__multiple_procedure',
    'procedure__repeat_procedure',
    'procedure__surgical_assist',
    'procedure__second_surgeon',
    'procedure__anesthesia_modifier',
    
    # Bundling
    'bundling__bundling_approach',
    'bundling__bundled_with_codes',
    'bundling__unbundled_components',
    'bundling__technical_component',
    'bundling__professional_component',
    'bundling__global_service',
    'bundling__component_billing_allowed',
    'bundling__bundle_exception_rules',
    
    # Billing rules
    'rule__frequency_limit',
    'rule__frequency_period',
    'rule__referral_required',
    'rule__approval_required',
    'rule__certification_required',
    'rule__written_report_required',
    'rule__same_day_restriction',
    'rule__same_provider_restriction',
    'rule__location_restriction',
    'rule__time_documentation_required',
    'rule__diagnostic_restriction',
    
    # Duration
    'duration__time_unit',
    'duration__minimum_time',
    'duration__maximum_time',
    'duration__time_rounding_rule',
    'duration__concurrent_billing_allowed',
    'duration__cumulative_time_allowed',
    
    # Frequency limits
    'frequency__frequency_per_day',
    'frequency__frequency_per_week',
    'frequency__frequency_per_month',
    'frequency__frequency_per_year',
    'frequency__frequency_per_lifetime',
    'frequency__frequency_per_episode',
    'frequency__frequency_exception_criteria',
    
    # Service type classification
    'service__visit_assessment',
    'service__visit_comprehensive',
    'service__consultation',
    'service__hospital_visit',
    'service__emergency_visit',
    'service__home_visit',
    'service__diagnostic_test',
    'service__diagnostic_imaging',
    'service__laboratory',
    'service__surgical_procedure',
    'service__therapeutic_procedure',
    'service__counselling',
    'service__preventive_care',
    'service__chronic_disease_management',
    
    # Complexity level
    'complexity_level__brief',
    'complexity_level__limited',
    'complexity_level__intermediate',
    'complexity_level__comprehensive',
    'complexity_level__complex',
    'complexity_level__extended',
    'complexity_level__prolonged',
    
    # Referral status
    'referral__referred',
    'referral__non_referred',
    'referral__self_referred',
    'referral__referral_not_required',
    'referral__internal_referral',
    'referral__external_referral',
    
    # Rate calculation
    'rate__base_rate',
    'rate__rate_currency',
    'rate__rate_type',
    'rate__rate_unit',
    'rate__rate_base_for_percentage',
    'rate__rate_cap',
    'rate__rate_floor',
    'rate__max_with_specialty',
    'rate__max_with_time_premium',
    'rate__max_with_all_modifiers',
    'rate__pediatric_rate',
    'rate__geriatric_rate',
    'rate__rural_rate',
    'rate__rate_effective_date',
    'rate__rate_expiry_date',
    'rate__rate_previous',
    'rate__rate_change_percent',
    
    # Metadata
    'links_to_primary',
    'source_pages',
    'source_section',
    'extraction_notes'
]

# Column order of the exported table: identifiers first, then field columns.
ALL_COLUMNS = ID_COLUMNS + FIELD_COLUMNS
print(f"Total columns: {len(ALL_COLUMNS)}")
print(f"  - ID columns: {len(ID_COLUMNS)}")
print(f"  - Field columns: {len(FIELD_COLUMNS)}")

## 3. Upload Files

In [None]:
from google.colab import files
import pandas as pd

print("Upload the following files:")
print("  1. Crosswalk results (xlsx from Step 1)")
print("  2. Ontario PDF (Schedule of Benefits)")
print("  3. Ontario fees file (.001)")
print()
print("Select ALL files (Ctrl+click):")
uploaded = files.upload()

# Classify every upload by its file name. Extension and keyword checks are
# case-insensitive so that e.g. "Crosswalk.XLSX" or "schedule.PDF" are still
# recognised; the original file name is what gets stored.
CROSSWALK_FILE = ONTARIO_PDF = ONTARIO_FEES = None
for f in uploaded:
    lowered = f.lower()
    if 'crosswalk' in lowered and lowered.endswith('.xlsx'):
        CROSSWALK_FILE = f
    elif lowered.endswith('.pdf'):
        ONTARIO_PDF = f
    elif '001' in f:
        ONTARIO_FEES = f

print(f"\nCrosswalk: {CROSSWALK_FILE}")
print(f"PDF: {ONTARIO_PDF}")
print(f"Fees: {ONTARIO_FEES}")

# Fail with an actionable message instead of letting read_excel(None) raise
# an obscure error further down.
if CROSSWALK_FILE is None:
    raise FileNotFoundError(
        "No crosswalk workbook uploaded: expected an .xlsx file with "
        "'crosswalk' in its name (the Step 1 export)."
    )
if ONTARIO_PDF is None:
    print("WARNING: no PDF uploaded - the 'Load PDF' step cannot run without it.")

# Load crosswalk results
crosswalk_df = pd.read_excel(CROSSWALK_FILE)
print(f"\nCrosswalk codes to process: {len(crosswalk_df)}")

# Preview only the columns that exist, so a differently shaped workbook is
# reported rather than aborting the cell with a KeyError.
expected_preview = ['ON_Code', 'ON_Description', 'Type']
missing_preview = [c for c in expected_preview if c not in crosswalk_df.columns]
if missing_preview:
    print(f"WARNING: crosswalk is missing expected column(s): {missing_preview}")
print(crosswalk_df[[c for c in expected_preview if c in crosswalk_df.columns]].head(10))

## 4. API Key

In [None]:
# Leave this empty to be prompted for the key at run time; a key pasted here
# is stored in the notebook source, so avoid sharing the notebook afterwards.
OPENAI_API_KEY = ""  # <-- Paste your key

if not OPENAI_API_KEY:
    from getpass import getpass
    OPENAI_API_KEY = getpass("API Key: ")

# Pasted keys frequently carry stray spaces or a trailing newline.
OPENAI_API_KEY = OPENAI_API_KEY.strip()

# Stop here rather than at the first API request: request errors are caught
# and only printed during extraction, so a missing key would otherwise
# produce a full table of UNKNOWN values.
if not OPENAI_API_KEY:
    raise ValueError("No API key provided - paste a key above or enter one at the prompt.")

from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)
print("API ready")

## 5. Load PDF

In [None]:
import pdfplumber
from tqdm.notebook import tqdm

print("Loading PDF...")
pdf_pages = {}      # 1-based page number -> extracted text (pages without text are omitted)
failed_pages = []   # 1-based numbers of pages whose text extraction raised
with pdfplumber.open(ONTARIO_PDF) as pdf:
    for page_number, page in enumerate(tqdm(pdf.pages), start=1):
        try:
            text = page.extract_text()
        except Exception:
            # Best effort: one unreadable page must not abort the load, but
            # it is recorded so the gap is visible. Unlike a bare `except`,
            # this still lets KeyboardInterrupt stop the cell.
            failed_pages.append(page_number)
            continue
        if text:
            pdf_pages[page_number] = text

print(f"PDF pages loaded: {len(pdf_pages)}")
if failed_pages:
    print(f"WARNING: text extraction failed on {len(failed_pages)} page(s): {failed_pages}")

## 6. Extraction Function

In [None]:
import json
import re

# Running totals for the whole notebook session, updated by track_cost().
total_cost = 0.0
total_calls = 0


def track_cost(inp, out):
    """Add one API call's token usage to the running cost and call counters.

    ``inp`` and ``out`` are token counts (the caller passes the prompt and
    completion token counts); they are priced at 3.0 and 15.0 per million
    tokens respectively. Returns nothing; updates the module-level totals.
    """
    # NOTE(review): the per-million rates are hard-coded — confirm they match
    # the pricing of the model actually used for extraction.
    global total_cost, total_calls
    call_cost = (inp/1e6)*3.0 + (out/1e6)*15.0
    total_cost = total_cost + call_cost
    total_calls = total_calls + 1

def find_code_pages(code, pages_hint=None):
    """Find PDF pages containing this code.

    Args:
        code: Billing code to look for; matched as a plain substring of the
            page text. Blank, ``None`` or NaN codes match nothing.
        pages_hint: Optional page or page range such as ``"120"``,
            ``"120-124"`` or ``"120.0"``. When it parses, pages from five
            before to five after the range are searched first.

    Returns:
        Up to ten page numbers in ascending order. If the hint is missing,
        unparseable or yields no hit, every loaded page is searched instead.
    """
    # A blank spreadsheet cell arrives as NaN (a float); treat it like no code.
    if code is None or (isinstance(code, float) and code != code):
        return []
    code = str(code).strip()
    # An empty string is a substring of every page, so it must not be searched.
    if not code or not pdf_pages:
        return []

    found_pages = []

    if pages_hint:
        try:
            parts = str(pages_hint).split('-')
            # int(float(...)) also accepts "120.0", which is how a numeric
            # spreadsheet column is rendered when converted to text.
            first = int(float(parts[0]))
            last = int(float(parts[-1]))
        except (ValueError, OverflowError):
            first = last = None  # unusable hint ("nan", "", free text, ...)

        if first is not None:
            start = max(1, first - 5)
            end = min(max(pdf_pages), last + 5)
            found_pages = [
                p for p in range(start, end + 1)
                if code in pdf_pages.get(p, '')
            ]

    if not found_pages:
        found_pages = [p for p, text in pdf_pages.items() if code in text]

    return sorted(found_pages)[:10]

# Common value rules for all calls.
# make_focused_call() interpolates this text verbatim into every prompt; the
# seven values listed here are the ones described in the "Value System" table
# at the top of the notebook.
VALUE_RULES = """
VALUE ASSIGNMENT RULES (use exactly these values):

1. YES = Schedule EXPLICITLY states this is allowed/applicable
2. NO = Schedule EXPLICITLY prohibits or excludes this
3. DEFAULT_YES = Not stated, but LOGICALLY FOLLOWS from code type
   - Example: jurisdiction__in_province = DEFAULT_YES for any OHIP code
4. DEFAULT_NO = Not stated, but LOGICALLY INCOMPATIBLE with code type
   - Example: setting__office = DEFAULT_NO for a virtual-only code
5. N/A = Field category is IRRELEVANT to this service type
   - Example: procedure__bilateral_modifier = N/A for a phone consultation
6. CONDITIONAL:{reason} = Depends on specific circumstances
7. UNKNOWN = ONLY use when genuinely cannot determine from text AND no logical inference applies

GOAL: MINIMIZE UNKNOWN. Use logical inference (DEFAULT_YES, DEFAULT_NO, N/A) whenever possible.
Reserve UNKNOWN only for genuinely ambiguous cases that need human review.
"""

# Define the 5 focused extraction calls with decision logic.
# Each entry describes one API request per billing code:
#   name           - label printed while the call runs
#   fields         - the column names the model is asked to fill in
#   decision_logic - category-specific guidance appended to the prompt
# Field names quoted inside decision_logic must match the entries in
# "fields" exactly, otherwise the model is steered towards a key that the
# result table does not contain.
EXTRACTION_CALLS = [
    {
        "name": "Core + Structure + Specialty",
        "fields": [
            "core__billing_code_variant", "core__fee_schedule_name", "core__fee_schedule_version",
            "core__section_code", "core__category_code", "core__effective_date",
            "structure__code_formula", "structure__code_prefix_meaning", "structure__code_suffix_meaning",
            "structure__code_section_range", "structure__modifier_approach", "structure__code_family",
            "structure__in_office_variant", "structure__out_of_office_variant", "structure__virtual_variant",
            "structure__provincial_code_format",
            "specialty__specialty_code", "specialty__specialty_name", "specialty__specialty_section",
            "specialty__specialty_subsection", "specialty__specialty_is_gp", "specialty__specialty_modifier_required",
            "specialty__specialty_rate_tier", "specialty__specialty_certification_required"
        ],
        "decision_logic": """
DECISION LOGIC FOR THIS CATEGORY:

CORE: Extract actual values from schedule text.
- fee_schedule_name: Always "Ontario Schedule of Benefits"
- fee_schedule_version: Look for date/version in header

STRUCTURE:
- code_formula: Infer pattern (e.g., "A###" for A-prefix codes)
- code_prefix_meaning: A=Assessment, K=Special services, etc.
- virtual_variant: YES if code is for telehealth/video/phone, else DEFAULT_NO
- in_office_variant: Related office code if exists, else N/A

SPECIALTY:
- specialty_is_gp: YES if general practice, NO if specialist-specific
- specialty_certification_required: YES only if explicitly stated
"""
    },
    {
        "name": "Settings + Telehealth + Jurisdiction",
        "fields": [
            "setting__office", "setting__out_of_office", "setting__home",
            "setting__hospital_inpatient", "setting__hospital_outpatient", "setting__emergency_department",
            "setting__urgent_care", "setting__nursing_home", "setting__long_term_care",
            "setting__telehealth_synchronous", "setting__telehealth_asynchronous", "setting__virtual_care",
            "setting__ambulatory_surgical_center", "setting__community_health_center",
            "setting__correctional_facility", "setting__mobile_clinic",
            "telehealth__telehealth_video", "telehealth__telehealth_audio", "telehealth__telehealth_store_forward",
            "telehealth__virtual_care_standard", "telehealth__virtual_care_comprehensive", "telehealth__e_consult",
            "jurisdiction__in_province", "jurisdiction__out_of_province_canadian",
            "jurisdiction__out_of_country", "jurisdiction__reciprocal_billing"
        ],
        "decision_logic": """
DECISION LOGIC FOR THIS CATEGORY:

SETTINGS - Based on code type:
- If code is VIRTUAL/TELEHEALTH (K08x, telemedicine codes):
  → setting__office = NO (cannot bill virtual code for in-person)
  → setting__telehealth_synchronous = YES
  → setting__virtual_care = YES
  → setting__hospital_inpatient = CONDITIONAL:may apply if patient is inpatient
  
- If code is OFFICE visit (A00x):
  → setting__office = YES
  → setting__telehealth_synchronous = DEFAULT_NO
  
- If NOT explicitly restricted:
  → Use DEFAULT_YES for logical settings
  → Use DEFAULT_NO for incompatible settings

TELEHEALTH - Based on code description:
- If "video" in description → telehealth__telehealth_video = YES, telehealth__telehealth_audio = NO
- If "telephone" in description → telehealth__telehealth_audio = YES, telehealth__telehealth_video = NO
- If "video or telephone" → both YES
- e_consult = YES only for specific e-consult codes (K738-K741)

JURISDICTION:
- jurisdiction__in_province = DEFAULT_YES (all OHIP codes require Ontario residency)
- jurisdiction__out_of_province_canadian = DEFAULT_NO unless explicitly allowed
- jurisdiction__reciprocal_billing = check if code eligible for interprovincial
"""
    },
    {
        "name": "Rules + Restrictions + Bundling",
        "fields": [
            "rule__frequency_limit", "rule__frequency_period", "rule__referral_required",
            "rule__approval_required", "rule__certification_required", "rule__written_report_required",
            "rule__same_day_restriction", "rule__same_provider_restriction", "rule__location_restriction",
            "rule__time_documentation_required", "rule__diagnostic_restriction",
            "bundling__bundling_approach", "bundling__bundled_with_codes", "bundling__unbundled_components",
            "bundling__technical_component", "bundling__professional_component", "bundling__global_service",
            "bundling__component_billing_allowed", "bundling__bundle_exception_rules"
        ],
        "decision_logic": """
DECISION LOGIC FOR THIS CATEGORY:

RULES:
- rule__referral_required: YES only if explicitly required, else DEFAULT_NO for GP codes
- rule__same_day_restriction: Check "not payable same day as..." text, use CONDITIONAL:{list codes}
- rule__time_documentation_required: YES if schedule mentions documenting time
- rule__approval_required: YES only if prior authorization explicitly needed

BUNDLING - Based on code type:
- If code is a CONSULTATION/VISIT:
  → bundling__technical_component = N/A
  → bundling__professional_component = N/A
  → bundling__global_service = N/A
  
- If code is a DIAGNOSTIC/PROCEDURE:
  → Check for TC/PC split
  → bundling__bundled_with_codes = list any included services
"""
    },
    {
        "name": "Duration + Rates + Frequency",
        "fields": [
            "duration__time_unit", "duration__minimum_time", "duration__maximum_time",
            "duration__time_rounding_rule", "duration__concurrent_billing_allowed", "duration__cumulative_time_allowed",
            "frequency__frequency_per_day", "frequency__frequency_per_week", "frequency__frequency_per_month",
            "frequency__frequency_per_year", "frequency__frequency_per_lifetime", "frequency__frequency_per_episode",
            "frequency__frequency_exception_criteria",
            "rate__base_rate", "rate__rate_currency", "rate__rate_type", "rate__rate_unit",
            "rate__rate_base_for_percentage", "rate__rate_cap", "rate__rate_floor",
            "rate__max_with_specialty", "rate__max_with_time_premium", "rate__max_with_all_modifiers",
            "rate__pediatric_rate", "rate__geriatric_rate", "rate__rural_rate",
            "rate__rate_effective_date", "rate__rate_expiry_date", "rate__rate_previous", "rate__rate_change_percent"
        ],
        "decision_logic": """
DECISION LOGIC FOR THIS CATEGORY:

DURATION:
- If schedule specifies minimum time → use actual value
- If time-based code (per 15 min, per hour) → duration__time_unit = the unit
- If flat fee, no time requirement → duration fields = N/A

FREQUENCY:
- If no frequency limit stated → DEFAULT_YES for frequency__frequency_per_day = "no limit"
- If specific limit stated → use actual value
- frequency__frequency_per_lifetime typically = N/A for visits

RATES:
- rate__base_rate = actual fee from schedule
- rate__rate_currency = "CAD" always
- rate__rate_type = "flat" for fixed fees, "time-based" for per-unit, "percentage" if % of another code
- rate__rate_unit = "per service", "per 15 minutes", "per hour", etc.
- If no specialty/age/rural modifier mentioned → those rate fields = N/A
"""
    },
    {
        "name": "Service Type + Modifiers + Referral",
        "fields": [
            "service__visit_assessment", "service__visit_comprehensive", "service__consultation",
            "service__hospital_visit", "service__emergency_visit", "service__home_visit",
            "service__diagnostic_test", "service__diagnostic_imaging", "service__laboratory",
            "service__surgical_procedure", "service__therapeutic_procedure", "service__counselling",
            "service__preventive_care", "service__chronic_disease_management",
            "complexity_level__brief", "complexity_level__limited", "complexity_level__intermediate",
            "complexity_level__comprehensive", "complexity_level__complex", "complexity_level__extended",
            "complexity_level__prolonged",
            "referral__referred", "referral__non_referred", "referral__self_referred",
            "referral__referral_not_required", "referral__internal_referral", "referral__external_referral",
            "age__age_0_1", "age__age_under_2", "age__age_2_12", "age__age_under_13",
            "age__age_13_17", "age__age_18_49", "age__age_50_59", "age__age_60_64",
            "age__age_65_69", "age__age_70_79", "age__age_80_plus", "age__age_not_applicable",
            "age__age_band_type", "age__age_modifier_rate", "age__age_modifier_code",
            "time_premium__after_hours_evening", "time_premium__after_hours_night",
            "time_premium__after_hours_weekend", "time_premium__after_hours_holiday",
            "time_premium__emergency_premium", "time_premium__on_call_premium",
            "callback__callback_evening", "callback__callback_night", "callback__callback_weekend",
            "callback__callback_holiday", "callback__unscheduled_service",
            "complexity__complexity_low", "complexity__complexity_moderate",
            "complexity__complexity_high", "complexity__complexity_extended", "complexity__acuity_modifier",
            "location__hospital_modifier", "location__facility_fee", "location__rural_modifier",
            "location__remote_modifier", "location__northern_modifier",
            "procedure__bilateral_modifier", "procedure__multiple_procedure", "procedure__repeat_procedure",
            "procedure__surgical_assist", "procedure__second_surgeon", "procedure__anesthesia_modifier"
        ],
        "decision_logic": """
DECISION LOGIC FOR THIS CATEGORY:

SERVICE TYPE - Mark YES for matching type, NO for others:
- Virtual consultations → service__consultation = YES, service__visit_assessment = YES
- Procedures → service__surgical_procedure or service__therapeutic_procedure = YES
- Most visit codes are NOT: diagnostic_test, diagnostic_imaging, laboratory, surgical_procedure

COMPLEXITY LEVEL - Based on code description:
- "Limited" → complexity_level__limited = YES, others = NO
- "Comprehensive" → complexity_level__comprehensive = YES
- If no complexity in name → all complexity_level fields = N/A

REFERRAL:
- GP codes typically → referral__referral_not_required = DEFAULT_YES
- Specialist consultation → referral__referred = YES or check schedule

AGE MODIFIERS:
- If NO age-specific rates mentioned → age__age_not_applicable = YES, all individual age bands = N/A
- If specific age premiums listed → mark those YES, others NO

TIME/CALLBACK PREMIUMS:
- If code is eligible for after-hours premium → mark applicable ones YES
- If code explicitly excluded from premiums → mark NO
- If not mentioned and code type typically eligible → DEFAULT_YES

PROCEDURE MODIFIERS - For non-procedures:
- All procedure__ fields = N/A for consultations/visits
- Only mark YES/NO for actual procedural codes

LOCATION MODIFIERS:
- Check if code eligible for rural/remote/hospital premiums
- If not a code type that gets location modifiers → N/A
"""
    }
]

def make_focused_call(code, description, fee, context, call_config):
    """Make a single focused extraction call with decision logic.

    Builds one prompt from the code's identity, the schedule text, the shared
    ``VALUE_RULES`` and the call's own decision logic, sends it to the chat
    completions API, records the token usage via ``track_cost`` and parses
    the JSON object found in the reply.

    Args:
        code: Billing code being profiled.
        description: Code description from the crosswalk.
        fee: Fee shown to the model as ``FEE: $<fee>``.
        context: Schedule page text to extract from.
        call_config: One ``EXTRACTION_CALLS`` entry; its ``name``, ``fields``
            and ``decision_logic`` keys are read.

    Returns:
        The parsed JSON object as a dict, or ``{}`` when the request fails or
        the reply holds no parseable JSON object. Request and parsing errors
        are printed, not raised.
    """
    # Joined with ",\n" (no comma after the last entry) so the template shown
    # to the model is itself valid JSON; a copied trailing comma would make
    # json.loads() reject the reply.
    fields_list = ",\n".join(f'  "{f}": "value"' for f in call_config["fields"])

    prompt = f"""Extract billing information for Ontario code {code}.

CODE: {code}
DESCRIPTION: {description}
FEE: ${fee}

SCHEDULE CONTENT:
{context}

{VALUE_RULES}

{call_config["decision_logic"]}

Return JSON with these fields:
{{
{fields_list}
}}

REMEMBER: Minimize UNKNOWN. Use DEFAULT_YES, DEFAULT_NO, or N/A for logical inferences."""

    try:
        resp = client.chat.completions.create(
            model="gpt-5.1-2025-11-13",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_completion_tokens=1500
        )
        track_cost(resp.usage.prompt_tokens, resp.usage.completion_tokens)

        choice = resp.choices[0]
        # A reply cut off at the token limit usually lacks its closing brace;
        # say so explicitly instead of failing without explanation.
        if getattr(choice, "finish_reason", None) == "length":
            print(f"    Warning in {call_config['name']}: response truncated at token limit")

        # content can be None (e.g. a refusal or an empty completion).
        content = choice.message.content or ""
        # Greedy first-"{"-to-last-"}" match tolerates prose or code fences
        # around the JSON object.
        match = re.search(r'\{[\s\S]*\}', content)

        if match is None:
            print(f"    Error in {call_config['name']}: no JSON object in response")
            return {}

        parsed = json.loads(match.group())
        if isinstance(parsed, dict):
            return parsed
        print(f"    Error in {call_config['name']}: response JSON is not an object")
    except Exception as e:
        print(f"    Error in {call_config['name']}: {e}")

    return {}

def extract_code_profile(code, description, fee, modality, code_type, pages_hint, links_to):
    """Extract full profile using the focused calls with decision logic.

    Args:
        code: Billing code to profile.
        description: Code description; stored as ``billing_name``.
        fee: Fee; stored as ``base_rate`` and shown to the model.
        modality: Stored unchanged in the ``modality`` column.
        code_type: Accepted for interface compatibility; not used here.
        pages_hint: Page or page range passed on to ``find_code_pages``.
        links_to: Stored in ``links_to_primary``.

    Returns:
        A dict with the six identifier columns plus every name in
        ``FIELD_COLUMNS``. Fields the model did not fill keep ``'UNKNOWN'``.
        When no page mentions the code, no API call is made and
        ``extraction_notes`` records the reason.
    """
    code_pages = find_code_pages(code, pages_hint)

    row = {
        'billing_code': code,
        'province': 'ON',
        'billing_name': description,
        'base_rate': fee,
        'modality': modality,
        'source_document': 'Ontario Schedule of Benefits 2024'
    }

    # Every field starts as UNKNOWN so anything the model skips stays flagged.
    for col in FIELD_COLUMNS:
        row[col] = 'UNKNOWN'

    if not code_pages:
        print(f"    Warning: No pages found")
        row['extraction_notes'] = 'INSUFFICIENT_DATA: No pages found for code'
        row['links_to_primary'] = links_to
        return row

    context = "\n".join([f"=== PAGE {p} ===\n{pdf_pages[p]}" for p in code_pages])

    call_count = len(EXTRACTION_CALLS)
    for i, call_config in enumerate(EXTRACTION_CALLS, start=1):
        print(f"    Call {i}/{call_count}: {call_config['name']}")

        extracted = make_focused_call(code, description, fee, context, call_config)

        # Accept only the fields this call asked for, so a stray key in the
        # reply cannot overwrite identifier or metadata columns.
        requested = set(call_config['fields'])
        for field, value in extracted.items():
            if field not in requested or field not in row:
                continue
            if value is None:
                # JSON null carries no information; keep the UNKNOWN marker.
                continue
            if isinstance(value, (list, dict)):
                # Nested JSON cannot be written to a spreadsheet cell;
                # store it as JSON text instead.
                value = json.dumps(value, ensure_ascii=False)
            row[field] = value

    row['links_to_primary'] = links_to
    row['source_pages'] = str(code_pages)

    return row

# Confirm the extraction helpers are defined and show how many fields each
# focused call requests.
print("Extraction ready (5 focused calls with decision logic)")
print("Fields per call:", [len(call_config['fields']) for call_config in EXTRACTION_CALLS])

## 7. RUN EXTRACTION

In [None]:
def _cell(row, column, default=''):
    """Return row[column], or ``default`` when the column is absent or the cell is blank (NaN/None)."""
    value = row.get(column, default)
    return default if pd.isna(value) else value


calls_per_code = len(EXTRACTION_CALLS)

print("="*70)
print(f"EXTRACTING FULL PROFILES ({calls_per_code} calls per code)")
print("="*70)
print(f"Codes to process: {len(crosswalk_df)}")
print(f"Expected API calls: {len(crosswalk_df) * calls_per_code}")
print()

all_rows = []
skipped_rows = 0

# enumerate() supplies the progress position; the DataFrame index label is
# not guaranteed to be a 0-based counter.
for position, (_, row) in enumerate(
    tqdm(crosswalk_df.iterrows(), total=len(crosswalk_df), desc="Extracting"),
    start=1,
):
    # Blank spreadsheet cells arrive as NaN; _cell() maps them to the same
    # defaults used when a column is missing altogether.
    code = str(_cell(row, 'ON_Code')).strip()
    desc = str(_cell(row, 'ON_Description'))
    fee = row.get('ON_Fee', 0)
    modality = _cell(row, 'Modality', 'both')
    code_type = _cell(row, 'Type', 'PRIMARY')
    pages_hint = str(_cell(row, 'Pages'))
    links_to = _cell(row, 'Links_To') if code_type == 'ADD-ON' else ''

    if not code:
        # Without a code there is nothing to look up; do not spend API calls.
        skipped_rows += 1
        print(f"\n[{position}/{len(crosswalk_df)}] Skipped: blank ON_Code")
        continue

    print(f"\n[{position}/{len(crosswalk_df)}] {code} - {desc[:40]}...")

    profile = extract_code_profile(
        code=code,
        description=desc,
        fee=fee,
        modality=modality,
        code_type=code_type,
        pages_hint=pages_hint,
        links_to=links_to
    )

    all_rows.append(profile)

print("\n" + "="*70)
print("EXTRACTION COMPLETE")
print("="*70)
print(f"Total rows: {len(all_rows)}")
if skipped_rows:
    print(f"Skipped rows (blank code): {skipped_rows}")
print(f"Total API calls: {total_calls}")
print(f"Total cost: ${total_cost:.2f}")
print("="*70)

## 8. Save Results

In [None]:
# Assemble the wide table; ALL_COLUMNS fixes which columns appear and in
# what order.
output_file = 'universal_billing_extraction.xlsx'
result_df = pd.DataFrame(all_rows, columns=ALL_COLUMNS)

# Write the workbook without the index column, then report its shape.
result_df.to_excel(output_file, index=False)
print(f"Saved {result_df.shape[0]} rows x {result_df.shape[1]} columns to {output_file}")

# Offer the workbook as a browser download (Colab only).
from google.colab import files
files.download(output_file)

## 9. Preview

In [None]:
# A handful of representative columns for a quick visual check; the last
# expression of the cell is what the notebook renders.
preview_cols = [
    'billing_code',
    'billing_name',
    'base_rate',
    'modality',
    'setting__telehealth_synchronous',
    'telehealth__telehealth_video',
    'rule__same_day_restriction',
    'duration__minimum_time',
    'links_to_primary',
]

result_df.loc[:, preview_cols]

## 10. Summary Statistics

In [None]:
print("EXTRACTION SUMMARY")
print("="*50)
print(f"Total codes extracted: {len(result_df)}")
print(f"Total columns: {len(result_df.columns)}")
print()

# Count UNKNOWN values per category.
# Categories are derived from the `<category>__<field>` column names (in
# column order, each prefix once) so every category in the table is reported,
# not just a hand-picked subset. Columns without "__" (identifiers and
# metadata) belong to no category and are left out.
categories = list(dict.fromkeys(
    c.split('__', 1)[0] + '__' for c in result_df.columns if '__' in c
))
for cat in categories:
    cat_cols = [c for c in result_df.columns if c.startswith(cat)]
    unknown_count = (result_df[cat_cols] == 'UNKNOWN').sum().sum()
    total_cells = len(cat_cols) * len(result_df)
    pct = (unknown_count / total_cells * 100) if total_cells > 0 else 0
    print(f"{cat}: {unknown_count}/{total_cells} UNKNOWN ({pct:.1f}%)")