### LA WCFS ETL

In [2]:
import re
import pandas as pd
from PyPDF2 import PdfReader

# Load PDF
reader = PdfReader("LA_export.pdf")
# Extract each page's text exactly once (the original called extract_text()
# twice per page); pages that yield no text are dropped before joining.
page_texts = (page.extract_text() for page in reader.pages)
text = "\n".join(page_text for page_text in page_texts if page_text)

# Regex to find radiology-like CPT entries
# Looks for: 5-digit CPT codes starting with 7, followed by description and rate or "B.R."
pattern = r'(?P<code>7\d{4})\s+(?P<description>.+?)\s{2,}(?P<rate>\$?[\d,.]+|B\.R\.)'

# Extract matches: one (code, description, rate) tuple per hit
matches = re.findall(pattern, text)

# Clean and format
data = []
unparsable = []  # (code, raw rate) pairs whose rate token is not a number
for code, desc, rate in matches:
    is_br = rate.strip().upper() == 'B.R.'
    if is_br:
        rate_value = None
    else:
        try:
            rate_value = float(rate.replace('$', '').replace(',', ''))
        except ValueError:
            # The rate pattern also accepts punctuation-only or multi-dot
            # tokens (".", "1.2.3"); record and skip them instead of
            # aborting the whole extraction.
            unparsable.append((code, rate))
            continue
    data.append({
        'proc_cd': code.strip(),
        'modifier': None,
        'description': desc.strip(),
        'rate': rate_value,
        'rate_unit': 1,
        'is_by_report': is_br,
        'region_type': 'state',
        'region_value': 'LA'
    })

if unparsable:
    print(f"Skipped {len(unparsable)} match(es) with unparsable rate: {unparsable}")

# Convert to DataFrame; naming the columns keeps the schema even when no
# entries were matched.
la_radiology_df = pd.DataFrame(
    data,
    columns=[
        'proc_cd',
        'modifier',
        'description',
        'rate',
        'rate_unit',
        'is_by_report',
        'region_type',
        'region_value',
    ],
)

# Clean modifier from beginning of description
def extract_modifier(row):
    """Split a leading ``26``/``TC`` modifier off a row's description.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``'modifier'`` and ``'description'`` entries.

    Returns:
        A ``pd.Series`` with ``'modifier'`` and ``'description'``. When the
        description starts with ``26`` or ``TC`` followed by more text, the
        modifier is that token and the description is the remainder;
        otherwise both values are returned unchanged from ``row``.
    """
    parts = row['description'].strip().split(maxsplit=1)
    # An empty or whitespace-only description yields no tokens, so check the
    # token count before looking at parts[0].
    if len(parts) == 2 and parts[0] in ('26', 'TC'):
        return pd.Series({'modifier': parts[0], 'description': parts[1]})
    return pd.Series({'modifier': row['modifier'], 'description': row['description']})

# Apply the logic
# NOTE(review): on an empty frame, row-wise apply appears to return a copy of
# the (empty) frame rather than two columns, which makes the two-column
# assignment raise; skip the step when no rows were extracted.
if not la_radiology_df.empty:
    la_radiology_df[['modifier', 'description']] = la_radiology_df.apply(extract_modifier, axis=1)


# Save as CSV (index omitted so the file holds only the data columns)
output_path = "db_import_louisiana_radiology.csv"
la_radiology_df.to_csv(output_path, index=False)

# Preview
la_radiology_df.head()


Unnamed: 0,proc_cd,modifier,description,rate,rate_unit,is_by_report,region_type,region_value
0,70010,,Contrast X -ray of brain,438.0,1,False,state,LA
1,70010,26,Contrast X -ray of brain,127.0,1,False,state,LA
2,70010,TC,Contrast X -ray of brain,311.0,1,False,state,LA
3,70015,,Contrast X -ray of brain,224.0,1,False,state,LA
4,70015,26,Contrast X -ray of brain,127.0,1,False,state,LA
