### AL WCFS ETL

In [1]:
import re
import pandas as pd

# Load PDF text
from PyPDF2 import PdfReader
# Open the fee-schedule PDF, pull the text of each page, and stitch the pages
# together with newlines into one searchable string.
reader = PdfReader("2024PhysiciansFeeSchedule.pdf")
page_texts = [page.extract_text() for page in reader.pages]
full_text = "\n".join(page_texts)



-- Use a regex to extract the rate and procedure-code columns from the PDF text

In [None]:
# Find every "CPT CODE:" entry in the PDF text; each hit is a (fee, code)
# tuple of captured strings, in that order.
matches = re.findall(r"CPT CODE:\s*\$(\d[\d,\.]*)\s*([A-Z0-9\-]+)", full_text)

# Turn each hit into (code, numeric fee): commas are removed from the fee
# before it is converted to float, and the code is whitespace-trimmed.
data = [(code.strip(), float(fee.replace(",", ""))) for fee, code in matches]

# Column layout of the finished table
final_cols = ['proc_cd', 'modifier', 'description', 'rate', 'rate_unit', 'is_by_report', 'region_type', 'region_value']

# Build the frame and fill the standardized columns with constant values
df_alabama = pd.DataFrame(data, columns=['proc_cd', 'rate']).assign(
    modifier=None,
    description=None,
    rate_unit=1,
    is_by_report=False,
    region_type='state',
    region_value='AL',
)

# Put the columns into their final order
df_alabama = df_alabama[final_cols]


Unnamed: 0,proc_cd,modifier,description,rate,rate_unit,is_by_report,region_type,region_value
0,0001U-8,,,715.61,1,False,state,AL
1,0002U-8,,,116.94,1,False,state,AL
2,0003U-8,,,127.01,1,False,state,AL
3,0004U-8,,,47.86,1,False,state,AL
4,0005U-8,,,3106.67,1,False,state,AL


-- Radiology: split the suffix off each proc_cd and map it to a modifier

In [None]:
# Rebuild df_alabama from the original PDF (done unconditionally, so this cell does not rely on earlier cells)
import re
import pandas as pd
from PyPDF2 import PdfReader

# Load the fee-schedule PDF and concatenate the text of all pages,
# one page per newline-separated chunk
reader = PdfReader("2024PhysiciansFeeSchedule.pdf")
full_text = "\n".join([page.extract_text() for page in reader.pages])

# Each "CPT CODE:" line yields a (rate, code) pair of captured strings
matches = re.findall(r"CPT CODE:\s*\$(\d[\d,\.]*)\s*([A-Z0-9\-]+)", full_text)

# Reorder every pair to (code, rate), trimming the code and converting the
# comma-free rate text to a float
data = [(code.strip(), float(fee.replace(",", ""))) for fee, code in matches]

# Constant-valued standard columns, listed in the order they are appended
standard_columns = {
    'description': None,
    'rate_unit': 1,
    'is_by_report': False,
    'region_type': 'state',
    'region_value': 'AL',
}

# Assemble the frame: parsed columns first, then the constant columns
df_alabama = pd.DataFrame(data, columns=['proc_cd', 'rate']).assign(**standard_columns)

# Radiology-specific lookup from the trailing code suffix to a modifier.
# Suffixes that are not keys here end up with a missing modifier after .map().
radiology_suffix_map = {
    '5': None,       # 5 = Total Fee → no modifier needed
    'P': '26',       # P = Professional Component
    'TC': 'TC'       # TC = Technical Component
}

# Work from the untouched codes: capture the text after the final hyphen as
# the suffix, and separately compute the code with that "-SUFFIX" tail removed.
raw_codes = df_alabama['proc_cd']
suffixes = raw_codes.str.extract(r'-([A-Z0-9]+)$')[0]
bare_codes = raw_codes.str.replace(r'-[A-Z0-9]+$', '', regex=True)

# Swap in the suffix-free code and the mapped modifier; the suffix itself is
# never stored on the frame, so there is no helper column to drop.
df_alabama = df_alabama.assign(
    proc_cd=bare_codes,
    modifier=suffixes.map(radiology_suffix_map),
)

# Reorder to the final column layout and preview the first rows
final_cols = ['proc_cd', 'modifier', 'description', 'rate', 'rate_unit', 'is_by_report', 'region_type', 'region_value']
df_alabama = df_alabama[final_cols]
display(df_alabama.head())


Unnamed: 0,proc_cd,modifier,description,rate,rate_unit,is_by_report,region_type,region_value
0,0001U,,,715.61,1,False,state,AL
1,0002U,,,116.94,1,False,state,AL
2,0003U,,,127.01,1,False,state,AL
3,0004U,,,47.86,1,False,state,AL
4,0005U,,,3106.67,1,False,state,AL


'db_import_alabama_radiology.csv'

-- Filter to radiology-only proc_cd values (those present in the Georgia radiology file)

In [9]:
# Restrict the Alabama table to radiology procedures by keeping only the
# proc_cd values that also appear in the Georgia radiology import file, then
# export the matched rows to CSV.

# NOTE(review): absolute, user-specific path — this cell only runs on a machine
# with this exact directory layout. Consider a configurable data directory.
GA_RADIOLOGY_CSV_PATH = r"C:\Users\ChristopherCato\OneDrive - clarity-dx.com\compensation-fee-schedule-app\data\GA\db_import_radiology.csv"

# Read proc_cd explicitly as text. With dtype inference, an all-numeric column
# is parsed as int (leading zeros lost) or, if any cell is blank, as float
# ("70010" -> "70010.0" once cast to str), and neither form would match the
# string codes parsed from the Alabama PDF.
ga_radiology_df = pd.read_csv(GA_RADIOLOGY_CSV_PATH, dtype={'proc_cd': str})
ga_radiology_df['proc_cd'] = ga_radiology_df['proc_cd'].str.strip()

# Unique radiology codes from the Georgia data; blank codes are skipped so no
# missing-value placeholder lands in the lookup set
radiology_proc_cds = set(ga_radiology_df['proc_cd'].dropna().unique())

# Keep only Alabama rows whose proc_cd is a known radiology code
# (.copy() detaches the result from df_alabama)
df_alabama_radiology = df_alabama[df_alabama['proc_cd'].isin(radiology_proc_cds)].copy()

# Export the filtered version without the index column
filtered_output_path = "db_import_alabama_radiology_matched.csv"
df_alabama_radiology.to_csv(filtered_output_path, index=False)

display(df_alabama_radiology.head())

# Last expression of the cell: shows the output file name
filtered_output_path


Unnamed: 0,proc_cd,modifier,description,rate,rate_unit,is_by_report,region_type,region_value
5454,70010,,,672.59,1,False,state,AL
5455,70010,26.0,,264.5,1,False,state,AL
5456,70015,,,672.59,1,False,state,AL
5457,70015,26.0,,264.5,1,False,state,AL
5458,70030,,,184.67,1,False,state,AL


'db_import_alabama_radiology_matched.csv'