In [22]:
#!pip install faker
#!pip install allpairspy

In [23]:
from faker import Faker
import random
from datetime import datetime, timedelta
import json
from allpairspy import AllPairs

In [10]:

# Initialize Faker for generating realistic data
fake = Faker('en_US') # Using US locale for common healthcare data patterns

# --- Configuration ---
NUM_MEMBERS = 200
NUM_PROVIDERS = 50
NUM_CLAIMS = 1000
MAX_LINES_PER_CLAIM = 3 # Max number of service lines per claim

# Common ICD-10 Diagnosis Codes (Examples)
DIAGNOSIS_CODES = {
    "J06.9": "Acute upper respiratory infection, unspecified",
    "I10": "Essential (primary) hypertension",
    "E11.9": "Type 2 diabetes mellitus without complications",
    "K21.9": "Gastro-esophageal reflux disease without esophagitis",
    "R51": "Headache",
    "N39.0": "Urinary tract infection, site not specified",
    "M54.5": "Low back pain",
    "F41.9": "Anxiety disorder, unspecified",
    "Z00.00": "Encounter for general adult medical examination without abnormal findings"
}

# Common CPT Procedure Codes (Examples with approximate typical billed amounts)
PROCEDURE_CODES = {
    "99213": {"desc": "Office visit, established pt, 15-29 mins", "avg_bill": 120},
    "99203": {"desc": "Office visit, new pt, 30-44 mins", "avg_bill": 180},
    "80061": {"desc": "Lipid panel", "avg_bill": 40},
    "90672": {"desc": "Influenza vaccine", "avg_bill": 75},
    "71046": {"desc": "Chest X-ray, 2 views", "avg_bill": 150},
    "90837": {"desc": "Psychotherapy, 60 mins", "avg_bill": 250},
    "81002": {"desc": "Urinalysis", "avg_bill": 30},
    "99204": {"desc": "Office visit, new pt, 45-59 mins", "avg_bill": 250},
    "99214": {"desc": "Office visit, established pt, 30-39 mins", "avg_bill": 180},
    "36415": {"desc": "Venipuncture", "avg_bill": 20}
}

# Common CPT Modifiers
MODIFIERS = [None, "25", "59"] # None for no modifier

# Place of Service Codes
POS_CODES = {
    "11": "Office",
    "21": "Inpatient Hospital",
    "22": "Outpatient Hospital",
    "12": "Home"
}

# Adjudication Remark Codes (for denied claims)
REMARK_CODES = [
    "MA01", # Missing/incomplete/invalid information
    "CO45", # Charge exceeds fee schedule/maximum allowable
    "PR204", # This service/equipment/drug is not covered under the patient's current benefit plan.
    "CO97"  # The benefit for this service is included in the payment/allowance for another service.
]

# --- Data Structures ---
class Member:
    def __init__(self, member_id):
        self.member_id = member_id
        self.first_name = fake.first_name()
        self.last_name = fake.last_name()
        self.date_of_birth = fake.date_of_birth(minimum_age=1, maximum_age=90).strftime('%Y-%m-%d')
        self.gender = random.choice(['M', 'F'])
        self.address = fake.address().replace('\n', ', ')
        self.city = fake.city()
        self.state = fake.state_abbr()
        self.zip_code = fake.postcode()

class Provider:
    def __init__(self, provider_id):
        self.provider_id = provider_id
        self.name = fake.company() if random.random() < 0.3 else fake.name() + (" MD" if random.random() < 0.7 else " DO")
        self.specialty = random.choice(['Family Medicine', 'Pediatrics', 'Internal Medicine', 'Cardiology', 'Orthopedics', 'Psychiatry', 'Dermatology'])
        self.npi = fake.unique.numerify('##########') # 10 digit NPI
        self.address = fake.address().replace('\n', ', ')
        self.city = fake.city()
        self.state = fake.state_abbr()
        self.zip_code = fake.postcode()

class Plan:
    def __init__(self, plan_id, name, type, copay, deductible, coinsurance):
        self.plan_id = plan_id
        self.name = name
        self.type = type # HMO, PPO, EPO
        self.copay = copay # Fixed amount
        self.deductible = deductible # Amount before coinsurance
        self.coinsurance = coinsurance # Percentage

# --- Helper Functions ---
def generate_date_range(start_date, end_date):
    start_dt = datetime.strptime(start_date, '%Y-%m-%d')
    end_dt = datetime.strptime(end_date, '%Y-%m-%d')
    random_days = random.randint(0, (end_dt - start_dt).days)
    return (start_dt + timedelta(days=random_days)).strftime('%Y-%m-%d')

def calculate_adjudication(billed_amount, plan_details):
    # Simplified adjudication logic
    copay = plan_details.copay
    deductible_remaining = plan_details.deductible # Assume for simplicity, deductible resets per claim here
    coinsurance_rate = plan_details.coinsurance

    allowed_amount = billed_amount * random.uniform(0.7, 0.95) # 70-95% of billed amount allowed

    member_copay = min(copay, allowed_amount)
    amount_after_copay = allowed_amount - member_copay

    member_deductible = min(deductible_remaining, amount_after_copay)
    amount_after_deductible = amount_after_deductible = amount_after_copay - member_deductible

    member_coinsurance = amount_after_deductible * coinsurance_rate
    paid_amount = amount_after_deductible - member_coinsurance

    member_liability = member_copay + member_deductible + member_coinsurance

    return {
        "LINE_ALLOWED_AMOUNT": round(allowed_amount, 2),
        "LINE_PAID_AMOUNT": round(paid_amount, 2),
        "LINE_MEMBER_COPAY": round(member_copay, 2),
        "LINE_MEMBER_DEDUCTIBLE": round(member_deductible, 2),
        "LINE_MEMBER_COINSURANCE": round(member_coinsurance, 2),
        "LINE_MEMBER_LIABILITY": round(member_liability, 2) # Added for clarity
    }

# --- Generation Logic ---
def generate_synthetic_claims():
    members = [Member(f"MEM{str(i+1).zfill(4)}") for i in range(NUM_MEMBERS)]
    providers = [Provider(f"PROV{str(i+1).zfill(3)}") for i in range(NUM_PROVIDERS)]
    
    # Define some plans with varying benefit structures
    plans = [
        Plan("PLAN001", "BlueCare PPO", "PPO", copay=30, deductible=500, coinsurance=0.20),
        Plan("PLAN002", "HealthNet HMO", "HMO", copay=15, deductible=100, coinsurance=0.10),
        Plan("PLAN003", "SecureChoice EPO", "EPO", copay=25, deductible=1000, coinsurance=0.25)
    ]

    all_claims_data = []

    for i in range(NUM_CLAIMS):
        claim_id = f"CLM{str(i+1).zfill(6)}"
        member = random.choice(members)
        provider = random.choice(providers)
        plan = random.choice(plans)

        service_start_date = generate_date_range('2023-01-01', '2024-12-31')
        service_end_date = service_start_date # For simplicity, single-day services
        claim_submission_date = (datetime.strptime(service_start_date, '%Y-%m-%d') + timedelta(days=random.randint(5, 30))).strftime('%Y-%m-%d')

        diagnosis_code, diagnosis_desc = random.choice(list(DIAGNOSIS_CODES.items()))

        claim_line_items = []
        total_billed_amount = 0.0
        total_paid_amount = 0.0
        total_member_liability = 0.0
        claim_status = "Processed"
        claim_remark_code = None

        # Decide if the claim will be denied (e.g., 10% chance)
        is_denied = random.random() < 0.10
        if is_denied:
            claim_status = "Denied"
            claim_remark_code = random.choice(REMARK_CODES)

        num_lines = random.randint(1, MAX_LINES_PER_CLAIM)
        for line_num in range(1, num_lines + 1):
            procedure_code_key = random.choice(list(PROCEDURE_CODES.keys()))
            procedure_info = PROCEDURE_CODES[procedure_code_key]
            
            units = 1
            if procedure_code_key in ["80061", "81002"]: # Labs might have more units for panels/tests
                units = random.randint(1, 3)

            line_billed_amount = round(procedure_info["avg_bill"] * units * random.uniform(0.8, 1.2), 2) # Add some variability

            adjudication_results = {
                "LINE_ALLOWED_AMOUNT": 0.0,
                "LINE_PAID_AMOUNT": 0.0,
                "LINE_MEMBER_COPAY": 0.0,
                "LINE_MEMBER_DEDUCTIBLE": 0.0,
                "LINE_MEMBER_COINSURANCE": 0.0,
                "LINE_MEMBER_LIABILITY": 0.0
            }
            remark_code_line = None

            if not is_denied:
                adjudication_results = calculate_adjudication(line_billed_amount, plan)
                # If a line item is denied, its paid/allowed amounts are 0 and member liability is billed
                # This could be more complex with partial denials or adjustments
                if random.random() < 0.05: # Small chance a line item is denied even if claim processed
                    adjudication_results = {k: 0.0 for k in adjudication_results} # Set all to zero
                    adjudication_results["LINE_MEMBER_LIABILITY"] = line_billed_amount # Member pays full billed for denied line
                    remark_code_line = random.choice(REMARK_CODES)
            else:
                # If claim is denied, typically everything is denied at line level
                adjudication_results["LINE_MEMBER_LIABILITY"] = line_billed_amount
                remark_code_line = claim_remark_code # Use claim-level denial reason for lines

            claim_line_items.append({
                "CLAIM_ID": claim_id,
                "LINE_NUMBER": line_num,
                "SERVICE_START_DATE": service_start_date,
                "SERVICE_END_DATE": service_end_date,
                "PROCEDURE_CODE": procedure_code_key,
                "MODIFIER": random.choice(MODIFIERS),
                "DIAGNOSIS_CODE_PRIMARY": diagnosis_code,
                "UNITS": units,
                "LINE_BILLED_AMOUNT": line_billed_amount,
                "LINE_ALLOWED_AMOUNT": adjudication_results["LINE_ALLOWED_AMOUNT"],
                "LINE_PAID_AMOUNT": adjudication_results["LINE_PAID_AMOUNT"],
                "LINE_MEMBER_COPAY": adjudication_results["LINE_MEMBER_COPAY"],
                "LINE_MEMBER_DEDUCTIBLE": adjudication_results["LINE_MEMBER_DEDUCTIBLE"],
                "LINE_MEMBER_COINSURANCE": adjudication_results["LINE_MEMBER_COINSURANCE"],
                "PLACE_OF_SERVICE_CODE": random.choice(list(POS_CODES.keys())),
                "REMARK_CODE_1": remark_code_line # Line-level remark code
            })
            total_billed_amount += line_billed_amount
            total_paid_amount += adjudication_results["LINE_PAID_AMOUNT"]
            total_member_liability += adjudication_results["LINE_MEMBER_LIABILITY"]

        # Ensure totals reflect sums of lines
        total_billed_amount = round(total_billed_amount, 2)
        total_paid_amount = round(total_paid_amount, 2)
        total_member_liability = round(total_member_liability, 2)

        all_claims_data.append({
            "CLAIM_HEADER": {
                "CLAIM_ID": claim_id,
                "MEMBER_ID": member.member_id,
                "PROVIDER_ID": provider.provider_id,
                "PLAN_ID": plan.plan_id,
                "SERVICE_START_DATE": service_start_date,
                "SERVICE_END_DATE": service_end_date,
                "CLAIM_SUBMISSION_DATE": claim_submission_date,
                "DIAGNOSIS_CODE_PRIMARY": diagnosis_code,
                "CLAIM_STATUS": claim_status,
                "CLAIM_DENIAL_REASON_CODE": claim_remark_code, # Claim-level remark code
                "TOTAL_BILLED_AMOUNT": total_billed_amount,
                "TOTAL_PAID_AMOUNT": total_paid_amount,
                "TOTAL_MEMBER_LIABILITY": total_member_liability
            },
            "CLAIM_LINE_ITEMS": claim_line_items
        })
    
    return all_claims_data, members, providers, plans

# --- Run Generation and Save ---
if __name__ == "__main__":
    print("Generating synthetic healthcare claims data...")
    claims, members_data, providers_data, plans_data = generate_synthetic_claims()
    print(f"Generated {len(claims)} claims.")

    # Save claims data to a JSON file
    output_filename = "synthetic_facets_claims_1000.json"
    with open(output_filename, 'w') as f:
        json.dump(claims, f, indent=2)
    print(f"Claims data saved to {output_filename}")

    # Optionally, save member, provider, and plan data separately for lookup
    members_list = [{
        "MEMBER_ID": m.member_id,
        "FIRST_NAME": m.first_name,
        "LAST_NAME": m.last_name,
        "DATE_OF_BIRTH": m.date_of_birth,
        "GENDER": m.gender,
        "ADDRESS": m.address,
        "CITY": m.city,
        "STATE": m.state,
        "ZIP_CODE": m.zip_code
    } for m in members_data]
    with open("synthetic_members.json", 'w') as f:
        json.dump(members_list, f, indent=2)
    print("Member data saved to synthetic_members.json")

    providers_list = [{
        "PROVIDER_ID": p.provider_id,
        "NAME": p.name,
        "SPECIALTY": p.specialty,
        "NPI": p.npi,
        "ADDRESS": p.address,
        "CITY": p.city,
        "STATE": p.state,
        "ZIP_CODE": p.zip_code
    } for p in providers_data]
    with open("synthetic_providers.json", 'w') as f:
        json.dump(providers_list, f, indent=2)
    print("Provider data saved to synthetic_providers.json")

    plans_list = [{
        "PLAN_ID": p.plan_id,
        "NAME": p.name,
        "TYPE": p.type,
        "COPAY": p.copay,
        "DEDUCTIBLE": p.deductible,
        "COINSURANCE": p.coinsurance
    } for p in plans_data]
    with open("synthetic_plans.json", 'w') as f:
        json.dump(plans_list, f, indent=2)
    print("Plan data saved to synthetic_plans.json")

    print("\nGeneration complete!")

Generating synthetic healthcare claims data...
Generated 1000 claims.
Claims data saved to synthetic_facets_claims_1000.json
Member data saved to synthetic_members.json
Provider data saved to synthetic_providers.json
Plan data saved to synthetic_plans.json

Generation complete!


In [11]:
import pandas as pd
import json

# Define the path to your claims JSON file
claims_file_path = 'synthetic_facets_claims_1000.json'

# Load the JSON data
with open(claims_file_path, 'r') as f:
    claims_data = json.load(f)

# Normalize the 'CLAIM_HEADER' part
# We treat each element in the claims_data list as a record.
# The 'CLAIM_HEADER' is a nested dictionary that we want to flatten.
# The 'CLAIM_LINE_ITEMS' is a list within each record that needs separate normalization.
df_claims_header = pd.json_normalize(
    claims_data,
    record_path=None, # No specific record_path here, as the main objects are the claims themselves
    meta=['CLAIM_HEADER'] # We want to pull fields from CLAIM_HEADER
)

# Rename columns to flatten the 'CLAIM_HEADER.' prefix
df_claims_header.columns = [col.replace('CLAIM_HEADER.', '') for col in df_claims_header.columns]

# Now, normalize the 'CLAIM_LINE_ITEMS'
# This requires iterating through each claim and its line items
all_line_items = []
for claim in claims_data:
    claim_id = claim['CLAIM_HEADER']['CLAIM_ID']
    for line_item in claim['CLAIM_LINE_ITEMS']:
        # Add the CLAIM_ID from the header to each line item for linking
        line_item['CLAIM_ID'] = claim_id
        all_line_items.append(line_item)

df_claims_lines = pd.DataFrame(all_line_items)

print("Claims Header DataFrame (first 5 rows):")
print(df_claims_header.head())
print("\nClaims Line Items DataFrame (first 5 rows):")
print(df_claims_lines.head())

print(f"\nClaims Header DataFrame shape: {df_claims_header.shape}")
print(f"Claims Line Items DataFrame shape: {df_claims_lines.shape}")

Claims Header DataFrame (first 5 rows):
                                    CLAIM_LINE_ITEMS   CLAIM_ID MEMBER_ID  \
0  [{'CLAIM_ID': 'CLM000001', 'LINE_NUMBER': 1, '...  CLM000001   MEM0163   
1  [{'CLAIM_ID': 'CLM000002', 'LINE_NUMBER': 1, '...  CLM000002   MEM0198   
2  [{'CLAIM_ID': 'CLM000003', 'LINE_NUMBER': 1, '...  CLM000003   MEM0029   
3  [{'CLAIM_ID': 'CLM000004', 'LINE_NUMBER': 1, '...  CLM000004   MEM0119   
4  [{'CLAIM_ID': 'CLM000005', 'LINE_NUMBER': 1, '...  CLM000005   MEM0190   

  PROVIDER_ID  PLAN_ID SERVICE_START_DATE SERVICE_END_DATE  \
0     PROV009  PLAN003         2024-07-31       2024-07-31   
1     PROV003  PLAN003         2024-12-16       2024-12-16   
2     PROV001  PLAN002         2024-10-08       2024-10-08   
3     PROV018  PLAN001         2024-12-30       2024-12-30   
4     PROV029  PLAN001         2023-11-06       2023-11-06   

  CLAIM_SUBMISSION_DATE DIAGNOSIS_CODE_PRIMARY CLAIM_STATUS  \
0            2024-08-28                    R51       Denied  

In [12]:
# Define the path to your members JSON file
members_file_path = 'synthetic_members.json'

# Read the JSON file directly into a DataFrame
df_members = pd.read_json(members_file_path)

print("\nMembers DataFrame (first 5 rows):")
print(df_members.head())
print(f"\nMembers DataFrame shape: {df_members.shape}")


Members DataFrame (first 5 rows):
  MEMBER_ID FIRST_NAME LAST_NAME DATE_OF_BIRTH GENDER  \
0   MEM0001    Matthew   Spencer    1991-08-14      F   
1   MEM0002      Robin     Gates    1980-02-29      M   
2   MEM0003    William     Lopez    2006-03-22      F   
3   MEM0004    Douglas      Long    1946-01-14      M   
4   MEM0005     Joseph    Taylor    2011-04-13      F   

                                             ADDRESS              CITY STATE  \
0                   Unit 5688 Box 2344, DPO AA 75012  East Taylorhaven    MO   
1  93706 Tyler Plains Suite 731, West Christopher...         Clarkstad    VT   
2  6417 Zamora Crossing Suite 302, Dylanborough, ...    North Kimberly    MT   
3  925 Sabrina Extensions Apt. 630, Penningtonbor...       Jefferytown    FM   
4  04396 Angel Walks Apt. 441, East Matthew, IN 9...         Eatonland    MA   

   ZIP_CODE  
0     75432  
1      6803  
2     87637  
3     63897  
4     74767  

Members DataFrame shape: (200, 9)


In [13]:
# Define the path to your providers JSON file
providers_file_path = 'synthetic_providers.json'

# Read the JSON file directly into a DataFrame
df_providers = pd.read_json(providers_file_path)

print("\nProviders DataFrame (first 5 rows):")
print(df_providers.head())
print(f"\nProviders DataFrame shape: {df_providers.shape}")


Providers DataFrame (first 5 rows):
  PROVIDER_ID                NAME        SPECIALTY         NPI  \
0     PROV001     Terry Garcia MD       Cardiology  4621735579   
1     PROV002  Jennifer Turner MD      Orthopedics  6790835900   
2     PROV003  Zachary Spencer DO       Cardiology  1781446297   
3     PROV004      Jimenez-Peters  Family Medicine  3119503147   
4     PROV005      Karen Smith MD      Orthopedics  8906101802   

                                             ADDRESS              CITY STATE  \
0  9804 Freeman Walks Apt. 570, South Sherrychest...    Port Elizabeth    ID   
1  5018 Janet Mount Suite 858, East Jenniferside,...        Smithmouth    HI   
2  869 Christine Mills Apt. 373, Lake Stephaniemo...      Hollowaybury    AK   
3  700 Gentry Tunnel Suite 702, Torresbury, NE 24141       North Jamie    NV   
4         71009 Sarah Junction, Thomasfort, NH 29990  New Cindychester    MS   

   ZIP_CODE  
0     92339  
1      7655  
2     56901  
3     91760  
4     18753  



In [14]:
# Define the path to your plans JSON file
plans_file_path = 'synthetic_plans.json'

# Read the JSON file directly into a DataFrame
df_plans = pd.read_json(plans_file_path)

print("\nPlans DataFrame (first 5 rows):")
print(df_plans.head())
print(f"\nPlans DataFrame shape: {df_plans.shape}")


Plans DataFrame (first 5 rows):
   PLAN_ID              NAME TYPE  COPAY  DEDUCTIBLE  COINSURANCE
0  PLAN001      BlueCare PPO  PPO     30         500         0.20
1  PLAN002     HealthNet HMO  HMO     15         100         0.10
2  PLAN003  SecureChoice EPO  EPO     25        1000         0.25

Plans DataFrame shape: (3, 6)


In [15]:
# Merge claims header with members
df_claims_enriched = pd.merge(
    df_claims_header,
    df_members,
    left_on='MEMBER_ID',
    right_on='MEMBER_ID',
    how='left', # Keep all claims, add member info where available
    suffixes=('_claim', '_member') # To distinguish columns with same names
)

# Merge with providers
df_claims_enriched = pd.merge(
    df_claims_enriched,
    df_providers,
    left_on='PROVIDER_ID',
    right_on='PROVIDER_ID',
    how='left',
    suffixes=('_claims', '_provider') # Another suffix for new merge
)

# Merge with plans
df_claims_enriched = pd.merge(
    df_claims_enriched,
    df_plans,
    left_on='PLAN_ID',
    right_on='PLAN_ID',
    how='left',
    suffixes=('_claims', '_plan')
)

print("\nEnriched Claims Header DataFrame (first 5 rows with merged data):")
print(df_claims_enriched.head())
print(f"\nEnriched Claims Header DataFrame shape: {df_claims_enriched.shape}")

# You would do similar merges if you want to link claim lines to members/providers/plans
# (though usually the header holds the main linkages)


Enriched Claims Header DataFrame (first 5 rows with merged data):
                                    CLAIM_LINE_ITEMS   CLAIM_ID MEMBER_ID  \
0  [{'CLAIM_ID': 'CLM000001', 'LINE_NUMBER': 1, '...  CLM000001   MEM0163   
1  [{'CLAIM_ID': 'CLM000002', 'LINE_NUMBER': 1, '...  CLM000002   MEM0198   
2  [{'CLAIM_ID': 'CLM000003', 'LINE_NUMBER': 1, '...  CLM000003   MEM0029   
3  [{'CLAIM_ID': 'CLM000004', 'LINE_NUMBER': 1, '...  CLM000004   MEM0119   
4  [{'CLAIM_ID': 'CLM000005', 'LINE_NUMBER': 1, '...  CLM000005   MEM0190   

  PROVIDER_ID  PLAN_ID SERVICE_START_DATE SERVICE_END_DATE  \
0     PROV009  PLAN003         2024-07-31       2024-07-31   
1     PROV003  PLAN003         2024-12-16       2024-12-16   
2     PROV001  PLAN002         2024-10-08       2024-10-08   
3     PROV018  PLAN001         2024-12-30       2024-12-30   
4     PROV029  PLAN001         2023-11-06       2023-11-06   

  CLAIM_SUBMISSION_DATE DIAGNOSIS_CODE_PRIMARY CLAIM_STATUS  ...         NPI  \
0            2024

In [16]:
df_claims_lines

Unnamed: 0,CLAIM_ID,LINE_NUMBER,SERVICE_START_DATE,SERVICE_END_DATE,PROCEDURE_CODE,MODIFIER,DIAGNOSIS_CODE_PRIMARY,UNITS,LINE_BILLED_AMOUNT,LINE_ALLOWED_AMOUNT,LINE_PAID_AMOUNT,LINE_MEMBER_COPAY,LINE_MEMBER_DEDUCTIBLE,LINE_MEMBER_COINSURANCE,PLACE_OF_SERVICE_CODE,REMARK_CODE_1
0,CLM000001,1,2024-07-31,2024-07-31,99214,25,R51,1,198.27,0.00,0.0,0.00,0.00,0.0,12,CO45
1,CLM000002,1,2024-12-16,2024-12-16,99204,,N39.0,1,237.59,0.00,0.0,0.00,0.00,0.0,22,CO97
2,CLM000002,2,2024-12-16,2024-12-16,99213,59,N39.0,1,113.58,95.39,0.0,25.00,70.39,0.0,12,
3,CLM000002,3,2024-12-16,2024-12-16,71046,59,N39.0,1,147.08,138.80,0.0,25.00,113.80,0.0,21,
4,CLM000003,1,2024-10-08,2024-10-08,90837,,E11.9,1,276.03,0.00,0.0,0.00,0.00,0.0,12,PR204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1974,CLM000998,3,2024-08-09,2024-08-09,90837,25,Z00.00,1,223.33,186.32,0.0,25.00,161.32,0.0,11,
1975,CLM000999,1,2023-02-10,2023-02-10,36415,59,I10,1,16.33,12.22,0.0,12.22,0.00,0.0,11,
1976,CLM001000,1,2023-09-18,2023-09-18,99214,59,R51,1,196.72,170.07,0.0,30.00,140.07,0.0,12,
1977,CLM001000,2,2023-09-18,2023-09-18,90672,25,R51,1,62.80,57.23,0.0,30.00,27.23,0.0,21,


In [9]:
df_claims_header

Unnamed: 0,CLAIM_LINE_ITEMS,CLAIM_ID,MEMBER_ID,PROVIDER_ID,PLAN_ID,SERVICE_START_DATE,SERVICE_END_DATE,CLAIM_SUBMISSION_DATE,DIAGNOSIS_CODE_PRIMARY,CLAIM_STATUS,CLAIM_DENIAL_REASON_CODE,TOTAL_BILLED_AMOUNT,TOTAL_PAID_AMOUNT,TOTAL_MEMBER_LIABILITY
0,"[{'CLAIM_ID': 'CLM000001', 'LINE_NUMBER': 1, '...",CLM000001,MEM0102,PROV023,PLAN002,2024-01-31,2024-01-31,2024-02-13,J06.9,Processed,,229.70,10.41,173.99
1,"[{'CLAIM_ID': 'CLM000002', 'LINE_NUMBER': 1, '...",CLM000002,MEM0030,PROV004,PLAN001,2024-03-14,2024-03-14,2024-04-11,E11.9,Processed,,22.83,0.00,16.93
2,"[{'CLAIM_ID': 'CLM000003', 'LINE_NUMBER': 1, '...",CLM000003,MEM0041,PROV009,PLAN001,2023-04-26,2023-04-26,2023-05-12,Z00.00,Processed,,330.05,0.00,306.68
3,"[{'CLAIM_ID': 'CLM000004', 'LINE_NUMBER': 1, '...",CLM000004,MEM0149,PROV021,PLAN001,2023-04-19,2023-04-19,2023-05-07,M54.5,Processed,,202.91,0.00,171.75
4,"[{'CLAIM_ID': 'CLM000005', 'LINE_NUMBER': 1, '...",CLM000005,MEM0200,PROV020,PLAN003,2023-12-11,2023-12-11,2024-01-02,F41.9,Processed,,248.57,0.00,207.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"[{'CLAIM_ID': 'CLM000996', 'LINE_NUMBER': 1, '...",CLM000996,MEM0184,PROV011,PLAN002,2023-11-07,2023-11-07,2023-12-07,R51,Processed,,119.07,0.00,107.57
996,"[{'CLAIM_ID': 'CLM000997', 'LINE_NUMBER': 1, '...",CLM000997,MEM0084,PROV006,PLAN003,2024-09-08,2024-09-08,2024-10-03,N39.0,Processed,,149.81,0.00,122.04
997,"[{'CLAIM_ID': 'CLM000998', 'LINE_NUMBER': 1, '...",CLM000998,MEM0185,PROV025,PLAN003,2023-06-17,2023-06-17,2023-07-09,M54.5,Processed,,348.25,0.00,285.49
998,"[{'CLAIM_ID': 'CLM000999', 'LINE_NUMBER': 1, '...",CLM000999,MEM0183,PROV003,PLAN001,2023-10-10,2023-10-10,2023-10-16,M54.5,Processed,,155.23,0.00,119.29


## **Reading and creating all pairs**

In [18]:
#!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [19]:
df_proc_diag = pd.read_excel("Diag_Proc_Code_AllPairs.xlsx")
df_proc_diag

Unnamed: 0,DiagCode,ProcCode
0,J06.9,99213
1,I10,99203
2,E11.9,80061
3,K21.9,90672
4,R51,71046
5,N39.0,90837
6,M54.5,81002
7,F41.9,99204
8,Z00.00,99214
9,,36415


In [32]:
p = [df_proc_diag[col].dropna().tolist() for col in df_proc_diag.columns]
coombinations = AllPairs(p)


In [33]:
df_combinations = pd.DataFrame(list(combinations), columns = df_proc_diag.columns)
df_combinations

Unnamed: 0,DiagCode,ProcCode
0,J06.9,99213
1,I10,99213
2,E11.9,99213
3,K21.9,99213
4,R51,99213
...,...,...
85,R51,36415
86,K21.9,36415
87,E11.9,36415
88,I10,36415


In [43]:
df_source_rates = pd.read_excel("Proc_code_Rate_Allpairs.xlsx")
df_source_dict = df_source_rates.set_index('ProcCode')['Allowed_Amount'].to_dict()
df_source_dict

{99213: 120,
 99203: 180,
 80061: 40,
 90672: 75,
 71046: 150,
 90837: 250,
 81002: 30,
 99204: 250,
 99214: 180,
 36415: 20}

In [44]:
df_combinations['Allowed_Amount'] = df_combinations['ProcCode'].apply(
    lambda x: df_source_dict.get(x, None) # Or 0, or pd.NA
)

In [45]:
df_combinations

Unnamed: 0,DiagCode,ProcCode,Allowed_Amount
0,J06.9,99213,120
1,I10,99213,120
2,E11.9,99213,120
3,K21.9,99213,120
4,R51,99213,120
...,...,...,...
85,R51,36415,20
86,K21.9,36415,20
87,E11.9,36415,20
88,I10,36415,20
