<H1><I><B>CAPSTONE PROJECT</B></I></H1>


In [None]:
pip install Faker

Collecting Faker
  Downloading faker-37.4.2-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.2-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import csv
import json
import random
import uuid
from datetime import datetime, timedelta
from pathlib import Path

from faker import Faker

fake = Faker()

# Configuration
SEED = 42                # fixed seed so Restart & Run All regenerates the same records
NUM_PATIENTS = 10000     # rows written to patients.csv
NUM_PROVIDERS = 1000     # records written to providers.json
NUM_CLAIMS = 200000      # base claim rows, before duplicates are appended
DUPLICATE_RATE = 0.02    # fraction of NUM_CLAIMS appended again as duplicate rows
NULL_RATE = 0.03         # fraction of NUM_CLAIMS used as the number of blanked cells

# Seed both random sources: `random` drives the choices and amounts, Faker drives
# names, addresses and numbers. Claim IDs come from uuid.uuid4() and claim dates
# are anchored on datetime.now(), so those two still vary between runs.
random.seed(SEED)
Faker.seed(SEED)

# Output Files
CLAIMS_CSV = "claims.csv"
PATIENTS_CSV = "patients.csv"
PROVIDERS_JSON = "providers.json"

# Generate IDs
patient_ids = [f"pat_{i}" for i in range(1, NUM_PATIENTS + 1)]
provider_ids = [f"prov_{i}" for i in range(1, NUM_PROVIDERS + 1)]

# Common reference values
diagnosis_codes = ['E11.9', 'I10', 'J45.909', 'K21.9', 'M54.5', 'F41.1']
procedure_codes = ['99213', '93000', '80050', '70450', '20550']
submission_modes = ['Online', 'Paper', 'EDI']
claim_statuses = ['Approved', 'Denied', 'Pending']
specialties = ['Cardiology', 'Neurology', 'Dermatology', 'General', 'Psychiatry']
insurance_providers = ['Aetna', 'Cigna', 'BlueCross', 'UnitedHealth']
locations = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Miami']

# ---------- PROVIDERS ----------
def generate_provider(provider_id):
    """Build one synthetic provider record for providers.json.

    Args:
        provider_id: Identifier stored under "provider_id".

    Returns:
        dict: A JSON-serialisable provider record. Every value except
        ``provider_id`` is randomly generated, and the address, city, state,
        zip code and license state are drawn independently of one another.
    """
    return {
        "provider_id": provider_id,
        # fix_len=True forces exactly 10 digits; without it random_number() may
        # return any integer of up to 10 digits, giving short NPI strings.
        "npi_number": str(fake.unique.random_number(digits=10, fix_len=True)),
        "name": fake.name(),
        "specialty": random.choice(specialties),
        "organization": fake.company(),
        "phone_number": fake.phone_number(),
        # Roughly 2% of providers get a blank email to simulate missing data.
        "email": fake.email() if random.random() > 0.02 else "",
        "address": fake.address(),
        "city": fake.city(),
        "state": fake.state_abbr(),
        "zip_code": fake.zipcode(),
        "license_number": fake.bothify(text='LIC#######'),
        "license_state": fake.state_abbr(),
        "years_of_experience": random.randint(1, 35),
        "accepting_new_patients": random.choice([True, False])
    }

print("🔄 Generating providers.json...")
# One record per pre-generated provider ID, in ID order.
providers = list(map(generate_provider, provider_ids))

# Persist the whole list as a single pretty-printed JSON array.
with open(PROVIDERS_JSON, "w") as json_file:
    json_file.write(json.dumps(providers, indent=2))

# ---------- PATIENTS ----------
def generate_patient(patient_id):
    """Build one synthetic patient row for patients.csv.

    Args:
        patient_id: Identifier stored under "patient_id".

    Returns:
        dict: The patient record, with keys in CSV column order. About 5% of
        rows get a blank phone number and about 3% a blank city.
    """
    patient = {"patient_id": patient_id}
    patient["first_name"] = fake.first_name()
    patient["last_name"] = fake.last_name()
    patient["date_of_birth"] = fake.date_of_birth(minimum_age=18, maximum_age=90)
    patient["gender"] = random.choice(['Male', 'Female', 'Other'])

    # Draw the keep/blank decision first; only generate a phone when keeping it.
    if random.random() > 0.05:
        patient["phone_number"] = fake.phone_number()
    else:
        patient["phone_number"] = ""

    patient["email"] = fake.email()
    patient["address"] = fake.street_address()

    # Same pattern for the city.
    if random.random() > 0.03:
        patient["city"] = fake.city()
    else:
        patient["city"] = ""

    patient["insurance_provider"] = random.choice(insurance_providers)
    patient["insurance_id"] = fake.bothify(text='INS-#####')

    # One to three random words joined with ", " stand in for condition names.
    condition_count = random.randint(1, 3)
    patient["known_conditions"] = ", ".join(fake.words(nb=condition_count))

    patient["last_visit_date"] = fake.date_this_year()
    return patient

print("🔄 Generating patients.csv...")
patients = list(map(generate_patient, patient_ids))

# Column order follows the key order of the first generated record.
with open(PATIENTS_CSV, "w", newline="") as csv_file:
    patient_columns = patients[0].keys()
    patient_writer = csv.DictWriter(csv_file, fieldnames=patient_columns)
    patient_writer.writeheader()
    patient_writer.writerows(patients)

# ---------- CLAIMS ----------
def random_date_within_last_year():
    """Return a random date between 365 days ago and today, both inclusive."""
    # Pick how far forward from the one-year-ago mark to land, then express
    # that as a distance back from the current moment.
    days_forward = random.randint(0, 365)
    days_back = 365 - days_forward
    moment = datetime.now() - timedelta(days=days_back)
    return moment.date()

def generate_claim():
    """Build one synthetic claim row for claims.csv.

    The patient and provider are drawn at random from the module-level
    ``patient_ids`` and ``provider_ids`` lists.

    Returns:
        dict: The claim record, with keys in CSV column order. Dates are ISO
        strings, amounts are floats rounded to 2 decimals, and ``billing_npi``
        is always a 10-digit string.
    """
    patient_id = random.choice(patient_ids)
    provider_id = random.choice(provider_ids)

    # Service starts within the last year and lasts 1-7 days; the claim is
    # filed 0-3 days after the service ends.
    # NOTE(review): a start date near today can push service_end/claim_date up
    # to 10 days into the future — confirm whether that is intended noise.
    service_start = random_date_within_last_year()
    service_end = service_start + timedelta(days=random.randint(1, 7))
    claim_date = service_end + timedelta(days=random.randint(0, 3))

    claim_amount = round(random.uniform(50, 3000), 2)
    if random.random() < 0.01:
        claim_amount = round(random.uniform(10000, 50000), 2)  # outlier

    # Paid amount is 50-100% of the claimed amount, independent of claim_status.
    paid_amount = round(claim_amount * random.uniform(0.5, 1.0), 2)

    return {
        "claim_id": str(uuid.uuid4()),
        "patient_id": patient_id,
        "provider_id": provider_id,
        "claim_date": claim_date.isoformat(),
        "diagnosis_code": random.choice(diagnosis_codes),
        "procedure_code": random.choice(procedure_codes),
        "claim_amount": claim_amount,
        "paid_amount": paid_amount,
        "is_emergency": random.choice([True, False]),
        # "Unknown" is listed twice so it is picked more often than any single
        # real location.
        "location": random.choice(locations + ["Unknown"] * 2),
        "submission_mode": random.choice(submission_modes),
        "claim_status": random.choice(claim_statuses),
        "service_start_date": service_start.isoformat(),
        "service_end_date": service_end.isoformat(),
        # fix_len=True forces exactly 10 digits; without it random_number() may
        # return any integer of up to 10 digits, giving short NPI strings.
        "billing_npi": str(fake.random_number(digits=10, fix_len=True))
    }

print("🔄 Generating base claim records...")
claims = []
for _ in range(NUM_CLAIMS):
    claims.append(generate_claim())

print("➕ Injecting nulls and duplicates...")
# Blank one randomly chosen field on a randomly chosen record, repeated
# NULL_RATE * NUM_CLAIMS times. The same cell can be hit more than once.
null_count = int(NULL_RATE * NUM_CLAIMS)
for _ in range(null_count):
    target = random.choice(claims)
    blanked_field = random.choice(list(target))
    target[blanked_field] = ""

# Append exact copies of randomly chosen rows (sampled with replacement), then
# shuffle so the duplicates are not clustered at the end of the file.
duplicate_count = int(DUPLICATE_RATE * NUM_CLAIMS)
duplicates = random.choices(claims, k=duplicate_count)
claims.extend(duplicates)
random.shuffle(claims)

# Save claims.csv; column order follows the key order of the first record.
with open(CLAIMS_CSV, "w", newline="") as csv_file:
    claim_columns = claims[0].keys()
    claim_writer = csv.DictWriter(csv_file, fieldnames=claim_columns)
    claim_writer.writeheader()
    claim_writer.writerows(claims)

claim_rows = len(claims)
patient_rows = len(patients)
provider_rows = len(providers)
print("\n✅ Done!")
print(f"• {CLAIMS_CSV} → {claim_rows} rows")
print(f"• {PATIENTS_CSV} → {patient_rows} rows")
print(f"• {PROVIDERS_JSON} → {provider_rows} providers\n")


🔄 Generating providers.json...
🔄 Generating patients.csv...
🔄 Generating base claim records...
➕ Injecting nulls and duplicates...

✅ Done!
• claims.csv → 204000 rows
• patients.csv → 10000 rows
• providers.json → 1000 providers



In [None]:
from pyspark.sql import SparkSession, DataFrame

# Reuse the active Spark session if one exists, otherwise start one named "MySparkApp".
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .getOrCreate()
)

In [None]:
# Build the file:// URI from CLAIMS_CSV so the read always targets the file the
# generator cell wrote, instead of assuming the working directory is /content.
df_claims = spark.read.csv(
    Path(CLAIMS_CSV).resolve().as_uri(),
    header=True,       # first line holds the column names
    inferSchema=True,  # let Spark infer the column types from the data
)

In [None]:
# Preview the first 20 claim rows, with long cell values truncated.
df_claims.show(n=20, truncate=True)

+--------------------+----------+-----------+----------+--------------+--------------+------------+-----------+------------+-----------+---------------+------------+------------------+----------------+-----------+
|            claim_id|patient_id|provider_id|claim_date|diagnosis_code|procedure_code|claim_amount|paid_amount|is_emergency|   location|submission_mode|claim_status|service_start_date|service_end_date|billing_npi|
+--------------------+----------+-----------+----------+--------------+--------------+------------+-----------+------------+-----------+---------------+------------+------------------+----------------+-----------+
|1cedce0b-0510-423...|  pat_2525|   prov_131|2024-10-05|         F41.1|         20550|      2072.2|    1380.49|        true|Los Angeles|         Online|    Approved|        2024-09-28|      2024-10-04| 7730035281|
|707c9ad3-5737-49e...|  pat_4536|    prov_52|2025-04-11|         M54.5|         20550|     1624.61|    1208.29|        true|    Unknown|        

In [None]:
# Build the file:// URI from PATIENTS_CSV so the read always targets the file the
# generator cell wrote, instead of assuming the working directory is /content.
df_patients = spark.read.csv(
    Path(PATIENTS_CSV).resolve().as_uri(),
    header=True,       # first line holds the column names
    inferSchema=True,  # let Spark infer the column types from the data
)

In [None]:
# Preview the first 20 patient rows, with long cell values truncated.
df_patients.show(n=20, truncate=True)

+----------+----------+----------+-------------+------+--------------------+--------------------+--------------------+------------------+------------------+------------+--------------------+---------------+
|patient_id|first_name| last_name|date_of_birth|gender|        phone_number|               email|             address|              city|insurance_provider|insurance_id|    known_conditions|last_visit_date|
+----------+----------+----------+-------------+------+--------------------+--------------------+--------------------+------------------+------------------+------------+--------------------+---------------+
|     pat_1|      Ryan|Mclaughlin|   1951-12-12| Other|+1-650-256-6781x7190|   ggray@example.com|120 Brandon Mount...|    South Jodyfort|             Aetna|   INS-99606|  Congress, show, TV|     2025-05-11|
|     pat_2|     Holly|      Holt|   1975-01-02|  Male|        716.817.3340|morganward@exampl...|  235 Vanessa Valley|  Port Andrewmouth|             Cigna|   INS-98271|   

In [None]:
# Show the first patient as a full Row, so values truncated in the table view
# above (e.g. address, email) can be read in full.
df_patients.head()

Row(patient_id='pat_1', first_name='Ryan', last_name='Mclaughlin', date_of_birth=datetime.date(1951, 12, 12), gender='Other', phone_number='+1-650-256-6781x7190', email='ggray@example.com', address='120 Brandon Mountain Apt. 474', city='South Jodyfort', insurance_provider='Aetna', insurance_id='INS-99606', known_conditions='Congress, show, TV', last_visit_date=datetime.date(2025, 5, 11))

In [None]:
# providers.json is one pretty-printed JSON array spanning many lines, hence
# the multiline option. The file:// URI is built from PROVIDERS_JSON so the read
# always targets the file the generator cell wrote, instead of assuming /content.
df_providers = (
    spark.read
    .option("multiline", True)
    .json(Path(PROVIDERS_JSON).resolve().as_uri())
)

In [None]:
# Preview the first 20 provider rows, with long cell values truncated.
df_providers.show(n=20, truncate=True)

+----------------------+--------------------+-----------------+--------------------+--------------+-------------+------------------+----------+--------------------+--------------------+-----------+-----------+-----+-------------------+--------+
|accepting_new_patients|             address|             city|               email|license_number|license_state|              name|npi_number|        organization|        phone_number|provider_id|  specialty|state|years_of_experience|zip_code|
+----------------------+--------------------+-----------------+--------------------+--------------+-------------+------------------+----------+--------------------+--------------------+-----------+-----------+-----+-------------------+--------+
|                 false|336 Moss Cliffs\n...|      South Laura|kelsey79@example.org|    LIC2382909|           MP|         Rick Vang|1272565345|          Parker Ltd|     +1-713-322-9838|     prov_1|  Neurology|   UT|                 14|   44487|
|                  t

In [None]:
# Progress marker for the end of the raw-load cells above (claims, patients and
# providers read into Spark DataFrames).
print("Bronze Layer Done")

In [1]:
# NOTE(review): looks like a leftover connectivity check unrelated to the
# pipeline — confirm it can be removed before sharing the notebook.
print("shenan connected")

shenan connected
