In [4]:
from faker import Faker
import json
import csv
import pandas as pd
import random

# Initialize Faker
fake = Faker()

json_folder = 'data/json_files/'
csv_folder = 'data/csv-files/'

In [6]:
def generate_pii_json(num_patients=100, max_claims_per_patient=5):
    data = {
        "patients": [],
        "claims": []
    }
    
    # Generate patient data
    for _ in range(num_patients):
        patient_id = fake.uuid4()
        patient_data = {
            "patient_id": patient_id,
            "name": fake.name(),
            "date_of_birth": fake.date_of_birth(minimum_age=18, maximum_age=90).strftime("%Y-%m-%d"),
            "address": fake.address()
        }
        data["patients"].append(patient_data)
        
        # Generate claims for this patient
        num_claims = random.randint(1, max_claims_per_patient)
        for _ in range(num_claims):
            claim_data = {
                "claim_id": fake.uuid4(),
                "amount": round(random.uniform(100, 5000), 2),  # Random amount between 100 and 5000
                "patient_id": patient_id
            }
            data["claims"].append(claim_data)

    return data

In [2]:
def generate_pii_csv(output_file: str, num_records: int):
    """
    Generate a CSV file with fake PII data.
    :param output_file: Path to the CSV file to create.
    :param num_records: Number of records to generate.
    """
    fake = Faker()

    # Define the columns for the CSV
    columns = [
        "first_name", 
        "last_name", 
        "birth_date", 
        "address", 
        "city", 
        "state", 
        "zip_code", 
        "email", 
        "phone_number", 
        "ssn", 
        "drivers_license"
    ]

    # Open the output file and write the data
    with open(output_file, mode='w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=columns)
        writer.writeheader()  # Write the header row

        for _ in range(num_records):
            # Generate a row of fake data
            record = {
                "first_name": fake.first_name(),
                "last_name": fake.last_name(),
                "birth_date": fake.date_of_birth(minimum_age=18, maximum_age=90).strftime("%Y-%m-%d"),
                "address": fake.street_address(),
                "city": fake.city(),
                "state": fake.state(),
                "zip_code": fake.zipcode(),
                "email": fake.email(),
                "phone_number": fake.phone_number(),
                "ssn": fake.ssn(),
                "drivers_license": fake.license_plate()  # Faker does not have a driver's license, but we can use license_plate
            }

            # Write the record to the CSV file
            writer.writerow(record)

    print(f"CSV file '{output_file}' with {num_records} records has been created.")

In [None]:
# Generate data with 100 patients and up to 5 claims per patient
data = generate_pii_json(num_patients=10, max_claims_per_patient=5)

# Save to JSON file
with open(f"{json_folder}patients_claims_data.json", "w") as json_file:
    json.dump(data, json_file, indent=4)

# Print sample data
print(json.dumps(data, indent=4))

{
    "patients": [
        {
            "patient_id": "139f40a8-da41-4a5f-b4c1-7e38b2748492",
            "name": "Amber Craig",
            "date_of_birth": "1999-08-16",
            "address": "3604 Norris Junction\nSouth Ericaville, IA 31387"
        },
        {
            "patient_id": "9f81529d-6603-484d-8e8e-507f8e23676d",
            "name": "Ryan Park",
            "date_of_birth": "1968-02-04",
            "address": "8434 Rivers Lock Suite 797\nWest Tanya, PR 52810"
        },
        {
            "patient_id": "dd395454-3ad6-469c-ab51-c55422bea4ff",
            "name": "Eric Davenport",
            "date_of_birth": "1986-01-14",
            "address": "787 Alejandro Springs Suite 533\nLindsayport, CT 20635"
        },
        {
            "patient_id": "7700de5b-d22c-481f-9369-9b6b418b00a9",
            "name": "Gabriella Campbell",
            "date_of_birth": "1974-03-27",
            "address": "1428 Hubbard Dale\nSouth Michael, NV 00659"
        },
        {
      

In [8]:
# Generating CSV PII data
generate_pii_csv(f"{csv_folder}patients_pii_data.csv", 100)

CSV file 'data/csv-files/patients_pii_data.csv' with 100 records has been created.


In [None]:
from utils.load_to_s3 import upload_files

upload_files(f"{csv_folder}patients_pii_data.csv", 'pii-data-1129', 'patients_pii_data.csv')

File was successfully uploaded to the bucket called pii-data-1129 with the name data/csv-files/patients_pii_data.csv.


In [5]:
#Reading CSV PII data file from local folder
pii_df = pd.read_csv(f"{csv_folder}patients_pii_data.csv")
pii_df.head()

Unnamed: 0,first_name,last_name,birth_date,address,city,state,zip_code,email,phone_number,ssn,drivers_license
0,Carla,Le,1960-02-18,17140 Bryan Turnpike Suite 604,Brittanystad,Rhode Island,39660,jacksonrose@example.com,+1-423-535-1326,310-02-1588,NO6 O8O
1,Sarah,Bailey,1941-10-26,805 Ashley Crossing,Port Walterland,New Hampshire,14015,hlawson@example.org,964.758.5419,481-53-8785,825-VHH
2,Michael,Grant,1976-04-01,454 Regina Squares,New Jason,Wyoming,24975,fhill@example.com,550-455-1498,475-77-4644,RT 07339
3,David,Crawford,1944-11-14,9965 Helen Manors,North Marybury,South Carolina,19701,ipace@example.net,+1-844-468-7557x74885,119-81-5536,QTV-3149
4,Jessica,Harris,1954-12-08,5724 Tom Roads Suite 679,East Jessica,Washington,71216,carrie53@example.com,(577)653-8717x16128,841-26-6618,328-TWK
