In [1]:
import pandas as pd

# Widen pandas' display limits so the generated tables are shown in full
# (up to 500 rows / 500 columns, 1000 characters per line).
pd.set_option(
    "display.max_rows", 500,
    "display.max_columns", 500,
    "display.width", 1000,
)

<!-- #region -->
# Claims Data Case Study

This notebook shows how the data fabricator utility can be used to generate commercial claims events data (the fact table) together with all of its associated dimension tables.


## Entity relationship diagram

![ER Diagram](images/claims_case_study_erd.svg)


## Table definition

Patient: Dimension table containing patient information.

| Column name    | Column logic                               |
|----------------|--------------------------------------------|
| patient_id     | unique ID prefixed with pt_                |
| patient_gender | could contain one of the 3 values: m, f, u |
| birth_year     | year                                       |


Provider: Dimension table containing provider information.

| Column name     | Column logic                       |
|-----------------|------------------------------------|
| provider_id     | unique id; prefixed with phys_     |
| first_name      | String value                       |
| last_name       | String value                       |
| state           | String value from a particular set |
| zip             | US zip codes                       |
| speciality_code | Value from a list of specialties   |


Diagnosis: Dimension table containing diagnosis details.

| Column name      | Column logic                                           |
|------------------|--------------------------------------------------------|
| diagnosis_code   | one letter from CIJFGK then digits, e.g. "C12.34"      |
| icd_version_type | number: 1,2 or -1                                      |


Procedure: Dimension table containing procedure code information.

| Column name         | Column logic                             |
|---------------------|------------------------------------------|
| procedure_code      | number in range of 00100 - 99999; unique |
| procedure_code_desc | static string                            |
| product_group       | value from list: group1, group2…group5   |


Events: Fact table containing patient claims events.

| Column name          | Column logic                                                                      |
|----------------------|-----------------------------------------------------------------------------------|
| claim_id             | unique_id; range from 0-20000; id length - 5                                      |
| provider_id          | provider_id from provider table                                                   |
| patient_id           | patient_id from patient table                                                     |
| procedure_code       | procedure_code from procedure table                                               |
| diagnosis_code       | diagnosis code from diagnosis table                                               |
| event_date           | date between 2019-01-01 and 2021-01-01                                            |
| record_creation_date | Date; One day after the event_date                                                |
| copay_amt            | Integer between 0-500                                                             |
<!-- #endregion -->

In [2]:
import yaml
from data_fabricator.v1.nodes.hydra import hydra_instantiate_dictionary

# Data fabricator configuration for the claims case study: four dimension
# tables (patient, provider, diagnosis, procedure) and the `event` fact table,
# whose provider/patient/procedure/diagnosis columns are drawn from those
# dimension tables.
# NOTE: column names must be unique within a table. A repeated mapping key is
# silently replaced by the later entry when the YAML is loaded, which is how the
# event table's `claim_id` column was previously lost (it was declared as a
# second `procedure_code`).
yaml_string = """
tables:
- _target_: data_fabricator.v1.core.mock_generator.create_table
  name: patient
  num_rows: 10
  columns:
    patient_id:
      _target_: data_fabricator.v1.core.mock_generator.UniqueId
      prefix: pt_
      id_start_range: 0
      id_end_range: 5000
      id_length: 10
      _metadata_: {
        "description": 'unique ID prefixed with "pt_"'
      }
    patient_gender:
      _target_: data_fabricator.v1.core.mock_generator.ValuesFromSamples
      sample_values: ["m", "f", "u"]
      _metadata_: {
        "description": 'could contain one of the 3 values: m, f, u'
      }
    birth_year:
      _target_: data_fabricator.v1.core.mock_generator.Faker
      provider: year
      # Setting seed is not recommended for general use, please consider when to use seed
      faker_seed: 1
      _metadata_: {
        "description": 'year'
      }
- _target_: data_fabricator.v1.core.mock_generator.create_table
  name: provider
  num_rows: 10
  columns:
    provider_id:
      _target_: data_fabricator.v1.core.mock_generator.UniqueId
      prefix: phys_
      id_start_range: 0
      id_end_range: 1000
      id_length: 10
      _metadata_: {
        "description": 'unique id; prefixed with "phys"'
      }
    first_name:
      _target_: data_fabricator.v1.core.mock_generator.Faker
      provider: pystr_format
      provider_args:
        string_format: "?????????"
        letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      # Setting seed is not recommended for general use, please consider when to use seed
      faker_seed: 1
      _metadata_: {
        "description": 'String value'
      }
    last_name:
      _target_: data_fabricator.v1.core.mock_generator.Faker
      provider: pystr_format
      provider_args:
        string_format: "?????????"
        letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      # Setting seed is not recommended for general use, please consider when to use seed
      faker_seed: 1
      _metadata_: {
        "description": 'String value'
      }
    state:
      _target_: data_fabricator.v1.core.mock_generator.ValuesFromSamples
      sample_values: ["MA", "CA", "NY", "CT", "DC", "IL"]
      _metadata_: {
        "description": 'String value from a particular set'
      }
    zip:
      _target_: data_fabricator.v1.core.mock_generator.Faker
      provider: postcode
      localisation: en_US
      # Setting seed is not recommended for general use, please consider when to use seed
      faker_seed: 1
      _metadata_: {
        "description": 'US zip codes'
      }
    speciality_code:
      _target_: data_fabricator.v1.core.mock_generator.ValuesFromSamples
      sample_values:
        - Cardiology
        - Anesthesiology
        - Dermatology
        - Gastroenterology
        - Pulmonology
        - Urology
        - Neurology
        - Immunology
        - Ophthalmology
      _metadata_: {
        "description": 'Values from a list of specialty'
      }
- _target_: data_fabricator.v1.core.mock_generator.create_table
  name: diagnosis
  num_rows: 10
  columns:
    diagnosis_code:
      _target_: data_fabricator.v1.core.mock_generator.Faker
      provider: pystr_format
      provider_args:
        string_format: "?##.##"
        letters: "CIJFGK"
      # Setting seed is not recommended for general use, please consider when to use seed
      faker_seed: 1
      _metadata_: {
        "description": 'string of format "xx.xx" containing specific letters'
      }
    icd_version_type:
      _target_: data_fabricator.v1.core.mock_generator.ValuesFromSamples
      sample_values: [-1,1,2]
      dtype: Int64
      _metadata_: {
        "description": 'number: 1,2 or -1'
      }
- _target_: data_fabricator.v1.core.mock_generator.create_table
  name: procedure
  num_rows: 10
  columns:
    procedure_code:
      _target_: data_fabricator.v1.core.mock_generator.UniqueId
      id_start_range: 100
      id_end_range: 99999
      id_length: 5
      _metadata_: {
        "description": 'number in range of 00100 - 99999; unique'
      }
    procedure_code_desc:
      _target_: data_fabricator.v1.core.mock_generator.Faker
      provider: pystr_format
      provider_args:
        string_format: "?????????"
        letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      # Setting seed is not recommended for general use, please consider when to use seed
      faker_seed: 1
      _metadata_: {
        "description": 'static string'
      }
    product_group:
      _target_: data_fabricator.v1.core.mock_generator.ValuesFromSamples
      sample_values: ["group1","group2","group3","group4","group5"]
      _metadata_: {
        "description": 'value from list: group1, group2…group5'
      }
- _target_: data_fabricator.v1.core.mock_generator.create_table
  name: event
  num_rows: 10
  columns:
    claim_id:
      _target_: data_fabricator.v1.core.mock_generator.UniqueId
      id_start_range: 0
      id_end_range: 20000
      id_length: 5
      _metadata_: {
        "description": 'unique_id; range from 0-20000; id length - 5'
    }
    provider_id:
      _target_: data_fabricator.v1.core.mock_generator.RowApply
      list_of_values: "provider.provider_id"
      row_func: 'lambda x: x'
      _metadata_: {
        "description": 'provider_id from provider table'
    }
    patient_id:
      _target_: data_fabricator.v1.core.mock_generator.RowApply
      list_of_values: "patient.patient_id"
      row_func: 'lambda x: x'
      _metadata_: {
        "description": 'patient_id from patient table'
    }
    procedure_code:
      _target_: data_fabricator.v1.core.mock_generator.RowApply
      list_of_values: "procedure.procedure_code"
      row_func: 'lambda x: x'
      _metadata_: {
        "description": 'procedure_code from procedure table'
    }
    diagnosis_code:
      _target_: data_fabricator.v1.core.mock_generator.RowApply
      list_of_values: "diagnosis.diagnosis_code"
      row_func: 'lambda x: x'
      _metadata_: {
        "description": 'diagnosis code from diagnosis table'
    }
    event_date:
      _target_: data_fabricator.v1.core.mock_generator.Date
      start_dt: "2019-01-01"
      end_dt: "2021-01-1"
      freq: "M"
      _metadata_: {
        "description": 'date between 2019-01-01 and 2021-01-01'
    }
    record_creation_date:
      _target_: data_fabricator.v1.core.mock_generator.RowApply
      list_of_values: "event.event_date"
      row_func: 'lambda x: x + datetime.timedelta(days=1)'
      _metadata_: {
        "description": 'Date; One day after the event_date'
    }
    copay_amt :
      _target_: data_fabricator.v1.core.mock_generator.RandomNumbers
      start_range: 0
      end_range: 500
      dtype: Int64
      _metadata_: {
        "description": 'Integer between 0-500'
    }
"""
# Parse the YAML text into plain Python containers (dicts, lists, scalars).
config = yaml.safe_load(yaml_string)

# Turn the parsed config into instantiated objects via the `_target_` entries.
# NOTE(review): the exact behaviour of hydra_instantiate_dictionary lives in
# data_fabricator and is not visible here; the result is later read as
# tables["tables"] when building the mock generator.

tables = hydra_instantiate_dictionary(config)

The data fabricator configuration will look like:

In [3]:
# Echo the raw YAML text so the configuration used above is visible in the output.
print(yaml_string)


tables:
- _target_: data_fabricator.v1.core.mock_generator.create_table
  name: patient
  num_rows: 10
  columns:
    patient_id:
      _target_: data_fabricator.v1.core.mock_generator.UniqueId
      prefix: pt_
      id_start_range: 0
      id_end_range: 5000
      id_length: 10
      _metadata_: {
        "description": 'unique ID prefixed with "pt_"'
      }
    patient_gender:
      _target_: data_fabricator.v1.core.mock_generator.ValuesFromSamples
      sample_values: ["m", "f", "u"]
      _metadata_: {
        "description": 'could contain one of the 3 values: m, f, u'
      }
    birth_year:
      _target_: data_fabricator.v1.core.mock_generator.Faker
      provider: year
      # Setting seed is not recommended for general use, please consider when to use seed
      faker_seed: 1
      _metadata_: {
        "description": 'year'
      }
- _target_: data_fabricator.v1.core.mock_generator.create_table
  name: provider
  num_rows: 10
  columns:
    provider_id:
      _target_: dat

The data will look like:

In [4]:
from data_fabricator.v1.core.mock_generator import MockDataGenerator
from tabulate import tabulate

# A fixed seed is used here only so the printed tables are repeatable; seeding
# is not recommended for general use, so decide deliberately when to set it.
mock_generator = MockDataGenerator(tables=tables["tables"], seed=1)
mock_generator.generate_all()

# Print each generated table as a psql-style grid: a "Table: <name>" heading,
# the grid itself, then two blank lines as a separator.
for table_name in mock_generator.tables:
    df = mock_generator.tables[table_name].dataframe
    print(
        f"Table: {table_name}",
        tabulate(df, headers=df.columns, tablefmt="psql"),
        "\n",
        sep="\n",
    )

Table: patient
+----+--------------+------------------+--------------+
|    | patient_id   | patient_gender   |   birth_year |
|----+--------------+------------------+--------------|
|  0 | pt_0000516   | m                |         1979 |
|  1 | pt_0000965   | m                |         2008 |
|  2 | pt_0001100   | u                |         2021 |
|  3 | pt_0001719   | f                |         1974 |
|  4 | pt_0002089   | u                |         1987 |
|  5 | pt_0003109   | m                |         1978 |
|  6 | pt_0003682   | f                |         2003 |
|  7 | pt_0003868   | u                |         2021 |
|  8 | pt_0004058   | m                |         2000 |
|  9 | pt_0004662   | u                |         2002 |
+----+--------------+------------------+--------------+


Table: provider
+----+---------------+--------------+-------------+---------+-------+-------------------+
|    | provider_id   | first_name   | last_name   | state   |   zip | speciality_code   |
|--