In [None]:
!sudo bash -c 'curl -L https://github.com/dolthub/dolt/releases/latest/download/install.sh | sudo bash'
!dolt clone dolthub/quest-v3
!dolt sql -q "select billing_code_type, billing_code, billing_code_modifier, reporting_entity_name, negotiated_rate, npi from rate join npi_rate on npi_rate.rate_id = rate.id join code on code.id = rate.code_id join price_metadata on price_metadata.id = rate.price_metadata_id join insurer on insurer.id = rate.insurer_id" -r csv >> outputwnpi.csv

In [44]:
import polars as pl
from polars import col

In [45]:
# Directory the CSV inputs are read from. The shell cells in this notebook write
# their outputs into the current working directory.
save_dir = './quest-v3-redux' # change this to './' if the files were saved in the notebook's own directory

In [46]:
# Load the exported rates; column dtypes are inferred from up to 10,000 rows.
df = pl.read_csv(f'{save_dir}/outputwnpi.csv', infer_schema_length = 10_000)

In [47]:
# Drop exact duplicate rows.
df = df.unique()

In [69]:
# Negotiated rates suspected to be payer-specific placeholder values rather than
# real prices. Payer attributions are the author's guesses; "(?)" marks the
# uncertain ones.
placeholder_prices = [
    999999.99,  # Sierra Health...
    699999.99,  # Blue Cross
    99999.99,   # UMR
    88888.88,   # United, Medica, Oxford
    49999.5,
    39999.6,    # Rocky Mountain Health placeholder value
    8720.0,     # Aetna
    811.0,      # Anthem (?)
    458.0,      # Anthem (?)
    140.0,      # ?
    0.01,       # Aetna (?) (internal?)
    0.02,       # (?)
    0,
]

# Keep only the rows whose rate is not one of the placeholder values.
_not_placeholder = ~col('negotiated_rate').is_in(placeholder_prices)
df = df.filter(_not_placeholder)

Two billing codes consistently come up as confusingly expensive in this analysis, and I'm not sure why. CPT 36415 and 36416 are simple blood draws that are usually bundled with other codes rather than billed separately. I'm going to filter them out for the time being.

In [70]:
# Exclude the two blood-draw CPT codes; the codes are matched as strings.
df = df.filter(~col('billing_code').is_in(['36416', '36415']))

In [71]:
def compute_means_and_ratios(df: pl.DataFrame) -> pl.DataFrame:
    """Attach a per-code reference rate and each row's ratio to it.

    Parameters
    ----------
    df
        Frame containing the columns ``billing_code_type``, ``billing_code``,
        ``billing_code_modifier`` and ``negotiated_rate``.

    Returns
    -------
    pl.DataFrame
        ``df`` with two columns added:

        * ``rate_mean`` -- mean ``negotiated_rate`` over all rows sharing the
          same (type, code, modifier) combination.
        * ``multiplier`` -- ``negotiated_rate / rate_mean``.

        Columns of the same name already in ``df`` are replaced, so the
        function can be re-applied after rows have been filtered out.
    """
    code_key = ['billing_code_type', 'billing_code', 'billing_code_modifier']
    return (df
      # `with_columns` replaces the deprecated singular `with_column`.
      .with_columns([
          pl.mean('negotiated_rate').over(code_key).alias('rate_mean')
      ])
      # Separate call: `rate_mean` has to exist before it can be referenced.
      .with_columns([
          (col('negotiated_rate')/col('rate_mean')).alias('multiplier')
      ]))

In [72]:
# First pass: adds the rate_mean and multiplier columns.
df = compute_means_and_ratios(df)

Let's get rid of any prices that are too low (at or below 1% of the mean for their code), which might be skewing our mean downwards. This makes our analysis more robust -- by making the average price as high as is reasonably possible, we can say more confidently that prices far above it are truly outliers.

In [73]:
# Keep only rates above 1% of their code's mean rate.
df = df.filter(col('multiplier') > .01)

We'll need to compute the means and ratios again.

In [74]:
# Second pass: recompute rate_mean and multiplier on the filtered rows.
df = compute_means_and_ratios(df)

Now let's look at the hospitals (identified by NPI) that appear most often among the highest rates. We'll filter down to rates which are over 20x the average and count the distinct billing codes for each NPI.

In [75]:
# Rank NPIs by how many distinct codes they have priced at more than 20x the mean.
(df
 .filter(col('multiplier') > 20)
 .select(['npi', 'billing_code_type', 'billing_code', 'billing_code_modifier'])
 .unique()                # one row per (NPI, code) combination
 .get_column('npi')
 .value_counts()          # frame with columns `npi` and `counts`
 .sort('counts')          # ascending, so the most frequent NPIs end up last
 .tail(10)                # the 10 most frequent NPIs
)

npi,counts
i64,u32
1831140979,9
1649273434,9
1285798918,9
1699770149,9
1326079534,10
1154618742,10
1881697878,13
1902897820,15
1194706655,47
1780761866,49


Let's make this easier to understand by joining this with NPPES, the database of NPIs with provider information.

In [None]:
!wget https://download.cms.gov/nppes/NPPES_Data_Dissemination_January_2023.zip
!unzip NPPES_Data_Dissemination_January_2023.zip

In [56]:
# Lazily scan the NPPES provider file; it is materialized later via .collect().
npi = pl.scan_csv(f'{save_dir}/npidata_pfile_20050523-20230108.csv', infer_schema_length = 10_000)

In [57]:
# Narrow the provider file to the NPI, entity type, organization name,
# practice-location address fields and first taxonomy code.
npi = npi.select(['NPI', 
            'Entity Type Code',
            'Provider Organization Name (Legal Business Name)', 
            'Provider First Line Business Practice Location Address', 
            'Provider Business Practice Location Address City Name', 
            'Provider Business Practice Location Address State Name',
            'Healthcare Provider Taxonomy Code_1',])

In [58]:
# Execute the lazy query and materialize the selected columns in memory.
npi = npi.collect()

In [88]:
# Same ranking as above, widened to the top 20 NPIs and labelled with NPPES
# provider details.
exp_hosps = (df
 .filter(col('multiplier') > 20)
 .select(['npi', 'billing_code_type', 'billing_code', 'billing_code_modifier'])
 .unique()                # one row per (NPI, code) combination
 .get_column('npi')
 .value_counts()          # frame with columns `npi` and `counts`
 .sort('counts')          # ascending, so the most frequent NPIs end up last
 .tail(20)                # the 20 most frequent NPIs
 .join(npi, left_on = 'npi', right_on = 'NPI')
 .sort('counts')          # restore ascending order after the join
 .rename({'counts': 'number_disinct_codes_gt_20_times_mean_rate'})
)

In [89]:
# Plain-text rendering of the joined ranking table.
print(exp_hosps)

shape: (20, 8)
┌───────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
│ npi   ┆ number_dis ┆ Entity     ┆ Provider   ┆ Provider   ┆ Provider   ┆ Provider   ┆ Healthcare │
│ ---   ┆ inct_codes ┆ Type Code  ┆ Organizati ┆ First Line ┆ Business   ┆ Business   ┆ Provider   │
│ i64   ┆ _gt_20_tim ┆ ---        ┆ on Name    ┆ Business   ┆ Practice   ┆ Practice   ┆ Taxonomy   │
│       ┆ es...      ┆ str        ┆ (Lega...   ┆ Pra...     ┆ Locat...   ┆ Locat...   ┆ Cod...     │
│       ┆ ---        ┆            ┆ ---        ┆ ---        ┆ ---        ┆ ---        ┆ ---        │
│       ┆ u32        ┆            ┆ str        ┆ str        ┆ str        ┆ str        ┆ str        │
╞═══════╪════════════╪════════════╪════════════╪════════════╪════════════╪════════════╪════════════╡
│ 17806 ┆ 5          ┆ 2          ┆ QUEST DIAG ┆ 33608      ┆ SAN JUAN   ┆ CA         ┆ 291U00000X │
│ 20526 ┆            ┆            ┆ NOSTICS    ┆ ORTEGA HWY ┆ CAPISTRANO ┆  

The last hospital, Havasu Regional, has the highest number of lab tests with a cost ratio of over 20x the mean price. We can look more closely at those rates by filtering down the first dataframe.

In [80]:
# exp_hosps is sorted ascending by count, so its last row is the NPI with the
# most codes priced above 20x the mean.
exp_npi = exp_hosps['npi'][-1]
# Show every rate for that NPI, ordered from lowest to highest multiplier.
df.filter(col('npi') == exp_npi).sort('multiplier')