In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
%matplotlib inline

Imports the cleaned data set (from HealthCost_data_cleaning notebook)

In [None]:
# Identifier-like columns are read as strings so that values such as the
# procedure id '039' keep their leading zeros.
id_column_dtypes = {
    'provider_id': str,
    'provider_zip_code': str,
    'drg_id': str,
}
df = pd.read_csv('data/IPPS_Data_Clean.csv', dtype=id_column_dtypes)
df.head(3)

In [None]:
# Number of rows in the cleaned data set.
df.shape[0]

Calculates the national median covered charges, median total payments and median Medicare payments for each procedure.

In [None]:
# National median of each charge/payment measure, per procedure (drg_id).
# The columns are selected with a *list*: indexing a GroupBy with a bare tuple
# of names was deprecated in pandas 1.0 and fails on current pandas releases.
measure_cols = [
    'average_covered_charges',
    'average_total_payments',
    'average_medicare_payments',
]
natmed = df.groupby('drg_id', sort=False)[measure_cols].median().reset_index()

# After aggregation the values are medians, so rename average_* -> median_*.
natmed = natmed.rename(columns={
    'average_covered_charges': 'median_covered_charges',
    'average_total_payments': 'median_total_payments',
    'average_medicare_payments': 'median_medicare_payments',
})
natmed.head(3)

In [None]:
# One group of three bars (charges, total payments, medicare payments) per procedure.
natmed_by_procedure = natmed.set_index('drg_id')
natmed_by_procedure.plot.bar(figsize=(25, 12), color=('r', 'b', 'g'), fontsize=14);

Adds columns that are the fractional difference in total charges, payments and medicare payments from the national median (for that particular procedure)

In [None]:
procedures = df.drg_id.unique()

# (new column, source column) pairs, in the order the columns are added to df.
frac_diff_sources = [
    ('charge_frac_diff', 'average_covered_charges'),
    ('payment_frac_diff', 'average_total_payments'),
    ('medicare_frac_diff', 'average_medicare_payments'),
]

# transform('median') broadcasts each procedure's median back onto every row of
# that procedure, so the fractional difference is computed for the whole frame
# at once instead of looping over procedures with a boolean mask per iteration.
by_procedure = df.groupby('drg_id', sort=False)
for frac_col, source_col in frac_diff_sources:
    procedure_median = by_procedure[source_col].transform('median')
    df[frac_col] = df[source_col] / procedure_median - 1.0

df.head(3)

GPS locations of every unique provider

In [None]:
# provider_id is read as a string, matching how it is read in the main data set.
geocode_dtypes = {'provider_id': str}
dfprovgeo = pd.read_csv('data/Providers_Geocode.csv', dtype=geocode_dtypes)
dfprovgeo.head(3)

In [None]:
import gmaps

A heatmap of the provider locations in the US.

In [None]:
# The Google Maps API key is read from the environment rather than being
# committed with the notebook. NOTE(review): the key that used to be hard-coded
# here has been published with the source and should be revoked.
gmaps_api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not gmaps_api_key:
    raise RuntimeError('Set the GOOGLE_MAPS_API_KEY environment variable before running this cell.')
gmaps.configure(api_key=gmaps_api_key)

# One [lat, lng] pair per provider, as the heatmap layer is given below.
data = dfprovgeo[['lat', 'lng']].values.tolist()

m = gmaps.Map()
m.add_layer(gmaps.Heatmap(data=data))
m

In [None]:
# Headline sizes of the data set: distinct procedures, distinct providers and
# the sum of the total_discharges column.
summary_counts = (
    ('Number of procedures:', df.drg_id.nunique()),
    (' Number of providers:', df.provider_id.nunique()),
    ('Number of discharges:', df.total_discharges.sum()),
)
for caption, count in summary_counts:
    print(caption, count)

Number of each procedure by state.

In [None]:
# Row count for every (procedure, state) pair, pivoted so that procedures are
# rows and states are columns; pairs that never occur become 0.
dfg = (
    df.groupby(['drg_id', 'provider_state'])
    .size()
    .unstack('provider_state')
    .fillna(0)
)
dfg.head()

The total number for each procedure.

In [None]:
# Sum across the state columns to get one total per procedure.
total_per_procedure = dfg.sum(axis=1)
total_per_procedure.plot.bar(ylim=0, figsize=(25, 8), fontsize=14);

The total number of procedures in each state

In [None]:
# Sum down the procedure rows to get one total per state.
total_per_state = dfg.sum()
total_per_state.plot.bar(ylim=0, figsize=(25, 8), fontsize=16);

Plot of average covered charges and average total payments for procedure 039 for all providers.

In [None]:
# Rows for procedure 039 only; provider ids are converted to integers so they
# can be used as positions on the x axis.
dfp = df[df.drg_id == '039']
pid = np.array(dfp.provider_id.astype(int))

plt.figure(figsize=(25, 12))
# Each series carries a label so that plt.legend() has entries to show;
# without labels the legend is empty and the two colours are unexplained.
plt.scatter(pid, dfp.average_total_payments, label='average_total_payments')
plt.scatter(pid, dfp.average_covered_charges, color='r', label='average_covered_charges')
plt.xlim([0, 700000])
plt.xlabel('provider_id', fontsize=20)
plt.ylim([0, 160000])

plt.title('Average covered charges and average total payments for procedure 039 for all providers with data',fontsize=20)
plt.legend();

Fractional difference of covered cost compared to national median for two providers by procedure

In [None]:
prov = df.provider_id.unique()

# Bar chart of charge_frac_diff by procedure for the first two provider ids,
# in the order they appear in df.
for provider_idx in range(2):
    dfprov = df[df.provider_id == prov[provider_idx]]

    # Chart title built from the provider's name, city and state (first row).
    first_row = dfprov.iloc[0]
    provider = ', '.join([
        first_row['provider_name'],
        first_row['provider_city'],
        first_row['provider_state'],
    ])

    print('Number of procedures: ',len(dfprov))
    charge_by_procedure = dfprov.set_index('drg_id')[['charge_frac_diff']]
    charge_by_procedure.plot(kind='bar', figsize=(25, 12), title=provider)

Normalized histograms of the fractional differences from the national median for all providers.

In [None]:
fig, ax = plt.subplots()
ax.set_xlim([-2, 6])

# (column, colour) for each overlaid histogram, in legend order.
hist_series = [
    ('charge_frac_diff', 'r'),
    ('payment_frac_diff', 'b'),
    ('medicare_frac_diff', 'g'),
]

# density=True normalises each histogram; it replaces the `normed` keyword,
# which has been removed from matplotlib's hist.
for hist_col, hist_color in hist_series:
    df.hist([hist_col], bins=30, ax=ax, color=hist_color, alpha=.5, density=True)

# df.hist titles the axes with the column name; clear it because the three
# series share one axes and are identified by the legend instead.
ax.set_title('')
ax.legend([hist_col for hist_col, _ in hist_series]);

Adds classification columns that show whether each quantity is at or above (True) or below (False) the national median.

In [None]:
# Boolean flag per measure: True when the value is at or above the national
# median (fractional difference >= 0), False when it is below.
class_columns = (
    ('charge_frac_diff_class', 'charge_frac_diff'),
    ('payment_frac_diff_class', 'payment_frac_diff'),
    ('medicare_frac_diff_class', 'medicare_frac_diff'),
)
for class_col, frac_col in class_columns:
    df[class_col] = df[frac_col].ge(0.0)

Totals up the number of procedures above or below the national median for each provider, and adds a classification column showing whether there are more procedures above or below.

In [None]:
inclass = 'charge_frac_diff_class'

# Per provider, count the rows in each class, then pivot the class into
# columns; a class with no rows for a provider is filled with 0.0.
class_counts = df[['provider_id', inclass]].groupby(['provider_id', inclass]).size()
df_frac = class_counts.unstack(inclass).fillna(0.0)

# frac_class is True when the second class column holds the larger count.
first_class_col, second_class_col = df_frac.columns[0], df_frac.columns[1]
df_frac['frac_class'] = df_frac[second_class_col] > df_frac[first_class_col]
df_frac.head()

Merges the previous dataframe with the providers geocode dataframe so that providers can be plotted by location and coloured by whether more of their procedures are above or below the national median. This also adds a column holding the alpha value used when plotting: the darker the point on the plot, the larger the percentage of that provider's procedures that are above (or below) the national median.

In [None]:
# Attach each provider's geocode row by matching provider_id on the index.
provider_geo = dfprovgeo.set_index('provider_id')
df_frac = df_frac.merge(provider_geo, left_index=True, right_index=True)

# The first two columns are the per-class counts; their row sum is the
# provider's total number of rows.
first_count = df_frac[df_frac.columns[0]]
second_count = df_frac[df_frac.columns[1]]
total_count = df_frac[df_frac.columns[0:2]].sum(axis=1)

# Share of rows in the provider's majority class: the second count where
# frac_class is True, the first count otherwise.
df_frac['alpha'] = np.where(
    df_frac.frac_class == True,
    second_count / total_count,
    first_count / total_count,
)

# Rescale the share from [0.5, 1] to [0, 1] for use as a plotting alpha.
df_frac['alpha'] = 2 * (df_frac['alpha'] - 0.5)
# An exact tie gives alpha 0; use 0.01 so the point is not fully transparent.
df_frac.loc[df_frac['alpha'] == 0.0, 'alpha'] = 0.01

df_frac.head()

Plots of the provider locations where the blue points are providers that have more procedures below the national median and red points are the providers with more procedures above.

In [None]:
def _rgba_colors(channel, alphas):
    """Return an (n, 4) RGBA array with one colour channel set to 1.0 and per-point alpha."""
    colors = np.zeros((len(alphas), 4))
    colors[:, channel] = 1.0
    colors[:, 3] = alphas
    return colors


label = ['More procedures below national median','More procedures above national median']

# Blue points: providers with frac_class False.
sel1 = (df_frac.frac_class == False)
below_points = df_frac[sel1]
b_colors = _rgba_colors(2, below_points.alpha)

# Red points: providers with frac_class True.
sel2 = (df_frac.frac_class == True)
above_points = df_frac[sel2]
r_colors = _rgba_colors(0, above_points.alpha)

fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(14, 26))

# Longitude / latitude window shared by all three panels.
xlim = [-180, -60]
ylim = [10, 70]

# Panel 1: below only. Panel 2: above only. Panel 3: both overlaid.
panels = (
    (ax1, [(below_points, b_colors)], [label[0]]),
    (ax2, [(above_points, r_colors)], [label[1]]),
    (ax3, [(below_points, b_colors), (above_points, r_colors)], label),
)
for axis, layers, legend_labels in panels:
    for points, rgba in layers:
        axis.scatter(points.lng, points.lat, color=rgba, edgecolors='none')
    axis.set_xlim(xlim)
    axis.set_ylim(ylim)
    axis.legend(legend_labels)

print('Number of providers with more procedures below:',len(below_points))
print('Number of providers with more procedures above:',len(above_points))

Determines, for each provider, the fractional difference from the national median in covered charges, total payments and Medicare payments, averaged over all of that provider's procedures.

In [None]:
# Mean fractional difference from the national median per provider, taken over
# all of that provider's rows.
# The columns are selected with a *list*: indexing a GroupBy with a bare tuple
# of names was deprecated in pandas 1.0 and fails on current pandas releases.
frac_diff_cols = ['charge_frac_diff', 'payment_frac_diff', 'medicare_frac_diff']
fracmn = df.groupby('provider_id', sort=False)[frac_diff_cols].mean().reset_index()

# Suffix the columns with _mean so they say what they hold after aggregation.
fracmn = fracmn.rename(columns={
    'charge_frac_diff': 'charge_frac_diff_mean',
    'payment_frac_diff': 'payment_frac_diff_mean',
    'medicare_frac_diff': 'medicare_frac_diff_mean',
})
fracmn.head()

In [None]:
# Provider ids as integers, used as the x position of both series.
provider_numbers = fracmn.provider_id.astype(int)

plt.figure(figsize=(25, 12))
plt.xlim([0, 700000])
plt.ylim([-1, 5])
plt.xlabel('provider_id', fontsize=20)
plt.title('Fractional difference from national median for each provider averaged over all procedures',fontsize=20);

# Default colour: mean charge difference; red: mean payment difference.
plt.scatter(provider_numbers, fracmn.charge_frac_diff_mean)
plt.scatter(provider_numbers, fracmn.payment_frac_diff_mean, color='r')
plt.legend(['charge_frac_diff_mean','payment_frac_diff_mean']);

Since provider IDs are grouped by state, each cluster is labeled with its state.

In [None]:
# First provider_id seen for each state; used as the x position of that
# state's tick label.
stateid = df[['provider_id', 'provider_state']].drop_duplicates(subset='provider_state')

# Extra 'TX' tick at provider_id 670000.
# NOTE(review): presumably Texas ids continue in a second range near 670000 - confirm.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
extra_tick = pd.DataFrame([{'provider_id': '670000', 'provider_state': 'TX'}])
stateid = pd.concat([stateid, extra_tick], ignore_index=True)

plt.figure(figsize=(25, 12))
plt.xlim([0, 700000])
plt.ylim([-1, 5])
plt.xlabel('provider_id', fontsize=20)
plt.title('Fractional difference from national median in total cost for each provider averaged over all procedures (by state)',fontsize=20)
plt.xticks(stateid.provider_id.astype(int), stateid.provider_state, rotation='vertical', fontsize=16)

# Default colour: mean charge difference; red: mean payment difference.
plt.scatter(fracmn.provider_id.astype(int), fracmn.charge_frac_diff_mean)
plt.scatter(fracmn.provider_id.astype(int), fracmn.payment_frac_diff_mean, color='r')
plt.legend(['charge_frac_diff_mean','payment_frac_diff_mean']);