In [1]:
!pip install joblib



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from category_encoders import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import sklearn

from joblib import dump, load

In [3]:
print(sklearn.__version__)

0.20.3


# Data Cleaning

In [5]:
# # unzip files
# !unzip Other/Inpatient_Data_2011_CSV.zip
# !unzip Other/Inpatient_Data_2012_CSV.zip
# !unzip Other/Inpatient_Data_2013_CSV.zip
# !unzip Other/Inpatient_Data_2014_CSV.zip
# !unzip Other/Inpatient_Data_2015_CSV.zip
# !unzip Other/Inpatient_Data_2016_CSV.zip

In [8]:
# load data into pandas
top_2011 = pd.read_csv('Medicare_Provider_Charge_Inpatient_DRG100_FY2011.csv')
top_2012 = pd.read_csv('Medicare_Provider_Charge_Inpatient_DRG100_FY2012.csv')
top_2013 = pd.read_csv('Medicare_Provider_Charge_Inpatient_DRG100_FY2013.csv')
og_2014 = pd.read_csv('Medicare_Provider_Charge_Inpatient_DRGALL_FY2014.csv')
og_2015 = pd.read_csv('Medicare_Provider_Charge_Inpatient_DRGALL_FY2015.csv')
og_2016 = pd.read_csv('Medicare_Provider_Charge_Inpatient_DRGALL_FY2016.csv')

In [9]:
# define top100 list
top100 = top_2011['DRG Definition'].unique().tolist()

In [10]:
# get diagnoses from top 100
top_2014 = og_2014[og_2014['DRG Definition'].isin(top100)]
top_2015 = og_2015[og_2015['DRG Definition'].isin(top100)]
top_2016 = og_2016[og_2016['DRG Definition'].isin(top100)]

In [11]:
# add year feature
top_2011['year'] = [2011] * top_2011.shape[0]
top_2012['year'] = [2012] * top_2012.shape[0]
top_2013['year'] = [2013] * top_2013.shape[0]
top_2014['year'] = [2014] * top_2014.shape[0]
top_2015['year'] = [2015] * top_2015.shape[0]
top_2016['year'] = [2016] * top_2016.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [12]:
# 2016 has a '$' in front of `Average Total Payments`
def strip_dollar(entry):
    return float((entry.strip('$')).replace(',', ''))

top_2016['Average Total Payments'] = top_2016['Average Total Payments'].apply(strip_dollar)
top_2016['Average Medicare Payments'] = top_2016['Average Medicare Payments'].apply(strip_dollar)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:
# concatenate dataframes
dataframes = [top_2011, top_2012, top_2013, top_2014, top_2015, top_2016]
top100 = pd.concat(dataframes)

In [14]:
# change name
top100['diagnosis'] = top100['DRG Definition']
top100 = top100.drop(columns='DRG Definition')

top100['state'] = top100['Provider State']
top100 = top100.drop(columns='Provider State')

In [15]:
# drop columns
top100_clean = top100.drop(columns=['Provider Id',
                     'Provider Name',
                     'Provider Street Address',
                     'Provider City',
                     'Provider Zip Code',
                     'Hospital Referral Region (HRR) Description',
                     'Total Discharges',
                     'Average Covered Charges'])

In [16]:
# create out-of-pocket payment feature
top100_clean['cost'] = top100_clean['Average Total Payments'] - top100_clean['Average Medicare Payments']

# now drop other payment features
top100 = top100_clean.drop(columns=['Average Total Payments',
                                    'Average Medicare Payments'])

In [17]:
# get rid of beginning
def strip_beginning(entry):
    return str(entry)[6:]

top100['diagnosis'] = top100['diagnosis'].apply(strip_beginning)

In [18]:
# get rid of endings
endings = [' W/O CC/MCC',
           ' W/O MCC',
           ' W MCC',
           ' W CC',
           ' W/O CC',
           ' W CC/MCC']

def strip_endings(entry):
    for ending in endings:
        if entry.endswith(ending):
            return entry.replace(ending, '')
            
top100['diagnosis'] = top100['diagnosis'].apply(strip_endings)

In [19]:
# categorize diagnoses
diagnoses = top100['diagnosis'].unique().tolist()

# neuro
neuro = diagnoses[:3] + diagnoses[4:6]

# respiratory
respiratory = diagnoses[6:11]

# circulatory
circulatory = diagnoses[11:25]

# digestive
digestive = diagnoses[25:33] + diagnoses[44:47]

# orthopedic
orthopedic = diagnoses[33:41] + diagnoses[53:]

In [20]:
def categorize(diagnosis):
    if diagnosis in neuro:
        return 'Neurological'
    elif diagnosis in respiratory:
        return 'Respiratory'
    elif diagnosis in circulatory:
        return 'Circulatory'
    elif diagnosis in digestive:
        return 'Digestive'
    elif diagnosis in orthopedic:
        return 'Orthopedic'
    else:
        return 'Other'

In [21]:
top100['diagnosis'] = top100['diagnosis'].apply(categorize)

In [22]:
top100.head()

Unnamed: 0,year,diagnosis,state,cost
0,2011,Neurological,AL,1013.505494
1,2011,Neurological,AL,810.857143
2,2011,Neurological,AL,981.166666
3,2011,Neurological,AL,1288.4
4,2011,Neurological,AL,806.888889


# Train/Test Split

In [23]:
# train/test split
train = pd.concat([top100[top100['year'] == 2011],
           top100[top100['year'] == 2012],
           top100[top100['year'] == 2013],
           top100[top100['year'] == 2014],
           top100[top100['year'] == 2015]])

test = top100[top100['year'] == 2016]

In [24]:
# define target and features
target = 'cost'
features = ['diagnosis', 'state']

In [25]:
# X matrices
X_train = train[features]
X_test = test[features]

# y vector
y_train = train[target]
y_test = test[target]

In [26]:
X_test.head()

Unnamed: 0,diagnosis,state
3142,Neurological,AL
3143,Neurological,AL
3144,Neurological,AL
3145,Neurological,AL
3146,Neurological,AL


# Encode and Impute

In [27]:
# # instantiate encoder
# encoder = OrdinalEncoder()
# X_train_encoded = encoder.fit_transform(X_train)
# X_test_encoded = encoder.transform(X_test)

# # dump encoder
# dump(encoder, 'medicare_encoder.joblib')

# # instantiate imputer
# imputer = SimpleImputer(np.nan, strategy='median')
# X_train = pd.DataFrame(imputer.fit_transform(X_train_encoded), columns = X_train_encoded.columns.values)
# X_test = pd.DataFrame(imputer.fit_transform(X_test_encoded), columns = X_test_encoded.columns.values)

# # fill missing y-values
y_train = y_train.fillna(y_train.median())
y_test = y_test.fillna(y_test.median())

In [28]:
X_test.describe()

Unnamed: 0,diagnosis,state
count,127035,127035
unique,6,51
top,Digestive,CA
freq,31336,10187


# RF

In [29]:
# define error metric
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [30]:
pipeline = make_pipeline(OrdinalEncoder(),
                         SimpleImputer(),
                         RandomForestRegressor())

In [31]:
pipeline.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('ordinalencoder', OrdinalEncoder(cols=['diagnosis', 'state'], drop_invariant=False,
        handle_missing='value', handle_unknown='value',
        mapping=[{'col': 'diagnosis', 'mapping': Neurological    1
Other           2
Respiratory     3
Circulatory     4
Digestive       5
Orthopedic   ...ators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [32]:
y_pred = pipeline.predict(X_test)

In [33]:
print("MAE:", mean_absolute_error(y_test, y_pred))

MAE: 681.0657434829487


In [34]:
def predict(diagnosis, state):
    df = pd.DataFrame(
        columns=['diagnosis', 'state'], 
        data=[[diagnosis, state]]
    )
    y_pred = pipeline.predict(df)[0]
    return f'${y_pred:.0f}'

In [35]:
predict('Circulatory', 'AL')

'$1188'

## Dump

In [36]:
from joblib import dump, load

In [37]:
dump(pipeline, 'medicare.joblib')

['medicare.joblib']

In [38]:
X_train.head()

Unnamed: 0,diagnosis,state
0,Neurological,AL
1,Neurological,AL
2,Neurological,AL
3,Neurological,AL
4,Neurological,AL
