In [1]:
import numpy as np
import pandas as pd

In [2]:
# Source archive: https://meps.ahrq.gov/mepsweb/data_files/pufs/h192ssp.zip
# Variable documentation for this file:
# https://meps.ahrq.gov/data_stats/download_data/pufs/h192/h192doc.shtml#Utilization25112
# The file is read as SAS XPORT; the format is passed explicitly rather than inferred.
raw_path = 'h192.ssp'
df = pd.read_sas(raw_path, format='xport')

In [3]:
# Response columns: these counts are summed per row to form the utilization outcome.
utilization_vars = [
    'OBTOTV16', 'OBCHIR16', 'OBNURS16', 'OBOPTO16', 'OBASST16', 'OBTHER16',
    'OPTOTV16', 'AMCHIR16', 'AMNURS16', 'AMOPTO16', 'AMASST16', 'AMTHER16',
    'ERTOT16', 'IPDIS16', 'IPNGTD16', 'DVTOT16', 'HHTOTD16', 'RXTOT16',
]

# Features treated as categorical (expanded into dummy variables later on).
categorical_features = [
    'REGION53', 'RACEV2X', 'HISPANX', 'MARRY53X', 'ACTDTY53', 'HONRDC53',
    'LANGSPK', 'FILEDR16', 'PREGNT53', 'WLKLIM53', 'WLKDIF53', 'AIDHLP53',
    'SOCLIM53', 'COGLIM53', 'WRGLAS42', 'EMPST53', 'MORJOB53', 'OCCCT53H',
    'INDCT53H',
]

# Features kept as numeric columns.
quantitative_features = [
    'AGE53X', 'EDUCYR', 'HIDEG', 'FAMINC16', 'RTHLTH53', 'MNHLTH53', 'NOINSTM',
]

In [4]:
# --- data cleaning ---
# Keep only the columns used downstream.
feature_columns = categorical_features + quantitative_features
df = df[feature_columns + utilization_vars]
# Every retained variable should hold integer values, but the values arrive as
# floats that may be off by a tiny amount. Round to the nearest integer before
# casting: a bare astype(int) truncates toward zero, so 2.9999999 would become 2.
df = df.round().astype(int)
# Rows are kept only when
#   * every response value is non-negative (negative values mark a missing response), and
#   * every feature value is non-negative or -1 (-1 means "inapplicable";
#     anything below -1 is treated as missing).
has_response = (df[utilization_vars] >= 0).all(axis=1)
has_features = (df[feature_columns] >= -1).all(axis=1)
# .copy() gives an independent frame, so the column assignment below is
# well-defined and does not raise SettingWithCopyWarning.
df = df[has_response & has_features].copy()
# For quantitative features, recode -1 (inapplicable) as 0.
df[quantitative_features] = df[quantitative_features].replace(-1, 0)

In [5]:
# One-hot encode every categorical feature, keeping one indicator per level
# (no reference level is dropped).
# dtype=int pins the indicators to 0/1 integers. pandas >= 2.0 defaults to bool,
# and a frame that mixes bool and integer columns converts to an object-dtype
# NumPy array rather than a numeric one.
df = pd.get_dummies(df, columns=categorical_features,
                    prefix=categorical_features, dtype=int)
# Feature matrix X: every column that is not a utilization count.
X_meps = df.drop(columns=utilization_vars).to_numpy()
# Response Y: log(1 + total utilization), where the total sums all utilization
# counts within each row.
Y_meps = np.log(1 + df[utilization_vars].to_numpy().sum(axis=1))
# Write to file: features first, response as the last column.
data_meps = np.c_[X_meps, Y_meps]
np.savetxt('meps_data.txt', data_meps)