In [16]:
import pandas as pd
import numpy as np
import pymssql
import yaml
from yaml import Loader

In [17]:
# Load connection settings from secrets.yaml into `configs`.
with open('secrets.yaml', 'r') as f:
    # safe_load only builds plain Python data (dicts, lists, strings, numbers).
    # The full Loader used previously can instantiate arbitrary Python objects
    # from tagged YAML, which a credentials file never needs.
    configs = yaml.safe_load(f)

In [21]:

# Connection settings come from the `data` section of the loaded config.
server = configs['data']['server']
user = configs['data']['user']
password = configs['data']['password']
database = configs['data']['database']

# define table strings
efficiency_table = 'dbo.EfficiencyScores'
safety_table = 'dbo.SafetyScores'
outcomes_table = 'dbo.ClinicalOutcomeScores'
community_table = 'dbo.EngagementScores'
payment_table = 'dbo.PaymentAndValueOfCareVals'
state_table = 'dbo.States'
location_table = 'dbo.Locations'


def _read_table(conn, table, index_col):
    """Return every row of `table` as a DataFrame indexed by `index_col`.

    Parameters
    ----------
    conn : open database connection passed straight to ``pd.read_sql``.
    table : str
        Table name; only the hard-coded constants defined in this cell are
        passed, so interpolating it into the statement is not exposed to
        untrusted input.
    index_col : str
        Column to use as the DataFrame index.
    """
    return pd.read_sql(f'SELECT * FROM {table}', conn, index_col=index_col)


# connect to database with above credentials
# Errors are deliberately allowed to propagate: printing and continuing would
# only surface later as a confusing NameError on the first missing frame.
conn = pymssql.connect(server, user, password, database)
try:
    efficiency = _read_table(conn, efficiency_table, 'Efficiency_ID')
    safety = _read_table(conn, safety_table, 'Safety_ID')
    outcomes = _read_table(conn, outcomes_table, 'ClinicalOutcome_ID')
    community = _read_table(conn, community_table, 'EngagementScore_ID')
    payment = _read_table(conn, payment_table, 'Payment_ID')
    state = _read_table(conn, state_table, 'State_ID')
    location = _read_table(conn, location_table, 'Facility_ID')
finally:
    # Always release the connection, even if one of the reads fails.
    conn.close()

payment.head()



Unnamed: 0_level_0,Facility_ID,Payment,Lower_Estimate,Higher_Estimate,Payment_Category,Value_Of_Care_Category
Payment_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,20018,13461.0,11689.5,15405.5,-0.5,-0.5
2,30064,20773.25,18978.25,22704.0,0.25,0.0
3,100140,18872.6667,17511.3333,20323.3333,-0.333333,0.0
4,100320,18230.0,16388.5,20246.5,0.0,0.0
5,110071,18356.0,15727.0,20892.0,0.0,0.0


In [28]:
# Combine the four score tables facility-by-facility (inner joins keep only
# facilities present in every table), then attach location and state details
# with left joins so facilities lacking them are still retained.
facility_scores = (
    efficiency
    .merge(safety, on='Facility_ID', how='inner')
    .merge(outcomes, on='Facility_ID', how='inner')
    .merge(community, on='Facility_ID', how='inner')
    .merge(location, on='Facility_ID', how='left')
    .merge(state, on='State_ID', how='left')
)

# Payment rows go on the left, so their columns lead the combined frame.
final_join = payment.merge(facility_scores, on='Facility_ID', how='inner')
final_join.head()

Unnamed: 0,Facility_ID,Payment,Lower_Estimate,Higher_Estimate,Payment_Category,Value_Of_Care_Category,MSPB_Baseline,MSPB_Performance,HAI_Baseline,HAI_Performance,...,Discharge_Info_Perform,Overall_Rating_Baseline,Overall_Rating_Perform,Facility_Name,Location_City,Location_State,Location_Zip_Code,Location_County,State_ID,State_Name
0,30064,20773.25,18978.25,22704.0,0.25,0,0.968294,0.975835,0.617,0.733,...,85.326103,65.936401,67.765297,BANNER - UNIVERSITY MEDICAL CENTER TUCSON CAMPUS,TUCSON,AZ,85724,PIMA,4.0,Arizona
1,100140,18872.6667,17511.3333,20323.3333,-0.333333,0,0.963651,0.959826,0.153,0.313,...,87.005402,78.330299,78.791901,BAPTIST MEDICAL CENTER - NASSAU,FERNANDINA BEACH,FL,32034,NASSAU,12.0,Florida
2,100320,18230.0,16388.5,20246.5,0.0,0,0.961516,0.896499,0.553,0.238,...,79.810799,64.692001,63.960098,POINCIANA MEDICAL CENTER,KISSIMMEE,FL,34758,OSCEOLA,12.0,Florida
3,140082,23510.5,21425.0,25796.5,0.75,0,1.144005,1.063702,0.714,1.036,...,82.674301,58.5882,60.445801,LOUIS A WEISS MEMORIAL HOSPITAL,CHICAGO,IL,60640,COOK,17.0,Illinois
4,140124,20733.0,18230.6667,23381.0,-0.333333,0,0.913778,0.98152,0.867,0.733,...,80.776604,66.25,64.392799,JOHN H STROGER JR HOSPITAL,CHICAGO,IL,60612,COOK,17.0,Illinois


In [29]:
# Remove the facility identifier and the descriptive name/location columns;
# every other column (including State_ID) is carried forward in `dropped`.
identifier_columns = [
    'Facility_ID',
    'Facility_Name',
    'Location_City',
    'Location_State',
    'Location_Zip_Code',
    'Location_County',
    'State_Name',
]
dropped = final_join.drop(columns=identifier_columns)

In [30]:
# Label-encode the categorical columns: each distinct value is replaced by
# its integer category code.
for category_column in ('Payment_Category', 'Value_Of_Care_Category'):
    as_category = dropped[category_column].astype('category')
    dropped[category_column] = as_category.cat.codes

In [37]:
# Display the dtype of every column remaining in `dropped`.
dropped.dtypes

Payment                     float64
Lower_Estimate              float64
Higher_Estimate             float64
Payment_Category               int8
Value_Of_Care_Category         int8
MSPB_Baseline               float64
MSPB_Performance            float64
HAI_Baseline                float64
HAI_Performance             float64
MORT_AMI_Baseline           float64
MORT_AMI_Perform            float64
MORT_HF_Baseline            float64
MORT_HF_Perform             float64
MORT_PN_Baseline            float64
MORT_PN_Perform             float64
COMP_HIP_KNEE_Baseline      float64
COMP_HIP_KNEE_Perform       float64
Comm_Nurses_Baseline        float64
Comm_Nurses_Perform         float64
Comm_Doctors_Baseline       float64
Comm_Doctors_Perform        float64
Comm_Hospital_Baseline      float64
Comm_Hospital_Perform       float64
Care_Transition_Baseline    float64
Care_Transition_Perform     float64
Comm_Medicines_Baseline     float64
Comm_Medicines_Perform      float64
Clean_Quiet_Baseline        

#### Payment Estimates Model

In [32]:
# ml imports
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import joblib

In [33]:
# Target is the payment; the only features are its lower/higher estimates.
y = dropped['Payment']
X = dropped[['Lower_Estimate', 'Higher_Estimate']]

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression and print R^2 on the held-out rows.
lnrg = LinearRegression().fit(X_train, y_train)
y_preds = lnrg.predict(X_test)
print(r2_score(y_test, y_preds))

0.9998155914442716


In [34]:
# saving/exporting model
# NOTE: joblib.dump returns the list of file paths it wrote, so
# `estimates_model` holds those paths rather than the fitted model itself.
estimates_model = joblib.dump(lnrg, 'estimates_model.model')

#### Linear Regression Payment Model

In [40]:
# Target is the payment; features are every remaining column except the
# payment itself and its two estimate bounds.
y = dropped['Payment']
X = dropped.drop(columns=['Payment', 'Higher_Estimate', 'Lower_Estimate'])

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [41]:
# Fit a linear regression on the current training split and print R^2 on the
# held-out rows. This rebinds `lnrg` to the newly fitted model.
lnrg = LinearRegression().fit(X_train, y_train)
y_preds = lnrg.predict(X_test)
print(r2_score(y_test, y_preds))

0.4863452935040169


#### Lasso Regression Payment Model

In [42]:
from sklearn.linear_model import Lasso

# Candidate regularisation strengths, one per power of ten.
la_params = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
}

# 5-fold cross-validated search over alpha, scored by R^2.
larg = Lasso()
la_grid = GridSearchCV(estimator=larg, param_grid=la_params, scoring='r2', cv=5)
la_grid.fit(X_train, y_train)

# Print R^2 of the grid search's predictions on the held-out rows.
y_preds = la_grid.predict(X_test)
print(r2_score(y_test, y_preds))

0.4873230031637571


#### Random Forest Regressor Model

In [43]:
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter grid explored by the cross-validated search below.
rf_params = {
    "n_estimators": [10, 20, 30],
    "max_features": ['sqrt', 'log2'],
    "min_samples_split": [2, 4, 8],
    "bootstrap": [True, False],
}

# Seed the forest so the selected parameters and the printed score are
# reproducible across runs; the seed matches the one used for the
# train/test split in this notebook.
rfrg = RandomForestRegressor(random_state=42)

# 5-fold cross-validated search, scored by R^2.
rf_grid = GridSearchCV(rfrg, rf_params, scoring='r2', cv=5)
rf_grid.fit(X_train, y_train)

# Print R^2 of the grid search's predictions on the held-out rows.
y_preds = rf_grid.predict(X_test)
print(r2_score(y_test, y_preds))

0.7061155998042232


#### XGBoost Model

In [46]:
from xgboost import XGBRegressor

# Hyper-parameter grid explored by the cross-validated search below.
xg_params = {
    "learning_rate": (0.05, 0.10, 0.15),
    "max_depth": [3, 4, 5, 6, 8],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2],
    "colsample_bytree": [0.3, 0.4],
}

# 5-fold cross-validated search, scored by R^2.
xgrg = XGBRegressor()
xg_grid = GridSearchCV(estimator=xgrg, param_grid=xg_params, scoring='r2', cv=5)
xg_grid.fit(X_train, y_train)

# Print R^2 of the grid search's predictions on the held-out rows.
y_preds = xg_grid.predict(X_test)
print(r2_score(y_test, y_preds))

0.7993413586203597


In [48]:
# Display the parameter combination selected by the XGBoost grid search.
xg_grid.best_params_

{'colsample_bytree': 0.4,
 'gamma': 0.0,
 'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 7}

In [None]:
# shippin' it
# The original call passed an empty filename, which cannot be opened for
# writing. The name below follows the '<name>.model' convention used for the
# estimates model; adjust it if a different path is expected downstream.
joblib.dump(xg_grid, 'payment_model.model')