In [33]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime
from datetime import date
import pickle
from fancyimpute import KNN, IterativeImputer
import statsmodels.api as sm
from sklearn import __version__ as sklearn_version
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import warnings
# NOTE(review): this suppresses every warning category for the rest of the session,
# including deprecation and convergence warnings from the libraries imported above.
warnings.filterwarnings('ignore')

In [35]:
# File name of the previously trained pricing model, resolved against the working directory.
model_pkl_file = 'encounters_pricing_model.pkl'

# SECURITY NOTE: unpickling can execute arbitrary code embedded in the file, so only
# load artifacts that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes; the bare
# pickle.load(open(...)) form left it open until garbage collection.
with open(model_pkl_file, 'rb') as model_file:
    model = pickle.load(model_file)

In [37]:
# Read 'synthea_knn.csv' from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='synthea_knn.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,ETHNICITY,GENDER,INCOME,AGE,encounters_cost,num_encounters,meds_cost,num_meds,num_procedures,...,Total score [DAST-10],Total score [HARK],Triglycerides,Urea nitrogen [Mass/volume] in Blood,asian,black,hawaiian,native,white,STATE_POPULATION
0,0,0,1,21478,3,8748.8,4.0,0.0,0.0,17.0,...,0.854571,0.0,116.707043,12.746857,0,0,0,0,1,4903185.0
1,1,0,1,89823,42,4659.91,5.0,14.9,2.0,13.0,...,1.0,0.0,120.9,18.7,0,0,0,0,1,4903185.0
2,2,0,1,54400,23,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,134.514315,9.724996,0,0,0,0,1,4903185.0
3,3,0,1,35376,16,2817.62,4.0,0.0,0.0,11.0,...,1.281077,0.0,119.167174,16.79521,0,1,0,0,0,4903185.0
4,4,0,1,93837,34,4301.42,2.0,290.14,2.0,12.0,...,1.0,0.0,116.6,12.084413,0,0,0,0,1,4903185.0


In [39]:
# Remove the 'Unnamed: 0' column (presumably a row index written out by an earlier
# to_csv call -- TODO confirm against the file that produced synthea_knn.csv).
# errors='ignore' keeps this cell re-runnable: `df` is reassigned in place, so a second
# execution would otherwise raise KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [41]:
# Show the (rows, columns) dimensions of df after the column drop.
df.shape

(5000, 62)

In [43]:
# Summary statistics for df, transposed so that each row describes one column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ETHNICITY,5000.0,1.202000e-01,3.252278e-01,0.0,0.000000e+00,0.000,0.000000e+00,1.0
GENDER,5000.0,4.998000e-01,5.000500e-01,0.0,0.000000e+00,0.000,1.000000e+00,1.0
INCOME,5000.0,8.746290e+04,1.318461e+05,34.0,2.681975e+04,54104.500,9.508025e+04,986892.0
AGE,5000.0,3.786860e+01,2.351651e+01,-1.0,1.800000e+01,37.000,5.600000e+01,109.0
encounters_cost,5000.0,1.482596e+04,3.459232e+04,0.0,1.075923e+03,4434.865,1.035666e+04,709730.3
...,...,...,...,...,...,...,...,...
black,5000.0,1.128000e-01,3.163798e-01,0.0,0.000000e+00,0.000,0.000000e+00,1.0
hawaiian,5000.0,1.680000e-02,1.285343e-01,0.0,0.000000e+00,0.000,0.000000e+00,1.0
native,5000.0,1.960000e-02,1.386351e-01,0.0,0.000000e+00,0.000,0.000000e+00,1.0
white,5000.0,7.898000e-01,4.074913e-01,0.0,1.000000e+00,1.000,1.000000e+00,1.0


In [45]:
# Separate the regression target (encounters_cost) from the remaining columns,
# which are used as features.
y = df['encounters_cost']
X = df.drop('encounters_cost', axis=1)

In [47]:
# Standardize every feature column. fit_transform learns the per-column statistics
# from X and applies them in a single call.
# NOTE(review): the scaler is fit on all rows of X, so X_scaled carries information
# from every sample; be careful when using it for held-out evaluation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [50]:
# Refit the loaded model on the full standardized feature matrix and the target.
model.fit(X_scaled, y)

In [56]:
# Estimate out-of-sample R squared with 5-fold cross-validation.
# The scaler sits inside the pipeline and receives the unscaled X, so it is re-fit on
# each training fold only. Cross-validating on X_scaled (standardized using all rows)
# would let statistics from the held-out fold leak into training.
# cross_validate clones the pipeline for every fold, so the already-fitted `model`
# object is not modified here.
cv_pipeline = make_pipeline(StandardScaler(), model)
cv_results = cross_validate(cv_pipeline, X, y, cv=5)
r2_scores = cv_results['test_score']

# Pulling the scores into a local avoids reusing the f-string's own quote character
# inside the replacement field, which is a SyntaxError before Python 3.12.
print('CV results (R squared values):', r2_scores)
print(f'Mean R squared: {r2_scores.mean():.4f}')
print(f'Standard deviation R squared: {r2_scores.std():.4f}')

CV results (R squared values): [0.71843321 0.72302038 0.73377076 0.66367913 0.48537573]
Mean R squared: 0.6649
Standard deviation R squared: 0.0930


In [68]:
# Estimate out-of-sample mean absolute error with 5-fold cross-validation.
# The scaler sits inside the pipeline and receives the unscaled X, so it is re-fit on
# each training fold only. Cross-validating on X_scaled (standardized using all rows)
# would let statistics from the held-out fold leak into training.
cv_pipeline = make_pipeline(StandardScaler(), model)
cv_results = cross_validate(cv_pipeline, X, y, scoring='neg_mean_absolute_error', cv=5)

# scikit-learn reports this metric negated (higher is better); flip the sign once so
# every figure printed below is a plain positive MAE.
mae_scores = -cv_results['test_score']

# Using a local also avoids nesting the f-string's own quote character inside the
# replacement field, which is a SyntaxError before Python 3.12.
print('CV results (MAE):', mae_scores)
print(f'Mean MAE: {mae_scores.mean():.4f}')
print(f'Standard deviation MAE: {mae_scores.std():.4f}')

CV results (MAE): [7597.89578125 6261.1425065  5822.57751187 6235.2418065  6999.87486713]
Mean MAE: 6583.3465
Standard deviation MAE: 633.4932
