In [1]:
#Import necessary Python packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
#Load the healthcare claim data
# The CSV is read into a DataFrame (kept as `claim` for labelled inspection) and
# then copied into a plain NumPy array so later cells can slice columns by position.
claims_csv_path = "D:/Study/Business Analytics/LA/2019.3.4/ClaimsData(1).csv"
claim = pd.read_csv(claims_csv_path)
claim_data = np.array(claim)

In [5]:
# Preview the first rows of the DataFrame to check the column layout.
claim.head()

Unnamed: 0,age,alzheimers,arthritis,cancer,copd,depression,diabetes,heart.failure,ihd,kidney,osteoporosis,stroke,reimbursement2008,bucket2008,reimbursement2009,bucket2009
0,85,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
1,59,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
2,67,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
3,52,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
4,67,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [6]:
# Report (rows, columns) of the raw array before the target is split off.
print(claim_data.shape)

(458005, 16)


In [7]:
##For the covariates, we should remove the 2009 reimbursement amount information
# X keeps every column except the last two (reimbursement2009 and bucket2009),
# so its last column is bucket2008; y is the final column, bucket2009.
X = claim_data[:,0:-2]
y = claim_data[:,-1]

#### The slicing with indices follows the following principle:

For example, y = claim_data[:,-1] means we want all the rows and the last column.

And X = claim_data[:,0:-2] means we want all the rows and the columns from the first up to and including the third-to-last (i.e. bucket2008); the stop index -2 is exclusive, so the last two columns are left out.

In [8]:
# Confirm the covariate matrix and the target vector have the same number of rows.
print(X.shape)
print(y.shape)

(458005, 14)
(458005,)


In [9]:
## Split the data into training (75%) and testing (25%) data sets
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing; the fixed seed keeps the partition repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=88)
X_train, X_test, y_train, y_test = split_parts

In [10]:
##Build the classification tree 
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score

#### confusion_matrix(A, B) works the same as the table(A,B) in R

In [12]:
from sklearn.metrics import confusion_matrix
## Baseline model: Use the cost bucket of 2008 directly to predict the outcome of 2009
# Column 13 of X is bucket2008 (the last covariate column kept when X was sliced).
baseline_predict = X_test[:, 13]
CM_baseline = confusion_matrix(y_test, baseline_predict)
print(CM_baseline)
print(accuracy_score(y_test, baseline_predict))

[[68887  4888  2073   919   121]
 [ 9930  6735  2936  1806   360]
 [ 4366  2861  1710  1061   212]
 [ 1719  1224   854   944   242]
 [  188   117   108   180    61]]
0.6841539885766188


In [13]:
#Input the penalty matrix
# Laid out to be multiplied element-wise with confusion_matrix(y_true, y_pred):
# rows follow the actual bucket, columns the predicted bucket. The diagonal is
# zero, and each entry below the diagonal is twice its mirror image above it.
penalty = np.array([
    [0, 1, 2, 3, 4],
    [2, 0, 1, 2, 3],
    [4, 2, 0, 1, 2],
    [6, 4, 2, 0, 1],
    [8, 6, 4, 2, 0],
])

In [14]:
# Display the penalty matrix to verify it was entered correctly.
print(penalty)

[[0 1 2 3 4]
 [2 0 1 2 3]
 [4 2 0 1 2]
 [6 4 2 0 1]
 [8 6 4 2 0]]


In [40]:
#Penalty Error for the baseline model
# Weight every confusion-matrix cell by its penalty, then average over all test cases.
(CM_baseline * penalty).sum() / CM_baseline.sum()

0.7387818553387714

### Use the DecisionTreeClassifier(max_depth=x) as the model

In [15]:
#Build a classification tree to fit the training set. The maximum depth of the tree is set to be 5.
# NOTE(review): a later saved output in this notebook reports max_depth=4, so the
# saved outputs appear to come from runs with different depth settings — confirm
# the intended value.
tree_D2Hawkeye = DecisionTreeClassifier(max_depth=5)

### fit the model

In [16]:
# Train the tree on the training split; the fitted estimator is the cell's displayed value.
tree_D2Hawkeye.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## First, open your Anaconda Prompt and type: conda install graphviz

In [19]:
##Print out the tree
import graphviz

# Labels for the 14 covariate columns, in the same order as the columns of X.
covariate_names = [
    'age', 'alzheimers', 'arthritis', 'cancer', 'copd', 'depression',
    'diabetes', 'heart.failure', 'ihd', 'kidney', 'osteoporosis', 'stroke',
    'reimbursement2008', 'bucket2008',
]
# Labels for the five cost buckets.
bucket_labels = ['1', '2', '3', '4', '5']

dot_data = export_graphviz(
    tree_D2Hawkeye,
    feature_names=covariate_names,
    class_names=bucket_labels,
)
tree_graph = graphviz.Source(dot_data)
# render() writes the DOT source to "D2Hawkeye" and calls the Graphviz `dot`
# executable to produce the rendered file, so Graphviz itself must be installed
# and runnable, not just the Python package.
tree_graph.render("D2Hawkeye")




CalledProcessError: Command '['dot.bat', '-Tpdf', '-O', 'D2Hawkeye']' returned non-zero exit status 1.

In [19]:
#Make predictions on the testing data set
# y_predict holds one predicted bucket per test row; later cells overwrite this name.
y_predict = tree_D2Hawkeye.predict(X_test)

In [20]:
#create the confusion matrix on the out-of-sample test
# Called as confusion_matrix(y_true, y_pred): rows are actual buckets, columns are predicted buckets.
CM_tree = confusion_matrix(y_test, y_predict)
print(CM_tree)

[[72831  4057     0     0     0]
 [13017  8750     0     0     0]
 [ 5705  4505     0     0     0]
 [ 2234  2749     0     0     0]
 [  241   413     0     0     0]]


In [21]:
#The testing overall accuracy
# Fraction of test rows whose predicted bucket equals the actual bucket.
print(accuracy_score(y_test, y_predict))

0.7124853714345601


In [22]:
#Penalty Error for the tree model
# Same penalty-weighted average as for the baseline, applied to the tree's confusion matrix.
(CM_tree * penalty).sum() / CM_tree.sum()

0.7923617054723935

### GridSearchCV as we've talked about in previous notebooks

In [21]:
#Cross-validated Parameter tuning for the maximum depth of the tree
from sklearn.model_selection import GridSearchCV

# Candidate depths 1..6, each evaluated with 10-fold cross-validation on the training set.
param_grid = {'max_depth': range(1, 7)}
depth_search_tree = DecisionTreeClassifier(random_state=0)
grid = GridSearchCV(depth_search_tree, param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)

print(grid.best_params_)
print('Best Overal Accuracy: %.2f' %grid.best_score_)

{'max_depth': 6}
Best Overal Accuracy: 0.71


## Then we use a random forest classifier

The result is two lists (training accuracy and testing accuracy); each element is the score obtained with a different number of estimators.

In [22]:
## Fit a random-forest model with different numbers of trees (estimators) in the forest
from sklearn.ensemble import RandomForestClassifier

# Accuracy on the training and testing splits, one entry per forest size.
train_scores = []
test_scores = []

# A fixed seed makes the bootstrap samples and feature sub-sampling repeatable,
# so the score lists below come out the same on every Restart & Run All.
rf = RandomForestClassifier(random_state=0)
estimator_range = range(1, 100, 5)  ##Number of trees in the forest

for n_estimators in estimator_range:
    # Change the forest size through the public parameter API, then refit.
    rf.set_params(n_estimators=n_estimators)
    rf.fit(X_train, y_train)
    train_scores.append(rf.score(X_train, y_train))
    test_scores.append(rf.score(X_test, y_test))

print(train_scores)
print('--------------------------------------')
print(test_scores)

  from numpy.core.umath_tests import inner1d


[0.8449707862813425, 0.9270020931403802, 0.9516132319077272, 0.9575054657455685, 0.9616626346785909, 0.9630425352908126, 0.9640556268795324, 0.9645359720293564, 0.9650687184682522, 0.9652288335181934, 0.9654326163090279, 0.965490839963552, 0.9656014649071478, 0.9656131096380527, 0.9656800668407554, 0.9657062674852912, 0.9657150010334699, 0.9657528464089106, 0.9657441128607319, 0.9657615799570891]
--------------------------------------
[0.6302946673420552, 0.6633071911407661, 0.6612897591308449, 0.6644600094321497, 0.6639796684774065, 0.6654905591168713, 0.6651062863530768, 0.6647831478926133, 0.6654818256449669, 0.6661193690939896, 0.6657612967459083, 0.665691428970673, 0.6656652285549597, 0.6655953607797244, 0.6659971004873277, 0.6664075736668356, 0.6671586522506157, 0.6664163071387399, 0.6663988401949311, 0.6670800510034759]


In [23]:
## The importance of different features for a random forest of 100 trees

# Seeded so the importances are reproducible between runs.
rf2 = RandomForestClassifier(n_estimators=100, random_state=0)
rf2.fit(X_train, y_train)
# One importance per covariate, in the same order as the columns of X.
rf2.feature_importances_

array([0.28506076, 0.01509037, 0.01455796, 0.00911884, 0.01251155,
       0.01517363, 0.03491882, 0.02140716, 0.03421163, 0.01537969,
       0.01408039, 0.00904827, 0.46650257, 0.05293836])

## Scaling is similar to fitting a model, the general method is, for example:

### scaler = StandardScaler()

### scaler.fit(Xb_train)

### Xb_train_scaled = scaler.transform(Xb_train)

### You may also use .fit_transform() for convenience

In [49]:
#Load the boston housing data set and necessary models

from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler ##The scaling tool

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release to run.
boston = load_boston()
Xb, yb = boston.data, boston.target
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    Xb, yb, random_state=0, test_size=0.25)

# Learn the scaling from the training rows only, then apply it to both splits.
scaler = StandardScaler()
Xb_train_scaled = scaler.fit_transform(Xb_train)
Xb_test_scaled = scaler.transform(Xb_test)

# Ridge regression on the standardised features; the cell displays the test-set score.
ridge = Ridge().fit(Xb_train_scaled, yb_train)
ridge.score(Xb_test_scaled, yb_test)

0.6345884564889053

## make_pipeline() chains the steps you pass into it: fitting or scoring the pipeline runs every step in order

In [50]:
#Combine transformation, model fit, and prediction together using pipeline
from sklearn.pipeline import make_pipeline

# The pipeline is given the unscaled data and applies the scaler itself before the ridge model.
pipe = make_pipeline(StandardScaler(), Ridge()).fit(Xb_train, yb_train)
pipe.score(Xb_test, yb_test)

0.6345884564889053

In [26]:
#Pipeline meets grid-search cross validation to find the best number of neighbors for k-nearest neighbors regressor

from sklearn.neighbors import KNeighborsRegressor

knn_pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())
# make_pipeline names each step after its lower-cased class, so the regressor's
# parameter is addressed as "<step name>__<parameter name>".
param_grid = {'kneighborsregressor__n_neighbors': range(1, 10)}
# Note: this rebinds `param_grid` and `grid`, which were used earlier for the tree depth search.
grid = GridSearchCV(knn_pipe, param_grid, cv=10)
grid.fit(Xb_train, yb_train)

print(grid.best_params_)
print(grid.score(Xb_test, yb_test))


{'kneighborsregressor__n_neighbors': 7}
0.5999825126971097




In [52]:
##Include the polynomial features when fitting the model
from sklearn.preprocessing import PolynomialFeatures

# Expand the already-standardised features to degree-2 terms, then fit ridge
# regression on the expansion; the cell displays the test-set score.
poly_pipe = make_pipeline(PolynomialFeatures(degree=2), Ridge()).fit(Xb_train_scaled, yb_train)
poly_pipe.score(Xb_test_scaled, yb_test)

0.6696430114132679

In [53]:
##Imbalancedness of the D2Hawkeye data set
# np.bincount counts occurrences of each integer starting from 0, so the first
# entry is the count for label 0 and the remaining entries are the per-bucket counts.
print(np.bincount(y_train))
print(np.bincount(y_test))

[     0 230556  65332  30766  14860   1989]
[    0 76888 21767 10210  4983   654]


In [54]:
##Undersampling

from imblearn.under_sampling import RandomUnderSampler
# Seeded so the same rows are drawn on every run; without a seed the subsample
# (and every result derived from it) changes between runs.
# NOTE(review): newer imbalanced-learn releases name this method fit_resample
# instead of fit_sample — switch when upgrading the environment.
rus = RandomUnderSampler(replacement=False, random_state=0)
X_train_subsample, y_train_subsample = rus.fit_sample(
    X_train, y_train)
# Compare the sizes before and after, and show the per-class counts of the subsample.
print(X_train.shape)
print(X_train_subsample.shape)
print(np.bincount(y_train_subsample))

(343503, 14)
(9945, 14)
[   0 1989 1989 1989 1989 1989]


In [30]:
##Fit a CART model using the under sampled data
# This refits the same tree_D2Hawkeye object, replacing the model trained on the full training set.
tree_D2Hawkeye.fit(X_train_subsample, y_train_subsample)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [31]:
#Examining the effect of undersampling

# Evaluate on the original (unsampled) test split; y_predict is overwritten here.
y_predict = tree_D2Hawkeye.predict(X_test)
CM_tree_subsample = confusion_matrix(y_test, y_predict)
print(CM_tree_subsample)
print(accuracy_score(y_test, y_predict))

[[49015 21748  4346   777  1002]
 [ 2284 12050  4216   877  2340]
 [ 1182  5080  2085   428  1435]
 [  451  1971  1026   199  1336]
 [   43   212   110    21   268]]
0.5555972821435433


In [55]:
##Oversampling

from imblearn.over_sampling import RandomOverSampler
# Seeded so the duplicated rows are the same on every run.
# NOTE(review): newer imbalanced-learn releases name this method fit_resample
# instead of fit_sample — switch when upgrading the environment.
ros = RandomOverSampler(random_state=0)
X_train_oversample, y_train_oversample = ros.fit_sample(X_train, y_train)
# Compare the sizes before and after, and show the per-class counts after resampling.
print(X_train.shape)
print(X_train_oversample.shape)
print(np.bincount(y_train_oversample))

(343503, 14)
(1152780, 14)
[     0 230556 230556 230556 230556 230556]


In [33]:
##Fit the decision tree model with the over-sampled data and examine its effectiveness on the testing data

# Refit the shared tree on the over-sampled training data, then predict the original test split.
y_predict = tree_D2Hawkeye.fit(X_train_oversample, y_train_oversample).predict(X_test)
CM_tree_oversample = confusion_matrix(y_test, y_predict)
print(CM_tree_oversample)
print(accuracy_score(y_test, y_predict))

[[61304 11150  2720   262  1452]
 [ 5421  8980  4125   471  2770]
 [ 2599  3708  2028   253  1622]
 [ 1003  1439  1001   120  1420]
 [  103   156   106    11   278]]
0.6350107421704424


In [34]:
##SMOTE sampling method

from imblearn.over_sampling import SMOTE
# Seeded so the synthetic samples are the same on every run.
smt = SMOTE(random_state=0)

##Standardize the covariates for the training and testing data
# Note: this refits the existing `scaler` object on the claims training data,
# replacing the statistics it learned from the Boston data.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# NOTE(review): newer imbalanced-learn releases name this method fit_resample
# instead of fit_sample — switch when upgrading the environment.
X_train_SMOTE, y_train_SMOTE = smt.fit_sample(X_train_scaled, y_train)
# Compare the sizes before and after, and show the per-class counts after resampling.
print(X_train_scaled.shape)
print(X_train_SMOTE.shape)
print(np.bincount(y_train_SMOTE))



(343503, 14)
(1152780, 14)
[     0 230556 230556 230556 230556 230556]


In [35]:
#Train the decision tree model on the training set with SMOTE and test it over the testing set
# The SMOTE data was built from standardised covariates, so the test covariates
# must be the standardised ones as well.
y_predict = tree_D2Hawkeye.fit(X_train_SMOTE, y_train_SMOTE).predict(X_test_scaled)
CM_tree_SMOTE = confusion_matrix(y_test, y_predict)
print(CM_tree_SMOTE)
print(accuracy_score(y_test, y_predict))

[[56956 19256     0     0   676]
 [ 3918 16190     0     0  1659]
 [ 1956  7216     0     0  1038]
 [  732  3182     0     0  1069]
 [   78   351     0     0   225]]
0.6407835670992647
