In [1]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from scipy.stats import linregress as LR
import pandas as pd
from pandas.util.testing import assert_frame_equal
from sklearn.model_selection import train_test_split
pd.set_option("display.max_columns",200)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from subprocess import call

from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import recall_score

In [49]:
#make a chronic patient table for this subset of conditions

# read in that pre-made table
df=pd.read_csv("../data_files/chronic_patients.csv")
df = df.dropna(axis = 0,how='any')
upsample=True

In [50]:
X = df[['VMONTH', 'AGE', 'SEX', 'RACE', 'ETHNIC', 'RFV1', 'DIAG1', 'DIAG2',
              'DIAG3', 'DIAG4', 'DIAG5', 'MED1', 'PAYTYPE','YEAR','PATWT', 'chronic']]
y = df[['received']]

X_train, X_test, y_train, y_test = train_test_split(X, y)

train = pd.concat([X_train, y_train], axis=1, sort=False)

In [51]:
#upsample the opiate-receivers
df_maj = train[train.received==0]
df_min = train[train.received==1]

if upsample:
    df_minority_upsampled = resample(df_min, 
                                     replace=True,     # sample with replacement
                                     n_samples=len(df_maj))    # to match majority class
                                     #random_state=123) # reproducible results
    # Combine majority class with upsampled minority class
    train = pd.concat([df_maj, df_minority_upsampled])
else: #downsample
    df_majority_downsampled = resample(df_maj, 
                                     replace=False,     # sample with replacement
                                     n_samples=len(df_min))    # to match majority class
                                     #random_state=123) # reproducible results
    # Combine minority class with downsampled majority class
    train = pd.concat([df_majority_downsampled, df_min])

 
# Display new class counts
train.received.value_counts()

1    341028
0    341028
Name: received, dtype: int64

In [52]:
X_train = train[['VMONTH', 'AGE', 'SEX', 'RACE', 'ETHNIC', 'RFV1', 'DIAG1', 'DIAG2',
              'DIAG3', 'DIAG4', 'DIAG5', 'MED1', 'PAYTYPE','YEAR','PATWT', 'chronic']]
y_train = train[['received']]

In [53]:
model = RandomForestClassifier()

In [54]:
model.fit(X_train[['VMONTH', 'AGE', 'SEX', 'RACE', 'ETHNIC', 'RFV1', 'DIAG1', 'DIAG2',
              'DIAG3', 'DIAG4', 'DIAG5', 'MED1', 'PAYTYPE','YEAR']], np.array(y_train),sample_weight=X_train.PATWT.ravel())

  


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [55]:
y_predict = model.predict(X_test[['VMONTH', 'AGE', 'SEX', 'RACE', 'ETHNIC', 'RFV1', 'DIAG1', 'DIAG2',
              'DIAG3', 'DIAG4', 'DIAG5', 'MED1', 'PAYTYPE','YEAR']])#,sample_weight=X_wt2.PATWT.ravel())

In [56]:
accuracy_score(y_test, y_predict,sample_weight=X_test.PATWT.ravel())

0.9679972769850019

In [57]:
#this is with upsampling
pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns=['Predicted Not Received', 'Predicted Received'],
    index=['True Not Received', 'True Received']
)

Unnamed: 0,Predicted Not Received,Predicted Received
True Not Received,113567,111
True Received,3524,981


In [58]:
accuracy_score(y_test, y_predict,sample_weight=X_test.PATWT.ravel())

0.9679972769850019

In [59]:
recall_score(y_test, y_predict)

0.21775804661487236

In [46]:
#this is with downsampling
pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns=['Predicted Not Received', 'Predicted Received'],
    index=['True Not Received', 'True Received']
)

Unnamed: 0,Predicted Not Received,Predicted Received
True Not Received,89109,24523
True Received,913,3638


In [47]:
accuracy_score(y_test, y_predict,sample_weight=X_test.PATWT.ravel())

0.7828548868233771

In [48]:
recall_score(y_test, y_predict)

0.7993847506042628

In [6]:
#this is with no upsampling
pd.DataFrame(
    confusion_matrix(y_test[['received']], y_predict),
    columns=['Predicted Not Received', 'Predicted Received'],
    index=['True Not Received', 'True Received']
)

Unnamed: 0,Predicted Not Received,Predicted Received
True Not Received,113616,14
True Received,3697,856


In [7]:
accuracy_score(y_test, y_predict,sample_weight=X_test.PATWT.ravel())

0.9688130620754904

In [8]:
recall_score(y_test, y_predict)

0.18800790687458818