In [1]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from scipy.stats import linregress as LR
import pandas as pd
from pandas.util.testing import assert_frame_equal
from sklearn.model_selection import train_test_split
pd.set_option("display.max_columns",200)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from subprocess import call

from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.metrics import recall_score

In [2]:
#make a chronic patient table for this subset of conditions

# read in that pre-made table
df=pd.read_csv("../data_files/chronic_patients.csv")
df = df.dropna(axis = 0,how='any')
upsample=True

In [3]:
X = df[['VMONTH', 'AGE', 'SEX', 'RACE', 'ETHNIC', 'RFV1', 'DIAG1', 'DIAG2',
              'DIAG3', 'DIAG4', 'DIAG5', 'MED1', 'PAYTYPE','YEAR','PATWT', 'chronic','pain']]
y = df[['received']]

X_train, X_test, y_train, y_test = train_test_split(X, y)

train = pd.concat([X_train, y_train], axis=1, sort=False)

In [4]:
#upsample the opiate-receivers
df_maj = train[train.received==0]
df_min = train[train.received==1]

if upsample:
    df_minority_upsampled = resample(df_min, 
                                     replace=True,     # sample with replacement
                                     n_samples=len(df_maj))    # to match majority class
                                     #random_state=123) # reproducible results
    # Combine majority class with upsampled minority class
    train = pd.concat([df_maj, df_minority_upsampled])
else: #downsample
    df_majority_downsampled = resample(df_maj, 
                                     replace=False,     # sample with replacement
                                     n_samples=len(df_min))    # to match majority class
                                     #random_state=123) # reproducible results
    # Combine minority class with downsampled majority class
    train = pd.concat([df_majority_downsampled, df_min])

 
# Display new class counts
train.received.value_counts()

1    340972
0    340972
Name: received, dtype: int64

In [5]:
# upsample the pain-diagnosed
df_maj = train[train.chronic==0]
df_min = train[train.chronic==1]
df_minority_upsampled = resample(df_min, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_maj),    # to match majority class
                                 random_state=123) # reproducible results
# Combine majority class with upsampled minority class
train = pd.concat([df_maj, df_minority_upsampled])
 
# Display new class counts
train.pain.value_counts()

0    725436
1    419296
Name: pain, dtype: int64

In [6]:
X_train = train[['VMONTH', 'AGE', 'SEX', 'RACE', 'ETHNIC', 'RFV1', 'DIAG1', 'DIAG2',
              'DIAG3', 'DIAG4', 'DIAG5', 'MED1', 'PAYTYPE','YEAR','PATWT', 'chronic']]
y_train = train[['received']]

In [7]:
model = LogisticRegression()

In [8]:
model.fit(X_train[['VMONTH', 'AGE', 'SEX', 'RACE', 'ETHNIC', 'RFV1', 'DIAG1', 'DIAG2',
              'DIAG3', 'DIAG4', 'DIAG5', 'MED1', 'PAYTYPE','YEAR', 'chronic']], y_train[['received']],sample_weight=X_train.PATWT.ravel())

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
y_predict = model.predict(X_test[['VMONTH', 'AGE', 'SEX', 'RACE', 'ETHNIC', 'RFV1', 'DIAG1', 'DIAG2',
              'DIAG3', 'DIAG4', 'DIAG5', 'MED1', 'PAYTYPE','YEAR', 'chronic']])#,sample_weight=X_wt2.PATWT.ravel())

In [10]:
accuracy_score(y_test, y_predict,sample_weight=X_test.PATWT.ravel())

0.3639760035749859

In [11]:
pd.DataFrame(
    confusion_matrix(y_test[['received']], y_predict),
    columns=['Predicted Not Received', 'Predicted Received'],
    index=['True Not Received', 'True Received']
)

Unnamed: 0,Predicted Not Received,Predicted Received
True Not Received,41090,72644
True Received,995,3454


In [12]:
recall_score(y_test, y_predict)

0.7763542369071702