In [2]:
import pandas as pd
import numpy as np
import src.scripts as src
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, \
                            recall_score, accuracy_score, precision_score, confusion_matrix, \
                            f1_score, roc_auc_score

In [34]:
def run_random_forest(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, stratify=y)
    
    clf = RandomForestClassifier(n_jobs=-1)
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_test, y_pred

def plot_roc(y_test, y_pred):
    fpr, tpr, threshold = roc_curve(y_test, y_pred)
    roc_auc = auc(fpr, tpr)

    # method I: plt
    plt.title('Random Forest Classification')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    
def random_sample(X, y, size=1000000, minority_ratio=0.3):
    # add y to X
    X['label'] = y
    
    # Isolate minority & majority class
    X_minority = X[y].reset_index(drop=True)
    X_majority = X[~y].reset_index(drop=True)

    minority_idx = np.random.randint(low=0,high=minority.shape[0], size=round(size*minority_ratio))
    majority_idx = np.random.randint(low=0,high=majority.shape[0], size=round(size*(1-minority_ratio)))
    
    over_samp = X_minority.iloc[minority_idx]
    under_samp = X_majority.iloc[majority_idx]
    
    X_adj = pd.concat([over_samp, under_samp], axis=0).reset_index(drop=True)
    y_adj = X_adj.pop('label')
    
    return X_adj, y_adj

def grid_search_wrapper(refit_score='precision_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,
                           cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train.values, y_train.values)

    # make the predictions
    y_pred = grid_search.predict(X_test.values)

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    return grid_search

def conf_mat(y_test, y_pred):
    y_test = pd.Series(y_test, name='Actual')
    y_pred = pd.Series(y_pred, name='Predicted')
    return pd.crosstab(y_test, y_pred, ).T

In [4]:
# import df vertical
df_vertical = src.read_from_efs('df_final_vertical.csv')
src.print_info(df_vertical, 'df_vertical')
df_vertical.tail()

  mask |= (ar1 == a)


df_vertical pd.DataFrame shape: (35062693, 10)
df_vertical pd.DataFrame size: 2840.08 Mb


Unnamed: 0,npi,generic_name,bene_count,total_claim_count,total_30_day_fill_count,total_day_supply,total_drug_cost,cost_per_bene,cost_per_claim,label
35062688,1992999882,LAMOTRIGINE,10.0,16,16.0,367,108.87,10.887,6.804375,False
35062689,1992999882,QUETIAPINE FUMARATE,10.0,15,15.0,426,360.38,36.038,24.025333,False
35062690,1992999882,RISPERIDONE,10.0,13,15.0,432,172.3,17.23,13.253846,False
35062691,1992999882,SERTRALINE HCL,13.0,23,23.0,640,110.68,8.513846,4.812174,False
35062692,1992999882,TRAZODONE HCL,16.0,21,23.0,632,209.33,13.083125,9.968095,False


In [3]:
# import df excluded_npi
excluded_npi = src.read_from_efs('excluded_npi_list.csv')
src.print_info(excluded_npi, 'excluded_npi')
excluded_npi.tail()

excluded_npi pd.DataFrame shape: (1148, 4)
excluded_npi pd.DataFrame size: 0.05 Mb


Unnamed: 0,NPI,EXCLTYPE,EXCLDATE,REINDATE
1143,1285742619,1128a1,20160320,0
1144,1891879003,1128b4,20160218,0
1145,1871523852,1128a4,20161020,0
1146,1558366815,1128a1,20160320,0
1147,1174561708,1128b4,20161220,0


In [23]:
# Build X for a certain feature
feature = 'total_drug_cost'

features = ['bene_count', 
            'total_claim_count',
            'total_30_day_fill_count',
            'total_day_supply',
            'total_drug_cost',
            'cost_per_bene',
            'cost_per_claim']


X = df_vertical.pivot(index='npi', columns='generic_name', values=feature).reset_index().fillna(0)
src.print_info(X, 'X')
X.tail()



X pd.DataFrame shape: (1043196, 952)
X pd.DataFrame size: 7944.98 Mb


generic_name,npi,0.9 % SODIUM CHLORIDE,ABACAVIR SULFATE,ABACAVIR SULFATE/LAMIVUDINE,ABACAVIR/DOLUTEGRAVIR/LAMIVUDI,ABACAVIR/LAMIVUDINE/ZIDOVUDINE,ABIRATERONE ACETATE,ACAMPROSATE CALCIUM,ACARBOSE,ACEBUTOLOL HCL,...,ZAFIRLUKAST,ZALEPLON,ZIDOVUDINE,ZIPRASIDONE HCL,ZIPRASIDONE MESYLATE,ZOLEDRONIC ACID,ZOLMITRIPTAN,ZOLPIDEM TARTRATE,ZONISAMIDE,ZOSTER VACCINE LIVE/PF
1043191,1992999817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1043192,1992999825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1043193,1992999866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1043194,1992999874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1043195,1992999882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [1]:
# Build y
y = X['npi'].isin(excluded_npi['NPI'])

NameError: name 'X' is not defined

In [48]:
del df_vertical

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, stratify=y)  
clf = RandomForestClassifier(n_jobs=-1) 
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

plot_roc(y_test, y_pred)

In [27]:
conf_mat(y_test, y_pred)

Actual,False,True
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
False,41833,42


In [45]:
# data w/ sampling
X_adj, y_adj = random_sample(X, y, size=1000000, minority_ratio=.003)

X_train, X_test, y_train, y_test = train_test_split(X_adj, y_adj, test_size=.2, stratify=y_adj)  
clf = RandomForestClassifier(n_jobs=-1) 
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

plot_roc(y_test, y_pred)

MemoryError: 

In [None]:
conf_mat(y_test, y_pred)

In [None]:
# perfrom grid search
param_grid = {
    'min_samples_split': [3, 5, 10], 
    'n_estimators' : [100, 300],
    'max_depth': [3, 5, 15, 25],
    'max_features': [3, 5, 10, 20]
}

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score),
    'roc_auc_score': make_scorer(roc_auc_score)
}

grid_search_wrapper(refit_score='recall_score')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
