# Data Preparation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

pd.set_option('display.max_rows', None)

# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# Import dataset
# data = pd.read_csv('/content/drive/MyDrive/Erdos Bootcamp May 2022/Project/CoverMyMeds Project/pharmacy_tx.csv')
data = pd.read_csv('C:/Users/diego/Google Drive/Erdos Bootcamp May 2022/Project/CoverMyMeds Project/data.csv')

In [3]:
# Fill all NaN with NA. They are all located in PCN and GROUP columns. Keep NA as a different category.
data = data.fillna('NA')

In [4]:
# Set type of all features to save memory
data = data.astype({'tx_date':'datetime64[ns]', 
                    'month_name': 'category', 
                    'day_name': 'category', 
                    'pharmacy': 'category', 
                    'bin_pcn_group': 'category', 
                    'bin': 'category', 
                    'pcn': 'category', 
                    'group': 'category',
                    'drug_brand': 'category',
                    'drug_name': 'category',
                    'diag_letter': 'category',
                    'diag_num1': 'category',
                    'diag_num2': 'category',
                    'patient_pay': 'float32'
                   })

In [5]:
data = data.drop(columns=['patient_pay'])
data = data.drop_duplicates()

# Classification Model Building

## Import Libraries

In [6]:
import warnings
warnings.simplefilter(action='ignore')

from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, roc_curve, precision_recall_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression

from lightgbm import LGBMClassifier

from joblib import dump, load

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

from itertools import product

import time

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Functions

In [7]:
def train_test_split_feature_selection(data, dep_feature, test_size=0.2, random_state=614):
    # Separate X and y from the dataframe that has all columns
    X = data.drop(columns=[dep_feature])
    y = data[dep_feature]
    
    # Split train and test datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    return X_train, X_test, y_train, y_test

In [48]:
def cross_validation_mean_scores(model, X, y, sampling, curves):

#     X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random_state)
#     X_train = X_train.reset_index(drop=True)
#     X_val = X_val.reset_index(drop=True)
#     y_train = y_train.reset_index(drop=True)
#     y_val = y_val.reset_index(drop=True)

    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)
    
    metrics = {'neg_log_loss':[], 'accuracy':[], 'precision':[], 'recall':[], 'f1':[], 'roc_auc':[], 'ap':[]}
    estimators = []
    
    X_parts = np.array_split(X, 5)
    for i in range(5):
        X_val_temp = X.loc[list(X_parts[i].index)]
        y_val_temp = y.loc[list(X_parts[i].index)]
        
        for j in range(5):
            if j == 0:
                X_train_temp = pd.DataFrame()
                y_train_temp = pd.Series()
            if j != i:
                X_train_temp = X_train_temp.append(X.loc[list(X_parts[j].index)])
                y_train_temp = y_train_temp.append(y.loc[list(X_parts[j].index)])
        
        if sampling == 'over':
            X_train_temp, y_train_temp = over_sampling(X_train_temp, y_train_temp)
        elif sampling == 'under':
            X_train_temp, y_train_temp = under_sampling(X_train_temp, y_train_temp)
        
        model.fit(X_train_temp, y_train_temp)
        y_pred_val = model.predict(X_val_temp)
        y_prob_val = model.predict_proba(X_val_temp)[:,-1]
        
        neg_log_loss = -1*log_loss(y_val_temp, y_pred_val)
        accuracy = accuracy_score(y_val_temp, y_pred_val)
        precision = precision_score(y_val_temp, y_pred_val)
        recall = recall_score(y_val_temp, y_pred_val)
        f1 = f1_score(y_val_temp, y_pred_val)
        roc_auc = roc_auc_score(y_val_temp, y_prob_val)
        ap = average_precision_score(y_val_temp, y_prob_val)
        
        metrics['neg_log_loss'].append(neg_log_loss)
        metrics['accuracy'].append(accuracy)
        metrics['precision'].append(precision)
        metrics['recall'].append(recall)
        metrics['f1'].append(f1)
        metrics['roc_auc'].append(roc_auc)
        metrics['ap'].append(ap)
        
        estimators.append(model)
        
        if curves:
            roc_curve_plot(model, X_val_temp, y_val_temp)
            prec_recall_curve_plot(model, X_val_temp, y_val_temp)
        
    # Calculate the positive mean of the metrics
    metrics_mean = list()
    for metric in metrics.keys():
        if list(metrics.keys()).index(metric) == 0:
            best_estimator = estimators[np.nanargmax(np.array(metrics[metric]))]
        metrics_mean.append(np.nanmean(np.array(metrics[metric])))
    
    # Save the best estimator
    save_path = ['./models/', str(best_estimator).split('(')[0], '_', str(metrics_mean[0])]
    dump(best_estimator, ''.join(save_path + ['.joblib']))
    
    return best_estimator, metrics_mean

In [9]:
def scatterplot(x, y, title, xlabel, ylabel, show=True):
    plt.figure(figsize=(14,7))
    sns.scatterplot(x=x, y=y)
    plt.title(title)
    plt.xlabel(xlabel, fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    if show:
        plt.show()

In [10]:
def plot_residuals(model, pred, resid):
    title = 'Residual plot of {}'.format(model)
    xlabel = 'y_predicted'
    ylabel = 'y_observed - y_predicted'
    scatterplot(pred, resid, title, xlabel, ylabel)

In [11]:
def plot_pred_obs(y_test, model, pred):
    title = 'Predicted vs Observed plot of {}'.format(model)
    xlabel = 'observed'
    ylabel = 'predicted'
    scatterplot(y_test, pred, title, xlabel, ylabel, show=False)
    plt.plot(y_test, y_test, 'b-')
    plt.show()

In [12]:
def shap_features_plot(model, X):
    # Barplot of the shap values if available
    try:
        explainer = shap.explainers.Tree(model)
        shap_values = explainer.shap_values(X)
        shap.plots.bar(shap.Explanation(shap_values))
    except:
        pass

In [13]:
def feature_importance(model):
    # Print feature importance if available
    try:
        feature_importance = ['{}: {}'.format(f,i) for f,i in zip(model.feature_name_, model.feature_importances_)]
        print('Feature Importance: ' + ', '.join(feature_importance))
    except:
        pass

In [37]:
def model_train(X, y, metrics, model, sampling, curves):
    # Call cross-validation and take the total processing time of it
    t0 = time.time()
    best_estimator, metrics_mean = cross_validation_mean_scores(model, X, y, sampling, curves)
    t1 = time.time() - t0
    
    # Print a summary of the metrics for the best fitted estimator
    print('-> Model: {}'.format(best_estimator))
    model_summary = ['Train {}: {:.4f}'.format(m.replace('neg_',''),s) for m,s in zip(metrics, metrics_mean)] + ['Proc Time: {:.2f}'.format(t1)]
    print(', '.join(model_summary))

    return best_estimator

In [15]:
def model_pred(model, X, y):
    # Predict y
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:,-1]
    
    # Calculate scores
    neg_log_loss = -1*log_loss(y, y_pred)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    roc_auc = roc_auc_score(y, y_prob)
    ap = average_precision_score(y, y_prob)
    
    # Print MSE and MAPE
    print('Test NegLogLoss {:.4f}, Test Accuracy {:.4f}, Test Precision {:.4f}, Test Recall {:.4f}, Test F1 {:.4f}, Test ROC_AUC {:.4f}, Test AP {:.4f}'.format(neg_log_loss,
                                                                                                                                                 accuracy, 
                                                                                                                                                 precision, 
                                                                                                                                                 recall,
                                                                                                                                                 f1,
                                                                                                                                                 roc_auc, 
                                                                                                                                                 ap))

In [16]:
def conf_matrix(model, X, y):
#     y_actual = pd.Series(y.copy(), name='Actual')
#     y_pred = pd.Series(model.predict(X), name='Predicted')
#     df_confusion = pd.crosstab(y_actual, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
#     print(df_confusion)
    
    conf_mat = np.array(confusion_matrix(y, model.predict(X)))
    print('Confusion Matrix with test dataset into the best model: \n\t\tApproved\tRejected\nApproved\t{} \t\t{} \nRejected\t{} \t\t{} \n'.format(conf_mat[0,0], conf_mat[0,1], conf_mat[1,0], conf_mat[1,1]))

In [17]:
def over_sampling(X, y, random_state=614):
    X_ros, y_ros = RandomOverSampler(random_state=random_state).fit_resample(X, y)
    # X_sm, y_sm = SMOTE(random_state=random_state, n_jobs=-1).fit_resample(X, y)
    return X_ros, y_ros

In [18]:
def under_sampling(X, y, random_state=614):
    X_rus, y_rus = RandomUnderSampler(random_state=random_state).fit_resample(X, y)
    return X_rus, y_rus

In [46]:
def roc_curve_plot(model, X, y):
    y_pred = model.predict_proba(X)[:,-1]
    false_pos_rate, true_pos_rate, proba = roc_curve(y, y_pred)
    plt.figure()
    plt.plot([0,1], [0,1], linestyle="--")
    plt.plot(false_pos_rate, true_pos_rate, marker=".", label='AUC: {}'.format(roc_auc_score(y, y_pred)))
    plt.title("ROC Curve")
    plt.ylabel("True Positive Rate")
    plt.xlabel("False Positive Rate")
    plt.legend(loc="lower right")
    plt.show()

In [47]:
def prec_recall_curve_plot(model, X, y):
    y_pred = model.predict_proba(X)[:,-1]
    precision, recall, proba = precision_recall_curve(y, y_pred)
    plt.figure()
    plt.plot(recall, precision, marker=".", label='AP: {}'.format(average_precision_score(y, y_pred)))
    plt.title("Precision-Recall Curve.")
    plt.ylabel("Precision")
    plt.xlabel("Recall")
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.legend(loc="lower right")
    plt.show()

In [42]:
def model_analysis(X_train, X_test, y_train, y_test, models, metrics, sampling=None, curves=False):
    # Prepare X_train and X_test with one-hot and dummy encoding
    X_train_hot, X_test_hot = pd.get_dummies(X_train), pd.get_dummies(X_test)
    X_train_dum, X_test_dum = pd.get_dummies(X_train, drop_first=True), pd.get_dummies(X_test, drop_first=True)
    
    # Iterate through all models
    for model in models:
        # Model estimator and encoding
        estimator = model['estimator']
        if len(model['params']) != 0:
            estimator.set_params(**model['params'])
        encoding = model['encoding']
        # Perform cross-validation, feature importance, and model prediction according to encoding
        if encoding == 'hot':
            # Cross-validation with train dataset
            best_estimator = model_train(X_train_hot, y_train, metrics, estimator, sampling, curves)

            # Print scores for test dataset
            model_pred(best_estimator, X_test_hot, y_test)
            conf_matrix(best_estimator, X_test_hot, y_test)

            # Print feature importance or shap values if available for the estimator
            feature_importance(best_estimator)
        elif encoding == 'dummy':
            # Cross-validation with train dataset
            best_estimator = model_train(X_train_dum, y_train, metrics, estimator, sampling, curves)

            # Print scores for test dataset
            model_pred(best_estimator, X_test_dum, y_test)
            conf_matrix(best_estimator, X_test_dum, y_test)

            # Print feature importance or shap values if available for the estimator
            feature_importance(best_estimator)
        else:
            # Cross-validation with train dataset
            best_estimator = model_train(X_train, y_train, metrics, estimator, sampling, curves)

            # Print scores for test dataset
            model_pred(best_estimator, X_test, y_test)
            conf_matrix(best_estimator, X_test, y_test)

            # Print feature importance or shap values if available for the estimator
            feature_importance(best_estimator)

            print('\n')

## Fit default estimators

In [22]:
dep_feature = 'rejected'

In [23]:
ind_features = ['month_name', 
                'day_name',
                'pharmacy',
                'bin_pcn_group', 
                'drug_brand', 
                'drug_name', 
                'diag_letter']

In [24]:
random_state = 614

In [25]:
metrics = ['neg_log_loss','accuracy','precision','recall','f1','roc_auc']

In [89]:
# List of dictionaires with models. Keys to be included: name, estimator, encoding, and params. Empty params for default.
class_models = [{'name':'LGBMClassifier', 'estimator':LGBMClassifier(random_state=random_state, n_jobs=-1, importance_type='gain'), 'encoding':None, 
               'params':{}
              },
              {'name':'LogisticRegression', 'estimator':LogisticRegression(random_state=random_state, n_jobs=-1), 'encoding':'dummy', 
               'params':{}
              }]

In [90]:
X_train, X_test, y_train, y_test = train_test_split_feature_selection(data.sample(1000000, random_state=random_state), dep_feature)

In [91]:
X_train = X_train[ind_features].copy().reset_index(drop=True)
X_test = X_test[ind_features].copy().reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

### No resample

In [92]:
model_analysis(X_train, X_test, y_train, y_test, class_models, metrics, sampling=None)

-> Model: LGBMClassifier(importance_type='gain', random_state=614)
Train log_loss: -2.8983, Train accuracy: 0.9161, Train precision: 0.2000, Train recall: 0.0000, Train f1: 0.0001, Train roc_auc: 0.8965, Proc Time: 7.13
Test NegLogLoss -2.8793, Test Accuracy 0.9166, Test Precision 0.0000, Test Recall 0.0000, Test F1 0.0000, Test ROC_AUC 0.8963, Test AP 0.3168
Confusion Matrix with test dataset into the best model: 
		Approved	Rejected
Approved	183327 		2 
Rejected	16671 		0 

Feature Importance: month_name: 1014.6384798288345, day_name: 400.03614938259125, pharmacy: 15056.776073217392, bin_pcn_group: 69894.48554372787, drug_brand: 53862.84447526932, drug_name: 610249.0320568085, diag_letter: 2674.774471640587


-> Model: LogisticRegression(n_jobs=1, random_state=614)
Train log_loss: -2.8989, Train accuracy: 0.9161, Train precision: 0.3865, Train recall: 0.0003, Train f1: 0.0007, Train roc_auc: 0.8887, Proc Time: 45.49
Test NegLogLoss -2.8798, Test Accuracy 0.9166, Test Precision 0.2222

### Under sampling

In [93]:
model_analysis(X_train, X_test, y_train, y_test, class_models, metrics, sampling='under')

-> Model: LGBMClassifier(importance_type='gain', random_state=614)
Train log_loss: -7.7990, Train accuracy: 0.7742, Train precision: 0.2696, Train recall: 0.9894, Train f1: 0.4237, Train roc_auc: 0.8938, Proc Time: 5.51
Test NegLogLoss -7.7982, Test Accuracy 0.7742, Test Precision 0.2681, Test Recall 0.9880, Test F1 0.4218, Test ROC_AUC 0.8931, Test AP 0.3094
Confusion Matrix with test dataset into the best model: 
		Approved	Rejected
Approved	138374 		44955 
Rejected	200 		16471 

Feature Importance: month_name: 1052.0977656841278, day_name: 220.11667943000793, pharmacy: 12393.505387187004, bin_pcn_group: 43982.16089183092, drug_brand: 37124.52215510607, drug_name: 354397.4075938463, diag_letter: 1121.7583729624748


-> Model: LogisticRegression(n_jobs=1, random_state=614)
Train log_loss: -7.6904, Train accuracy: 0.7773, Train precision: 0.2639, Train recall: 0.9237, Train f1: 0.4105, Train roc_auc: 0.8866, Proc Time: 14.87
Test NegLogLoss -7.7015, Test Accuracy 0.7770, Test Precision

### Over sampling

In [94]:
model_analysis(X_train, X_test, y_train, y_test, class_models, metrics, sampling='over')

-> Model: LGBMClassifier(importance_type='gain', random_state=614)
Train log_loss: -7.6719, Train accuracy: 0.7779, Train precision: 0.2729, Train recall: 0.9898, Train f1: 0.4279, Train roc_auc: 0.8966, Proc Time: 15.24
Test NegLogLoss -7.6464, Test Accuracy 0.7786, Test Precision 0.2722, Test Recall 0.9893, Test F1 0.4269, Test ROC_AUC 0.8961, Test AP 0.3152
Confusion Matrix with test dataset into the best model: 
		Approved	Rejected
Approved	139231 		44098 
Rejected	178 		16493 

Feature Importance: month_name: 3089.8508553504944, day_name: 651.6564726829529, pharmacy: 65386.47308254242, bin_pcn_group: 464145.2348036766, drug_brand: 383395.19847917557, drug_name: 3901507.930141926, diag_letter: 12724.14035320282


-> Model: LogisticRegression(n_jobs=1, random_state=614)
Train log_loss: -7.6876, Train accuracy: 0.7774, Train precision: 0.2639, Train recall: 0.9236, Train f1: 0.4105, Train roc_auc: 0.8870, Proc Time: 93.84
Test NegLogLoss -7.6842, Test Accuracy 0.7775, Test Precision 

## Model Tuning

### Under sampling

In [96]:
# List of dictionaires with models. Keys to be included: name, estimator, encoding, and params.
all_params = list(product(*[['gbdt', 'dart'], [600, 900], [0.5, 0.1], [248], [6]]))
for params in all_params:
    class_models = [{'name':'LGBMClassifier', 'estimator':LGBMClassifier(random_state=random_state, n_jobs=-1, importance_type='gain'), 'encoding':None, 
                   'params':{'boosting_type':params[0],
                             'n_estimators':params[1],
                             'learning_rate':params[2],
                             'num_leaves':params[3],
                             'max_depth':params[4]
                            }
                      }]
    model_analysis(X_train, X_test, y_train, y_test, class_models, metrics, sampling='under')

all_params = list(product(*[['lbfgs', 'sag', 'saga'], [0.01, 0.1, 1.0]]))
for params in all_params:
    class_models = [{'name':'LogisticRegression', 'estimator':LogisticRegression(random_state=random_state, n_jobs=-1), 'encoding':'dummy', 
                       'params':{'solver':params[0], 
                                 'C':params[1]}
                      }]

    model_analysis(X_train, X_test, y_train, y_test, class_models, metrics, sampling='under')

-> Model: LGBMClassifier(importance_type='gain', learning_rate=0.5, max_depth=6,
               n_estimators=600, num_leaves=248, random_state=614)
Train log_loss: -7.1975, Train accuracy: 0.7916, Train precision: 0.2735, Train recall: 0.8958, Train f1: 0.4191, Train roc_auc: 0.8827, Proc Time: 22.25
Test NegLogLoss -7.1619, Test Accuracy 0.7926, Test Precision 0.2732, Test Recall 0.8959, Test F1 0.4187, Test ROC_AUC 0.8826, Test AP 0.2848
Confusion Matrix with test dataset into the best model: 
		Approved	Rejected
Approved	143594 		39735 
Rejected	1736 		14935 

Feature Importance: month_name: 9145.591007133946, day_name: 3562.293183497637, pharmacy: 27996.34331912233, bin_pcn_group: 26280.366894334555, drug_brand: 9375.606495789947, drug_name: 99816.40008369938, diag_letter: 5380.387713730801


-> Model: LGBMClassifier(importance_type='gain', max_depth=6, n_estimators=600,
               num_leaves=248, random_state=614)
Train log_loss: -7.4710, Train accuracy: 0.7837, Train precisio

-> Model: LogisticRegression(C=0.1, n_jobs=1, random_state=614, solver='sag')
Train log_loss: -7.6726, Train accuracy: 0.7779, Train precision: 0.2638, Train recall: 0.9202, Train f1: 0.4101, Train roc_auc: 0.8860, Proc Time: 24.30
Test NegLogLoss -7.6667, Test Accuracy 0.7780, Test Precision 0.2626, Test Recall 0.9196, Test F1 0.4085, Test ROC_AUC 0.8851, Test AP 0.2971
Confusion Matrix with test dataset into the best model: 
		Approved	Rejected
Approved	140276 		43053 
Rejected	1341 		15330 



-> Model: LogisticRegression(n_jobs=1, random_state=614, solver='sag')
Train log_loss: -7.6904, Train accuracy: 0.7773, Train precision: 0.2638, Train recall: 0.9236, Train f1: 0.4104, Train roc_auc: 0.8866, Proc Time: 38.02
Test NegLogLoss -7.6947, Test Accuracy 0.7772, Test Precision 0.2625, Test Recall 0.9241, Test F1 0.4088, Test ROC_AUC 0.8858, Test AP 0.2972
Confusion Matrix with test dataset into the best model: 
		Approved	Rejected
Approved	140039 		43290 
Rejected	1266 		15405 



-> 

## Fit best estimators

In [31]:
# List of dictionaires with models. Keys to be included: name, estimator, encoding, and params.
best_estimators = [{'name':'LGBMClassifier', 'estimator':LGBMClassifier(random_state=random_state, n_jobs=-1, importance_type='gain'), 'encoding':None, 
                   'params':{'boosting_type':'dart',
                             'n_estimators':900,
                             'learning_rate':0.1,
                             'num_leaves':248,
                             'max_depth':6
                            }
                  },
                  {'name':'LogisticRegression', 'estimator':LogisticRegression(random_state=random_state, n_jobs=-1), 'encoding':'dummy', 
                   'params':{}
                  }]

### 1 million sample dataset and keeping all features

In [50]:
X_train, X_test, y_train, y_test = train_test_split_feature_selection(data.sample(1000000, random_state=random_state), dep_feature)

In [51]:
X_train = X_train[ind_features].copy().reset_index(drop=True)
X_test = X_test[ind_features].copy().reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

### No resample

In [53]:
model_analysis(X_train, X_test, y_train, y_test, best_estimators, metrics, sampling=None, curves=False)

-> Model: LGBMClassifier(boosting_type='dart', importance_type='gain', max_depth=6,
               n_estimators=900, num_leaves=248, random_state=614)
Train log_loss: -2.9018, Train accuracy: 0.9160, Train precision: 0.2875, Train recall: 0.0008, Train f1: 0.0017, Train roc_auc: 0.8952, Proc Time: 573.05
Test NegLogLoss -2.8835, Test Accuracy 0.9165, Test Precision 0.2045, Test Recall 0.0005, Test F1 0.0011, Test ROC_AUC 0.8950, Test AP 0.3131
Confusion Matrix with test dataset into the best model: 
		Approved	Rejected
Approved	183294 		35 
Rejected	16662 		9 

Feature Importance: month_name: 21101.392351688846, day_name: 5845.179598968476, pharmacy: 125054.6878692942, bin_pcn_group: 286234.68174724374, drug_brand: 135243.38745558498, drug_name: 1530427.1331846751, diag_letter: 22727.01460572006


-> Model: LogisticRegression(n_jobs=1, random_state=614)
Train log_loss: -2.8989, Train accuracy: 0.9161, Train precision: 0.3865, Train recall: 0.0003, Train f1: 0.0007, Train roc_auc: 0.888

### Under sampling

In [54]:
model_analysis(X_train, X_test, y_train, y_test, best_estimators, metrics, sampling='under', curves=False)

-> Model: LGBMClassifier(boosting_type='dart', importance_type='gain', max_depth=6,
               n_estimators=900, num_leaves=248, random_state=614)
Train log_loss: -7.6471, Train accuracy: 0.7786, Train precision: 0.2730, Train recall: 0.9849, Train f1: 0.4274, Train roc_auc: 0.8907, Proc Time: 140.64
Test NegLogLoss -7.6227, Test Accuracy 0.7793, Test Precision 0.2722, Test Recall 0.9840, Test F1 0.4264, Test ROC_AUC 0.8898, Test AP 0.3006
Confusion Matrix with test dataset into the best model: 
		Approved	Rejected
Approved	139456 		43873 
Rejected	266 		16405 

Feature Importance: month_name: 14322.705016475637, day_name: 3681.534614039585, pharmacy: 89538.4095121636, bin_pcn_group: 195074.48230042693, drug_brand: 99801.60242862481, drug_name: 920342.7632730878, diag_letter: 13826.668092693202


-> Model: LogisticRegression(n_jobs=1, random_state=614)
Train log_loss: -7.6904, Train accuracy: 0.7773, Train precision: 0.2639, Train recall: 0.9237, Train f1: 0.4105, Train roc_auc: 0.

### Over sampling

In [55]:
model_analysis(X_train, X_test, y_train, y_test, best_estimators, metrics, sampling='over', curves=False)

-> Model: LGBMClassifier(boosting_type='dart', importance_type='gain', max_depth=6,
               n_estimators=900, num_leaves=248, random_state=614)
Train log_loss: -7.4809, Train accuracy: 0.7834, Train precision: 0.2762, Train recall: 0.9753, Train f1: 0.4304, Train roc_auc: 0.8954, Proc Time: 1084.51
Test NegLogLoss -7.4728, Test Accuracy 0.7836, Test Precision 0.2751, Test Recall 0.9761, Test F1 0.4293, Test ROC_AUC 0.8951, Test AP 0.3139
Confusion Matrix with test dataset into the best model: 
		Approved	Rejected
Approved	140456 		42873 
Rejected	398 		16273 

Feature Importance: month_name: 70609.42877195752, day_name: 19375.43375968933, pharmacy: 495098.029482109, bin_pcn_group: 1834416.882346809, drug_brand: 930252.9187456628, drug_name: 9884308.670326069, diag_letter: 91926.23599145561


-> Model: LogisticRegression(n_jobs=1, random_state=614)
Train log_loss: -7.6876, Train accuracy: 0.7774, Train precision: 0.2639, Train recall: 0.9236, Train f1: 0.4105, Train roc_auc: 0.88

### 1 million sample dataset and keeping only plan-, drug-, and diagnosis-related features

In [56]:
ind_features = ['bin_pcn_group', 
                'drug_brand', 
                'drug_name', 
                'diag_letter']

In [57]:
X_train, X_test, y_train, y_test = train_test_split_feature_selection(data.sample(1000000,random_state=random_state), dep_feature)

In [58]:
X_train = X_train[ind_features].copy().reset_index(drop=True)
X_test = X_test[ind_features].copy().reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

### No resample

In [59]:
model_analysis(X_train, X_test, y_train, y_test, best_estimators, metrics, sampling=None, curves=False)

-> Model: LGBMClassifier(boosting_type='dart', importance_type='gain', max_depth=6,
               n_estimators=900, num_leaves=248, random_state=614)
Train log_loss: -2.9025, Train accuracy: 0.9160, Train precision: 0.3032, Train recall: 0.0010, Train f1: 0.0021, Train roc_auc: 0.8972, Proc Time: 559.81
Test NegLogLoss -2.8811, Test Accuracy 0.9166, Test Precision 0.3333, Test Recall 0.0007, Test F1 0.0014, Test ROC_AUC 0.8963, Test AP 0.3149
Confusion Matrix with test dataset into the best model: 
		Approved	Rejected
Approved	183305 		24 
Rejected	16659 		12 

Feature Importance: bin_pcn_group: 251528.8164871902, drug_brand: 134858.32882594474, drug_name: 1502017.317035113, diag_letter: 24329.940329660894


-> Model: LogisticRegression(n_jobs=1, random_state=614)
Train log_loss: -2.8991, Train accuracy: 0.9161, Train precision: 0.2886, Train recall: 0.0004, Train f1: 0.0008, Train roc_auc: 0.8888, Proc Time: 27.15
Test NegLogLoss -2.8790, Test Accuracy 0.9166, Test Precision 0.0000, 

### Under sampling

In [60]:
model_analysis(X_train, X_test, y_train, y_test, best_estimators, metrics, sampling='under', curves=False)

-> Model: LGBMClassifier(boosting_type='dart', importance_type='gain', max_depth=6,
               n_estimators=900, num_leaves=248, random_state=614)
Train log_loss: -7.6474, Train accuracy: 0.7786, Train precision: 0.2740, Train recall: 0.9932, Train f1: 0.4295, Train roc_auc: 0.8937, Proc Time: 129.20
Test NegLogLoss -7.6303, Test Accuracy 0.7791, Test Precision 0.2731, Test Recall 0.9929, Test F1 0.4283, Test ROC_AUC 0.8928, Test AP 0.3058
Confusion Matrix with test dataset into the best model: 
		Approved	Rejected
Approved	139264 		44065 
Rejected	118 		16553 

Feature Importance: bin_pcn_group: 168490.51611702738, drug_brand: 99559.1972856368, drug_name: 899558.080871395, diag_letter: 15202.892036322592


-> Model: LogisticRegression(n_jobs=1, random_state=614)
Train log_loss: -7.6845, Train accuracy: 0.7775, Train precision: 0.2639, Train recall: 0.9231, Train f1: 0.4105, Train roc_auc: 0.8868, Proc Time: 9.89
Test NegLogLoss -7.6935, Test Accuracy 0.7773, Test Precision 0.2624,

### Over sampling

In [61]:
model_analysis(X_train, X_test, y_train, y_test, best_estimators, metrics, sampling='over', curves=False)

-> Model: LGBMClassifier(boosting_type='dart', importance_type='gain', max_depth=6,
               n_estimators=900, num_leaves=248, random_state=614)
Train log_loss: -7.6304, Train accuracy: 0.7791, Train precision: 0.2743, Train recall: 0.9922, Train f1: 0.4298, Train roc_auc: 0.8970, Proc Time: 1134.07
Test NegLogLoss -7.6167, Test Accuracy 0.7795, Test Precision 0.2734, Test Recall 0.9929, Test F1 0.4288, Test ROC_AUC 0.8963, Test AP 0.3152
Confusion Matrix with test dataset into the best model: 
		Approved	Rejected
Approved	139344 		43985 
Rejected	119 		16552 

Feature Importance: bin_pcn_group: 1748485.522760203, drug_brand: 926456.7632331903, drug_name: 9761797.66451547, diag_letter: 109391.95908555994


-> Model: LogisticRegression(n_jobs=1, random_state=614)
Train log_loss: -7.6853, Train accuracy: 0.7775, Train precision: 0.2639, Train recall: 0.9228, Train f1: 0.4104, Train roc_auc: 0.8870, Proc Time: 59.26
Test NegLogLoss -7.6742, Test Accuracy 0.7778, Test Precision 0.262