# Support vector machine (SVM) classifier for prediction of pharmacy claim approval
by Bhargava Nemmaru and Apostolos Zournas

The following program is meant to take drug type and payer information as an input vector and return a variable indicating whether or not the pharmacy claim will be approved. This program uses data provided by CoverMyMeds for model training and testing.

The program is broken into the following sections:
1. Data pre-processing
2. Cross validation tests and prediction
3. Precision recall testing
4. Model storage

In [None]:
# Import the necessary packages: (i) sklearn for machine learning algorithms (ii) pandas for storing and manipulating data
# frames (iii) numpy for creation of arrays and manipulating them (iv) matplotlib for generation of plots (v) sqlite3 to
# work with databases
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3

In [20]:
# Read the csv file which contains claims information and display the first few columns
claims = pd.read_csv('data/dim_claims.csv')
print(claims.head)

# Assess the number of rejections under each rejection code
print(claims['reject_code'].value_counts())

<bound method NDFrame.head of          dim_claim_id     bin drug  reject_code  pharmacy_claim_approved
0                   1  417380    A         75.0                        0
1                   2  999001    A          NaN                        1
2                   3  417740    A         76.0                        0
3                   4  999001    A          NaN                        1
4                   5  417740    A          NaN                        1
...               ...     ...  ...          ...                      ...
1335571       1335572  417740    C         75.0                        0
1335572       1335573  999001    C          NaN                        1
1335573       1335574  417380    C         70.0                        0
1335574       1335575  999001    C          NaN                        1
1335575       1335576  999001    C          NaN                        1

[1335576 rows x 5 columns]>
70.0    252206
75.0    217351
76.0     86394
Name: reject_code, d

In [21]:
# Read the csv file which contains prior authorization information and display it
pa = pd.read_csv('data/dim_pa.csv')
pa.head

<bound method NDFrame.head of         dim_pa_id  correct_diagnosis  tried_and_failed  contraindication  \
0               1                  1                 1                 0   
1               2                  1                 0                 0   
2               3                  0                 0                 1   
3               4                  1                 1                 0   
4               5                  0                 1                 0   
...           ...                ...               ...               ...   
555946     555947                  1                 0                 1   
555947     555948                  1                 0                 1   
555948     555949                  1                 1                 1   
555949     555950                  1                 0                 0   
555950     555951                  0                 0                 1   

        pa_approved  
0                 1  
1            

In [22]:
# The bridge file is used to establish a connection between the claims file and the prior authorization file
bridge = pd.read_csv('data/bridge.csv')
bridge.head

<bound method NDFrame.head of          dim_claim_id  dim_pa_id  dim_date_id
0                   1        1.0            1
1                   2        NaN            1
2                   3        2.0            1
3                   4        NaN            1
4                   5        NaN            1
...               ...        ...          ...
1335571       1335572   555950.0         1095
1335572       1335573        NaN         1095
1335573       1335574   555951.0         1095
1335574       1335575        NaN         1095
1335575       1335576        NaN         1095

[1335576 rows x 3 columns]>

In [23]:
# idx is a numpy array meant to retain only those columns with a numeric dim_pa_id and remove NaNs
idx = np.where(bridge['dim_pa_id'].values == bridge['dim_pa_id'].values)
df = bridge.copy()
idx

(array([      0,       2,       9, ..., 1335558, 1335571, 1335573],
       dtype=int64),)

In [24]:
# To cross-check whether the number of rows in pa data frame is equal to those in pa
print(idx[0])
print(len(idx[0]))
print(len(pa))

# We are now going to generate a few arrays all of which show follow a similar assignment - Incorrect assignment (remove it from this code later)
# The next few code chunks are to ensure that we build a data frame that is compatible with other notebooks in this project
# Anything related to pa is not at all relevant to this notebook since we are only concerned about pharmacy claims being approved or rejected
# Hence the following code chunks are just labeled as 'preparing the data frame'
tried_and_failed = np.zeros((len(claims)))
tried_and_failed[:] = np.nan
tried_and_failed[idx[0]] = pa['correct_diagnosis'].values
tried_and_failed = tried_and_failed.reshape(-1,1)

[      0       2       9 ... 1335558 1335571 1335573]
555951
555951


In [25]:
# Preparing the data frame
correct_diagnosis = np.zeros((len(claims)))
correct_diagnosis[:] = np.nan
correct_diagnosis[idx[0]] = pa['correct_diagnosis'].values
correct_diagnosis = correct_diagnosis.reshape(-1,1)

In [26]:
# Preparing the data frame
contraindication = np.zeros((len(claims)))
contraindication[:] = np.nan
contraindication[idx[0]] = pa['contraindication'].values
contraindication = correct_diagnosis.reshape(-1,1)  

In [27]:
# Preparing the data frame
pa_approved = np.zeros((len(claims)))
pa_approved[:] = np.nan
pa_approved[idx[0]] = pa['pa_approved'].values
pa_approved = correct_diagnosis.reshape(-1,1)

In [28]:
# Preparing the data frame
claims['tried_and_failed'] = tried_and_failed
claims['correct_diagnosis'] = correct_diagnosis
claims['contraindication'] = contraindication
claims['pa_approved'] = pa_approved
claims.head()

Unnamed: 0,dim_claim_id,bin,drug,reject_code,pharmacy_claim_approved,tried_and_failed,correct_diagnosis,contraindication,pa_approved
0,1,417380,A,75.0,0,1.0,1.0,1.0,1.0
1,2,999001,A,,1,,,,
2,3,417740,A,76.0,0,1.0,1.0,1.0,1.0
3,4,999001,A,,1,,,,
4,5,417740,A,,1,,,,


In [29]:
# Using one-hot encoding for reject code, drug type and payer (or insurance company)
claims_1hot = pd.get_dummies(data=claims, columns=['reject_code', 'drug', 'bin'])
y_data_70 = claims_1hot['reject_code_70.0']
y_data_75 = claims_1hot['reject_code_75.0']
y_data_76 = claims_1hot['reject_code_76.0']
claims_1hot_no_nans = claims_1hot.replace(np.nan, 0)
claims_1hot.head(n=50)

Unnamed: 0,dim_claim_id,pharmacy_claim_approved,tried_and_failed,correct_diagnosis,contraindication,pa_approved,reject_code_70.0,reject_code_75.0,reject_code_76.0,drug_A,drug_B,drug_C,bin_417380,bin_417614,bin_417740,bin_999001
0,1,0,1.0,1.0,1.0,1.0,0,1,0,1,0,0,1,0,0,0
1,2,1,,,,,0,0,0,1,0,0,0,0,0,1
2,3,0,1.0,1.0,1.0,1.0,0,0,1,1,0,0,0,0,1,0
3,4,1,,,,,0,0,0,1,0,0,0,0,0,1
4,5,1,,,,,0,0,0,1,0,0,0,0,1,0
5,6,1,,,,,0,0,0,1,0,0,0,0,1,0
6,7,1,,,,,0,0,0,1,0,0,0,0,0,1
7,8,1,,,,,0,0,0,1,0,0,0,0,1,0
8,9,1,,,,,0,0,0,1,0,0,0,0,0,1
9,10,0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,1,0,0,0


In [30]:
# One hot encoding for reject codes
X_data = claims_1hot_no_nans[['pharmacy_claim_approved', 'tried_and_failed', 'correct_diagnosis', 'contraindication', 'pa_approved', 'drug_A', 'drug_B', 'drug_C']]
y_data_70 = claims_1hot_no_nans['reject_code_70.0']
y_data_75 = claims_1hot_no_nans['reject_code_75.0']
y_data_76 = claims_1hot_no_nans['reject_code_76.0']
print(len(y_data_70), len(y_data_75), len(y_data_76))

1335576 1335576 1335576


In [31]:
# importing packages essential for building, testing and computing performance matrics an SVM based classifier: 
# (i) SVC for building a support vector classifier (ii) kfold for cross-validation (iii) pickle for saving the model
from sklearn.svm import SVC
from joblib import dump, load
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.base import clone
import pickle
import time # For timing the run
import random

In [15]:
claims[claims['pa_approved'] == 1]['reject_code'].value_counts()

# This is where we build the X and y arrays needed for building the model


X_data_cols = ['drug_A', 'drug_B', 'drug_C', 'bin_417380', 'bin_417614', 'bin_417740', 'bin_999001']
X_data_pharm = claims_1hot_no_nans[X_data_cols].values
claims_1hot_no_nans.head()
y_data_pharm = claims_1hot_no_nans['pharmacy_claim_approved'].values

In [16]:
# Exploratory data analysis
# 1. Plot variables one against the other 2. Get the number of each drug that exists within a certain payer bin
# Here is the code for part 2
# To calculate the fraction of drug A that falls under payer bin_417380

# Start off with drug A
count_drug_A = np.where(X_data_pharm[:,0] == 1)

count_drug_A_bin_417380 = np.where(X_data_pharm[count_drug_A[0],3] == 1)
frac_drug_A_bin_417380 = 100*len(count_drug_A_bin_417380[0])/len(count_drug_A[0])
print("The fraction of drug A claims that fall under bin_417380 are %s" % str(frac_drug_A_bin_417380))

count_drug_A_bin_417614 = np.where(X_data_pharm[count_drug_A[0],4] == 1)
frac_drug_A_bin_417614 = 100*len(count_drug_A_bin_417614[0])/len(count_drug_A[0])
print("The fraction of drug A claims that fall under bin_417614 are %s" % str(frac_drug_A_bin_417614))

count_drug_A_bin_417740 = np.where(X_data_pharm[count_drug_A[0],5] == 1)
frac_drug_A_bin_417740 = 100*len(count_drug_A_bin_417740[0])/len(count_drug_A[0])
print("The fraction of drug A claims that fall under bin_417740 are %s" % str(frac_drug_A_bin_417740))

count_drug_A_bin_999001 = np.where(X_data_pharm[count_drug_A[0],6] == 1)
frac_drug_A_bin_999001 = 100*len(count_drug_A_bin_999001[0])/len(count_drug_A[0])
print("The fraction of drug A claims that fall under bin_999001 are %s" % str(frac_drug_A_bin_999001))

# Drug B fractions
count_drug_B = np.where(X_data_pharm[:,1] == 1)

count_drug_B_bin_417380 = np.where(X_data_pharm[count_drug_B[0],3] == 1)
frac_drug_B_bin_417380 = 100*len(count_drug_B_bin_417380[0])/len(count_drug_B[0])
print("The fraction of drug B claims that fall under bin_417380 are %s" % str(frac_drug_B_bin_417380))

count_drug_B_bin_417614 = np.where(X_data_pharm[count_drug_B[0],4] == 1)
frac_drug_B_bin_417614 = 100*len(count_drug_B_bin_417614[0])/len(count_drug_B[0])
print("The fraction of drug B claims that fall under bin_417614 are %s" % str(frac_drug_B_bin_417614))

count_drug_B_bin_417740 = np.where(X_data_pharm[count_drug_B[0],5] == 1)
frac_drug_B_bin_417740 = 100*len(count_drug_B_bin_417740[0])/len(count_drug_B[0])
print("The fraction of drug B claims that fall under bin_417740 are %s" % str(frac_drug_B_bin_417740))

count_drug_B_bin_999001 = np.where(X_data_pharm[count_drug_B[0],6] == 1)
frac_drug_B_bin_999001 = 100*len(count_drug_B_bin_999001[0])/len(count_drug_B[0])
print("The fraction of drug B claims that fall under bin_999001 are %s" % str(frac_drug_B_bin_999001))

# Drug C fractions
count_drug_C = np.where(X_data_pharm[:,2] == 1)

count_drug_C_bin_417380 = np.where(X_data_pharm[count_drug_C[0],3] == 1)
frac_drug_C_bin_417380 = 100*len(count_drug_C_bin_417380[0])/len(count_drug_C[0])
print("The fraction of drug C claims that fall under bin_417380 are %s" % str(frac_drug_C_bin_417380))

count_drug_C_bin_417614 = np.where(X_data_pharm[count_drug_C[0],4] == 1)
frac_drug_C_bin_417614 = 100*len(count_drug_C_bin_417614[0])/len(count_drug_C[0])
print("The fraction of drug C claims that fall under bin_417614 are %s" % str(frac_drug_C_bin_417614))

count_drug_C_bin_417740 = np.where(X_data_pharm[count_drug_C[0],5] == 1)
frac_drug_C_bin_417740 = 100*len(count_drug_C_bin_417740[0])/len(count_drug_C[0])
print("The fraction of drug C claims that fall under bin_417740 are %s" % str(frac_drug_C_bin_417740))

count_drug_C_bin_999001 = np.where(X_data_pharm[count_drug_C[0],6] == 1)
frac_drug_C_bin_999001 = 100*len(count_drug_C_bin_999001[0])/len(count_drug_C[0])
print("The fraction of drug C claims that fall under bin_999001 are %s" % str(frac_drug_C_bin_999001))

The fraction of drug A claims that fall under bin_417380 are 13.026382229497868
The fraction of drug A claims that fall under bin_417614 are 23.038556831247064
The fraction of drug A claims that fall under bin_417740 are 16.012177546030152
The fraction of drug A claims that fall under bin_999001 are 47.922883393224915
The fraction of drug B claims that fall under bin_417380 are 12.945587162654997
The fraction of drug B claims that fall under bin_417614 are 22.957257476294675
The fraction of drug B claims that fall under bin_417740 are 16.056892778993436
The fraction of drug B claims that fall under bin_999001 are 48.04026258205689
The fraction of drug C claims that fall under bin_417380 are 12.972383373253429
The fraction of drug C claims that fall under bin_417614 are 23.008008470927432
The fraction of drug C claims that fall under bin_417740 are 16.003865498512166
The fraction of drug C claims that fall under bin_999001 are 48.01574265730697


In [17]:
# Continuation of exploratory data analysis - Computing the number of claims approved for each of the above twelve categories
# For drug A under different bins
count_drug_A_bin_417380_acc = np.where(y_data_pharm[count_drug_A_bin_417380[0]] == 1)
frac_drug_A_bin_417380_acc = 100*len(count_drug_A_bin_417380_acc[0])/len(count_drug_A_bin_417380[0])
print('The accepted fraction of drug %s claims that fall under %s are %s' %('A','bin_417380',str(frac_drug_A_bin_417380_acc)))

count_drug_A_bin_417614_acc = np.where(y_data_pharm[count_drug_A_bin_417614[0]] == 1)
frac_drug_A_bin_417614_acc = 100*len(count_drug_A_bin_417614_acc[0])/len(count_drug_A_bin_417614[0])
print('The accepted fraction of drug %s claims that fall under %s are %s' %('A','bin_417614',str(frac_drug_A_bin_417614_acc)))

count_drug_A_bin_417740_acc = np.where(y_data_pharm[count_drug_A_bin_417740[0]] == 1)
frac_drug_A_bin_417740_acc = 100*len(count_drug_A_bin_417740_acc[0])/len(count_drug_A_bin_417740[0])
print('The accepted fraction of drug %s claims that fall under %s are %s' %('A','bin_417740',str(frac_drug_A_bin_417740_acc)))

count_drug_A_bin_999001_acc = np.where(y_data_pharm[count_drug_A_bin_999001[0]] == 1)
frac_drug_A_bin_999001_acc = 100*len(count_drug_A_bin_999001_acc[0])/len(count_drug_A_bin_999001[0])
print('The accepted fraction of drug %s claims that fall under %s are %s' %('A','bin_999001',str(frac_drug_A_bin_999001_acc)))

# For drug B under different bins
count_drug_B_bin_417380_acc = np.where(y_data_pharm[count_drug_B_bin_417380[0]] == 1)
frac_drug_B_bin_417380_acc = 100*len(count_drug_B_bin_417380_acc[0])/len(count_drug_B_bin_417380[0])
print('The accepted fraction of drug %s claims that fall under %s are %s' %('B','bin_417380',str(frac_drug_B_bin_417380_acc)))

count_drug_B_bin_417614_acc = np.where(y_data_pharm[count_drug_B_bin_417614[0]] == 1)
frac_drug_B_bin_417614_acc = 100*len(count_drug_B_bin_417614_acc[0])/len(count_drug_B_bin_417614[0])
print('The accepted fraction of drug %s claims that fall under %s are %s' %('B','bin_417614',str(frac_drug_B_bin_417614_acc)))

count_drug_B_bin_417740_acc = np.where(y_data_pharm[count_drug_B_bin_417740[0]] == 1)
frac_drug_B_bin_417740_acc = 100*len(count_drug_B_bin_417740_acc[0])/len(count_drug_B_bin_417740[0])
print('The accepted fraction of drug %s claims that fall under %s are %s' %('A','bin_417740',str(frac_drug_B_bin_417740_acc)))

count_drug_B_bin_999001_acc = np.where(y_data_pharm[count_drug_B_bin_999001[0]] == 1)
frac_drug_B_bin_999001_acc = 100*len(count_drug_B_bin_999001_acc[0])/len(count_drug_B_bin_999001[0])
print('The accepted fraction of drug %s claims that fall under %s are %s' %('B','bin_999001',str(frac_drug_B_bin_999001_acc)))

# For drug c under different bins
count_drug_C_bin_417380_acc = np.where(y_data_pharm[count_drug_C_bin_417380[0]] == 1)
frac_drug_C_bin_417380_acc = 100*len(count_drug_C_bin_417380_acc[0])/len(count_drug_C_bin_417380[0])
print('The accepted fraction of drug %s claims that fall under %s are %s' %('C','bin_417380',str(frac_drug_C_bin_417380_acc)))

count_drug_C_bin_417614_acc = np.where(y_data_pharm[count_drug_C_bin_417614[0]] == 1)
frac_drug_C_bin_417614_acc = 100*len(count_drug_C_bin_417614_acc[0])/len(count_drug_C_bin_417614[0])
print('The accepted fraction of drug %s claims that fall under %s are %s' %('C','bin_417614',str(frac_drug_C_bin_417614_acc)))

count_drug_C_bin_417740_acc = np.where(y_data_pharm[count_drug_C_bin_417740[0]] == 1)
frac_drug_C_bin_417740_acc = 100*len(count_drug_C_bin_417740_acc[0])/len(count_drug_C_bin_417740[0])
print('The accepted fraction of drug %s claims that fall under %s are %s' %('C','bin_417740',str(frac_drug_C_bin_417740_acc)))

count_drug_C_bin_999001_acc = np.where(y_data_pharm[count_drug_C_bin_999001[0]] == 1)
frac_drug_C_bin_999001_acc = 100*len(count_drug_C_bin_999001_acc[0])/len(count_drug_C_bin_999001[0])
print('The accepted fraction of drug %s claims that fall under %s are %s' %('C','bin_999001',str(frac_drug_C_bin_999001_acc)))

The accepted fraction of drug A claims that fall under bin_417380 are 58.30865899690347
The accepted fraction of drug A claims that fall under bin_417614 are 58.56022799159089
The accepted fraction of drug A claims that fall under bin_417740 are 58.39033539276258
The accepted fraction of drug A claims that fall under bin_999001 are 58.554612142585064
The accepted fraction of drug B claims that fall under bin_417380 are 58.520204638164564
The accepted fraction of drug B claims that fall under bin_417614 are 58.68642452278677
The accepted fraction of drug A claims that fall under bin_417740 are 58.76805669119651
The accepted fraction of drug B claims that fall under bin_999001 are 58.409551919736664
The accepted fraction of drug C claims that fall under bin_417380 are 58.56812705905492
The accepted fraction of drug C claims that fall under bin_417614 are 58.22705849736623
The accepted fraction of drug C claims that fall under bin_417740 are 58.55437533629606
The accepted fraction of drug

In [19]:
## This is the code chunk to perform cross validation

## Parameters to modify from each cross validation run to the next
## CV splits will be kept constant at 5 for 
## To change the code for inclusion of test confusion matrix: 1. Include the new columns in data frame definition
test_train_ratio = 0.33
rand_state = int(100*random.random())
print("The random state is",rand_state)

## According to sklearn documentation (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)
## The output X and y variables must be lists; No random seed
X_data_pharm_train, X_data_pharm_test, y_data_pharm_train, y_data_pharm_test = train_test_split(X_data_pharm, y_data_pharm, test_size=test_train_ratio,random_state=rand_state)

# Initiate the kfold object to split training data set (X_data_pharm_train, y_data_pharm_train) 
kfold = KFold(n_splits = 5, shuffle = True) # Pay attention to the random state and the number of splits

# Convert training and testing data into numpy arrays - Note that the same variable is being used to store the data after conversion
# to a numpy array
X_data_pharm_train = np.array(X_data_pharm_train)
y_data_pharm_train = np.array(y_data_pharm_train)
X_data_pharm_test = np.array(X_data_pharm_test)
y_data_pharm_test = np.array(y_data_pharm_test)

# Variable to keep track of the test data set in cross-validation
i = 0

# Initiate the classifier
svclassifier_pharm = SVC(kernel='rbf', verbose=1, probability=True)

The random state is 53


In [26]:
# Initiate data frame to store the confusion matrix
df_confusion_final = pd.DataFrame(columns = ['cv_cf_1','cv_cf_2','cv_cf_3','cv_cf_4','test_cf_1','test_cf_2','test_cf_3','test_cf_4','test_train_ratio','cv_split','random_state','iteration','time_for_execution'])

# Run a for loop to run cross validation and get the confusion matrix
for train_index, test_index in kfold.split(X_data_pharm_train,y_data_pharm_train):
    print(i)
    
    # Assign train and test data sets
    X_train_cv, X_test_cv = X_data_pharm_train[train_index], X_data_pharm_train[test_index]
    y_train_cv, y_test_cv = y_data_pharm_train[train_index], y_data_pharm_train[test_index]
    
    # Cloning the classifier to avoid any conflicts
    clone_svclassifier_pharm = clone(svclassifier_pharm)
    
    start_time = time.time()
    
    # Classifier training using CV training data set
    clone_svclassifier_pharm.fit(X_train_cv, y_train_cv)
    print("Training is complete, Computing test data")
    
    # Predict y_test_cv and y_test and measure the accuracy
    y_test_cv_pred = clone_svclassifier_pharm.predict(X_test_cv)
    y_test_pred = clone_svclassifier_pharm.predict(X_data_pharm_test)
    
    ## Computational time estimation because it is one of the parameters which determines robustness of the model
    end_time = time.time()
    time_for_execution = end_time - start_time
    print("Time taken to execute this model is", time_for_execution, "seconds")
    
    ## Saving the model to file
    filename = 'cross_validation_models/SVM_%s_%s_%s.sav' % (str(test_train_ratio), str(rand_state), str(i+1)) #  SVM_Test ratio_CV Training data splits
    pickle.dump(clone_svclassifier_pharm, open(filename, 'wb'))

    # Saving the cv confusion matrix to a csv file
    a_cv = confusion_matrix(y_test_cv,y_test_cv_pred)
    print(a_cv)
    a_test = confusion_matrix(y_data_pharm_test,y_test_pred)
    print(a_test)
    df_addition = {'cv_cf_1':a_cv[0][0], 'cv_cf_2':a_cv[0][1], 'cv_cf_3':a_cv[1][0], 'cv_cf_4':a_cv[1][1],'test_cf_1':a_test[0][0], 'test_cf_2':a_test[0][1], 'test_cf_3':a_test[1][0], 'test_cf_4':a_test[1][1],'test_train_ratio':test_train_ratio,'cv_split':5,'random_state':rand_state,'iteration':i+1,'time_for_execution':time_for_execution}
    df_confusion_final = df_confusion_final.append(df_addition,ignore_index=True)
    
    i = i + 1
    
print("The model was successfully cross-validated")

0
[LibSVM]Training is complete, Computing test data
Time taken to execute this model is 1281.3688535690308 seconds
[[18900  3494]
 [    0 31029]]
[[375619  69125]
 [     0 623717]]
1
[LibSVM]Training is complete, Computing test data
Time taken to execute this model is 1245.177627325058 seconds
[[18746  3458]
 [    0 31219]]
[[375619  69125]
 [     0 623717]]
2
[LibSVM]Training is complete, Computing test data
Time taken to execute this model is 1292.5695354938507 seconds
[[18941  3358]
 [    0 31124]]
[[375619  69125]
 [     0 623717]]
3
[LibSVM]Training is complete, Computing test data
Time taken to execute this model is 1275.5492026805878 seconds
[[18742  3444]
 [    0 31237]]
[[375619  69125]
 [     0 623717]]
4
[LibSVM]Training is complete, Computing test data
Time taken to execute this model is 1625.8389551639557 seconds
[[18609  3515]
 [    0 31299]]
[[375619  69125]
 [     0 623717]]
The model was successfully cross-validated


In [27]:
## Write the datarframe df_c
print(df_confusion_final)
# df_confusion_final.to_excel('cv_test_stats.xlsx',mode='a',header='false')
df_confusion_final.to_csv('cv_test_stats.csv',mode='a',header='false')

   cv_cf_1  cv_cf_2  cv_cf_3  cv_cf_4  test_cf_1  test_cf_2  test_cf_3  \
0  18900.0   3494.0      0.0  31029.0   375619.0    69125.0        0.0   
1  18746.0   3458.0      0.0  31219.0   375619.0    69125.0        0.0   
2  18941.0   3358.0      0.0  31124.0   375619.0    69125.0        0.0   
3  18742.0   3444.0      0.0  31237.0   375619.0    69125.0        0.0   
4  18609.0   3515.0      0.0  31299.0   375619.0    69125.0        0.0   

   test_cf_4  test_train_ratio  cv_split  random_state  iteration  \
0   623717.0               0.8       5.0          67.0        1.0   
1   623717.0               0.8       5.0          67.0        2.0   
2   623717.0               0.8       5.0          67.0        3.0   
3   623717.0               0.8       5.0          67.0        4.0   
4   623717.0               0.8       5.0          67.0        5.0   

   time_for_execution  
0         1281.368854  
1         1245.177627  
2         1292.569535  
3         1275.549203  
4         1625.83895

In [None]:
## Predicting accuracy for the test set after cross-validation
## 1. Load the model 2. Predict y_test 3. Get confusion matrix 4. Store results in an excel sheet
## Questions: 1. Does the model store values such as test_train_ratio?
file_to_open = 'cross_validation_models/SVM_0.9_0_5.sav'
loaded_model = pickle.load(open(file_to_open, 'rb'))
print(loaded_model)

y_test_pred = loaded_model.predict(X_test)
df_confusion_final_test = pd.DataFrame(columns = ['cf_1','cf_2','cf_3','cf_4','test_train_ratio','cv_split','random_state','iteration'])

a_test = confusion_matrix(y_test_cv,y_test_cv_pred)
print(a_test)

df_addition = {'cf_1':a[0][0], 'cf_2':a[0][1], 'cf_3':a[1][0], 'cf_4':a[1][1],'test_train_ratio':0.9,'cv_split':5,'random_state':0,'iteration':4}
df_confusion_final_test = df_confusion_final_test.append(df_addition,ignore_index=True)

# df_confusion_final.to_excel('BN_test_stats.xlsx',mode='a',header='false')

In [None]:
y_pred_pharm = svclassifier_pharm.predict(X_data_pharm_test)
print(confusion_matrix(y_pred_pharm, y_data_pharm_test))
print(classification_report(y_pred_pharm, y_data_pharm_test))