In [15]:
#all the imports used in the program

import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import pyodbc
import sklearn.metrics as skm
from keras.layers import Embedding, Flatten, Dense, Bidirectional, LSTM
from keras.layers import GlobalMaxPool1D, Conv1D, Dropout, GRU, Flatten, MaxPooling1D
from keras.metrics import categorical_accuracy
from keras.models import Sequential
from keras.models import model_from_json
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [16]:
#Grab data from a database

def getData(Server, UID, PWD, Database, query):
    """Run ``query`` against a SQL Server database and return the rows as a DataFrame.

    Parameters
    ----------
    Server : str
        Host name of the SQL Server instance.
    UID : str
        Login name used for the connection.
    PWD : str
        Password for ``UID``.
    Database : str
        Name of the database to connect to.
    query : str
        SQL text handed to ``pandas.read_sql``.

    Returns
    -------
    pandas.DataFrame
        The result set of ``query``.
    """
    #create a SQL connection based on the given server and database
    sql_conn = pyodbc.connect('DRIVER={SQL Server};'
                              'SERVER='+Server+';'
                              'UID='+UID+';'
                              'PWD='+PWD+';'
                              'DATABASE='+Database+';' )

    #close the connection explicitly once the data is read (or the read fails);
    #otherwise every call leaves a connection open until garbage collection
    try:
        #return the data from the given Query and SQL connection
        return pd.read_sql(query, sql_conn)
    finally:
        sql_conn.close()

In [43]:
#establish my server and corresponding database to pull data from
#Connection settings are read from environment variables so that no credential
#is stored in (or committed with) the notebook. Server, database and login name
#fall back to the demo defaults; the password has no fallback on purpose.
server = os.environ.get('MCLINICAL_DB_SERVER', 'GSDEMO2HOST')
database = os.environ.get('MCLINICAL_DB_NAME', 'MClinical')
UID = os.environ.get('MCLINICAL_DB_UID', 'gsanalytics')
PWD = os.environ.get('MCLINICAL_DB_PWD')
if not PWD:
    raise RuntimeError('Set the MCLINICAL_DB_PWD environment variable before running this cell.')


#Stores the result in a pandas DataFrame object called data
query ="SELECT ISNULL(ProcedureStep.subSpecialtyCd,'UNKNOWN') AS subSpecialtyCd, FillerOrder.fillerOrderStatusCd, PL.locationName, BUCKETS1.BUCKETNUM FROM BUCKETS1 LEFT JOIN ProcedureStep ON BUCKETS1.PSKEY = ProcedureSteP.procedureStepKey LEFT JOIN RequestedProcedure ON ProcedureStep.requestedProcedureKey = RequestedProcedure.requestedProcedureKey LEFT JOIN FillerOrder ON RequestedProcedure.fillerOrderKey = FillerOrder.fillerOrderKey LEFT JOIN LOCATION AS PL ON FillerOrder.scheduledLocationKey = PL.locationKey"
original = getData(server, UID, PWD, database,query)
#keep `original` untouched; all later cells work on the copy
data = original.copy()


In [44]:
#concatenate the selected params
def concatParam(df):
    """Join the three descriptive columns of ``df`` into one space-separated Series.

    The columns are taken in the order ``subSpecialtyCd``,
    ``fillerOrderStatusCd``, ``locationName``.
    """
    column_names = ('subSpecialtyCd', 'fillerOrderStatusCd', 'locationName')

    #start from the first column and append the remaining ones, left to right
    joined = df[column_names[0]]
    for column_name in column_names[1:]:
        joined = joined + ' ' + df[column_name]
    return joined

In [45]:
def clean(df):
    """Add a lower-cased ``concat`` column to ``df`` and return that column.

    ``df`` is modified in place: its ``concat`` column is created (or
    overwritten) with the space-joined parameters produced by ``concatParam``.
    """
    #all the params joined by spaces
    combined = concatParam(df)

    #lower-case everything so later matching is not case sensitive
    df['concat'] = combined.str.lower()
    return df['concat']

In [46]:
#build the lower-cased text column, then show the mean number of space-separated words per row
data['concat'] = clean(data)
data['concat'].apply(lambda text: len(text.split(" "))).mean()

6.0

In [47]:
def convert_text(df, maxlen, max_words, tokenizer=None):
    """Turn the text and bucket columns of ``df`` into arrays a model can consume.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``concat`` column (text) and a ``BUCKETNUM`` column
        (bucket number, one of 1-5).
    maxlen : int
        Length every token sequence is padded / truncated to.
    max_words : int
        ``num_words`` of the ``Tokenizer`` created when ``tokenizer`` is None.
    tokenizer : keras Tokenizer, optional
        An already fitted tokenizer to reuse. When evaluating a previously
        trained model, pass the tokenizer that was fitted on the training
        texts: a tokenizer fitted here numbers the words according to *these*
        texts, which need not match the ids the model was trained with.
        When omitted, a new tokenizer is fitted on ``df``'s texts.

    Returns
    -------
    x : numpy.ndarray
        Padded integer sequences, one row per row of ``df``.
    labels : numpy.ndarray
        Zero-based class index for each row (bucket 1 -> 0, ..., bucket 5 -> 4).

    Raises
    ------
    ValueError
        If ``BUCKETNUM`` contains a value that is not one of the known buckets.
    """
    #texts being the concat, bucket_values being the corresponding bucket
    #(columns are looked up by name rather than by attribute access)
    texts = df['concat'].values
    bucket_values = df['BUCKETNUM'].values

    #the known buckets, sorted; a bucket's position in this array is its class index
    label_dict = np.asarray([1, 2, 3, 4, 5])

    #searchsorted returns an insertion point even for values that are absent,
    #so reject unknown buckets instead of silently assigning a wrong class
    unknown = ~np.isin(bucket_values, label_dict)
    if unknown.any():
        raise ValueError('BUCKETNUM contains values outside %s: %s'
                         % (label_dict.tolist(),
                            sorted(set(bucket_values[unknown].tolist()), key=str)))

    #map every bucket to its index in one vectorised call
    labels = np.searchsorted(label_dict, bucket_values)

    #create a tokenizer based on max_words and fit it to our texts,
    #unless the caller supplied one that is already fitted
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=max_words)
        tokenizer.fit_on_texts(texts)

    #change our texts to vectors of integers
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    #pad sequences ensures that all our vectors are of the same length
    x = pad_sequences(sequences, maxlen=maxlen)

    print('Shape of data tensor:', x.shape)
    print('Shape of label tensor:', labels.shape)

    return x, labels

In [48]:
#maxlen: number of tokens kept per row after padding / truncation
#max_words: total number of unique words the tokenizer keeps
maxlen, max_words = 8, 20

#vectorise the dataframe so it can be run through our model
x_test, y_test = convert_text(data, maxlen, max_words)

print(x_test.shape, y_test.shape, sep='\n')

Found 15 unique tokens.
Shape of data tensor: (10, 8)
Shape of label tensor: (10,)
(10, 8)
(10,)


In [49]:
# read the saved architecture; the context manager closes the file even if reading fails
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()

# rebuild the network from its JSON description (model_from_json comes from keras.models)
model = model_from_json(loaded_model_json)
# load weights into new model
model.load_weights("model.h5")
print("Loaded model from disk")

# compile the restored model before using it
model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])

Loaded model from disk


In [50]:
#test the model on our set aside testing data
def test_model(model, x_test):
    #gather the models prediction 
    
    #the model displays its prediction as a percent for every bucket at how confident the model is for 
    #each bucket. 
    
    #the highest percent in our case the the bucket the model has choosen
    preds = model.predict(x_test)
    
    y_pred = []
    
    #for ever row in the prediction list
    #grab the max value and append that index to the y_pred
    for row in preds:
        y_pred.append(np.argmax(row))
    
    #convert the list to a numpy array
    return np.asarray(y_pred)

In [51]:
#run the loaded model over x_test; y_pred holds one predicted class index per row
#(an index into the bucket list 1-5, not the bucket number itself)
y_pred = test_model(model, x_test)

In [52]:
#create the confusion matrix
def test_confusion_matrix(y_pred,y_test):
    """Print the confusion matrix of the predictions ``y_pred`` against the true labels ``y_test``."""
    #confusion_matrix takes the true labels first, then the predictions
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

In [53]:
#create the classification report
def test_classification_report(y_pred,y_test):
    """Print the classification report of the predictions ``y_pred`` against the true labels ``y_test``."""
    #classification_report takes the true labels first, then the predictions
    report = skm.classification_report(y_test, y_pred)
    print(report)

In [54]:
#print the confusion matrix: rows are the true class indices (y_test), columns the predicted ones (y_pred)
test_confusion_matrix(y_pred,y_test)

[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 1 0 0 3]
 [6 0 0 0 0]
 [0 0 0 0 0]]


In [55]:
#display the classification report on the predictions
#(the class labels in the report are the 0-based class indices, not the bucket numbers)
test_classification_report(y_pred, y_test)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       0.0
           2       0.00      0.00      0.00       4.0
           3       0.00      0.00      0.00       6.0
           4       0.00      0.00      0.00       0.0

    accuracy                           0.00      10.0
   macro avg       0.00      0.00      0.00      10.0
weighted avg       0.00      0.00      0.00      10.0



In [21]:
#drop first n rows
def drop_first_n_rows(df, n):
    """Return ``df`` without its first ``n`` rows (selected by position)."""
    remaining = slice(n, None)
    return df.iloc[remaining]

In [22]:
#drop the first N rows in data
#where N = number of rows used as training data
#NOTE(review): x_test / y_test (and therefore y_pred) are built from every row of `data`
#earlier in this notebook, so after this slice the later `data['pred'] = ...` assignment
#would no longer line up row-for-row -- confirm whether this cell should still run here
data = drop_first_n_rows(data, 8000)

In [35]:
#convert the prediction number to bucket number
def convert_to_bucket(Y):
    """Convert zero-based class indices to bucket numbers by adding 1.

    The class index is the position in the bucket array 1,2,3,4,5, so the
    bucket number is simply ``index + 1``.

    The input is left untouched and a new object is returned. Modifying ``Y``
    in place would change the caller's array as well, and running the calling
    cell a second time would shift every value again.

    Parameters
    ----------
    Y : numpy.ndarray or list
        Class indices.

    Returns
    -------
    Same kind of container as ``Y`` with every value increased by 1.
    """
    #plain lists have no element-wise "+", so build the new list explicitly
    if isinstance(Y, list):
        return [value + 1 for value in Y]

    #arrays: element-wise addition creates a new array
    return Y + 1

In [36]:
#turn both the predicted and the true class indices back into bucket numbers
pred, test = convert_to_bucket(y_pred), convert_to_bucket(y_test)

In [37]:
#wrap the converted predictions (pred) in a single-column pandas DataFrame
pdpred = pd.DataFrame(pred)

In [38]:
#add the predictions to the working dataframe as a new 'pred' column
#(.values is a plain array, so rows are matched by position, not by index label)

data['pred'] = pdpred.values

In [39]:
#the three source columns are no longer needed once 'concat' exists, so remove them

data = data.drop(columns=['subSpecialtyCd', 'fillerOrderStatusCd', 'locationName'])

In [40]:
#put the columns in display order: text, true bucket, predicted bucket
data = data.reindex(columns=['concat', 'BUCKETNUM', 'pred'])

In [41]:
#display up to the first 60 rows of the data, with true and predicted bucket side by side
data.head(60)

Unnamed: 0,concat,BUCKETNUM,pred
0,*gp f holy family hospital - incyte,3,2
1,*gyp f incyte diagnostics walla walla,4,1
2,*gyp f incyte diagnostics walla walla,4,1
3,*gp f kadlec medical center,3,5
4,*op f incyte diagnostics spokane,3,5
5,*gyp f incyte diagnostics walla walla,4,1
6,*gip f holy family hospital - incyte,3,5
7,*gyp f incyte diagnostics walla walla,4,1
8,*gyp f incyte diagnostics walla walla,4,1
9,*gyp f incyte diagnostics walla walla,4,1
