In [None]:
#**************************************README FIRST***************************************
#This is a sample code file, which should be fully functional, for the PICC ML project at 
#Cincinnati Children's Hospital Medical Center.  All correspondence should go to
#Manan Shah (mas476@gmail.com).   
#
#BACKGROUND
#In critically ill infants, the position of a peripherally inserted central catheter (PICC) 
#must be confirmed frequently, as the tip may move from its original position and 
#run the risk of hyperosmolar vascular damage or extravasation into surrounding spaces. 
#
#Automated detection of PICC tip position holds great promise for alerting bedside clinicians 
#to non-central PICCs.  
#
#Objectives  
#This research seeks to use natural language processing (NLP) and supervised machine learning (ML) 
#techniques to predict PICC tip position based primarily on text analysis of radiograph reports 
#from infants with an upper extremity PICC. 
#
#To use this file: make sure you have Python 3.7 and scikit learn, spacy, pandas all properly
#installed.   Then you need a "JSON" file of radiology reports with appropriate section (eg PICC line)
#(X AXIS) with Y Axis being the label.  You can either do it binary (e.g central vs non central)
#or run it as a 12-class classification model - you only need to change your labels.
#It would be trivial to modify it to take other types of files, databases, etc.  We just chose
#JSON as an easily available and manipulable format.
#
#You can set your classifier and your hyperparameters in the grid, and the rest of the work
#should be automatic.  We have created a function for pre-processing where you can do additional
#preprocessing, though we did not do much pre-processing for our work.  Depending on the nature of 
#your inputs, more pre-processing might be warranted (e.g you may want to remove stop words).  Note
#that there are lots of publicly available functions for this which may be useful.
#
#Finally, once your model is saved, you can simply load it and use the 'predict' function to 
#test it out.  Note that some pretrained models are also available at https://www.picclocation.com
#***********************************************************************************************


#IMPORT OUR LIBRARIES
import pandas as pd
import json
import string
import spacy
import re
import warnings
import gensim
import numpy as np


#MAKE SURE WE HAVE ALL THE SKLEARN ML algorithms imported
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.ensemble import AdaBoostRegressor
from sklearn.pipeline import Pipeline
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from gensim.models import word2vec
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


#ignore warnings in result - comment out when trying new stuff!!
#NOTE: 'ignore' with no category or module filter silences every warning raised in this
#kernel (including those emitted by scikit-learn while fitting), so re-enable warnings
#whenever a new classifier or parameter grid is being debugged.
warnings.filterwarnings('ignore')
#IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

# Custom text-cleaning transformer (delegates to clean_text; no spaCy processing happens here)
class predictors(TransformerMixin):
    """Stateless transformer step that normalizes every document with ``clean_text``.

    Nothing is learned from the data, so ``fit`` is a no-op and ``get_params``
    exposes no tunable parameters.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; this transformer has no state to learn."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(doc)`` for each document in ``X``."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Return an empty dict: there are no hyperparameters to report."""
        return dict()

# Basic function to clean the text
def clean_text(text):
    """Normalize one report string.

    Leading and trailing whitespace is removed first, then the remaining
    text is converted to lowercase.  Interior whitespace is left untouched.
    """
    trimmed = text.strip()
    return trimmed.lower()

parser = English()


#This is a tokenizer that you can use if you wanted.  This is for demonstration purposes only
#This is *NOT* used in our final ML model, though we can add it if we need to
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with the module-level ``parser`` and normalize each token.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        One entry per spaCy token: the lowercased, stripped lemma when a real lemma is
        available, otherwise the token's own lowercase form.
    """
    normalized = []
    for word in parser(sentence):
        lemma = word.lemma_
        # Fall back to the surface form when there is no usable lemma:
        #  * "-PRON-" is the placeholder spaCy 2.x assigns to pronouns;
        #  * an empty string is what spaCy 3.x reports when the pipeline has no
        #    lemmatizer component (as with a bare English()).  Without this check
        #    every token would collapse to "" on spaCy 3.x.
        if not lemma or lemma == "-PRON-":
            normalized.append(word.lower_)
        else:
            normalized.append(lemma.lower().strip())

    # Removing stop words (left disabled; `stop_words` and `punctuations` are not defined
    # in the visible notebook and would have to be created before enabling this line)
    #normalized = [ word for word in normalized if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return normalized




#There are two ways to approach this problem.  Currently, we have a file called 
#'train' and a file called 'test'.  The train file contains the ~70% of our records
# that are for training, while the 'test' file contains the 30% of our records that are for 
#'testing'. So we will train on the training data, figure out what works best, and then test
#on the final testing data
#Read the training records into a DataFrame
df = pd.read_json('train.json')

#Names of the column holding the report text (model input) and the column holding the label
X_AXIS, Y_AXIS = 'report', 'final_location'

#Features we want to analyze, and the labels (answers) we want to test against
X_train, y_train = df[X_AXIS], df[Y_AXIS]



#If you want to just have one file and do an automated random split, you can also do it 
#this way
#X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3, shuffle=True)

#*********************************************************************************
#            CHANGE THIS EVERY TIME YOU WANT TO TEST A NEW MODEL
#**********************************************************************************

#Classifier = SVC() with kernel = 'linear' would also work and likely give similar results
classifier = LinearSVC()

#Hyperparameter candidates explored by the grid search.  Every key is written as
#'<pipeline step name>__<parameter name>', so these all address the 'classifier' step.
param_grid = dict(
    classifier__C=[0.1, 0.2, 1, 10, 100],
    classifier__multi_class=['crammer_singer', 'ovr'],
    classifier__loss=['squared_hinge', 'hinge'],
)

#To build a neural network instead, assign the neural network classifier (e.g. MLPClassifier)
#to `classifier` above and replace the grid with its parameters, for example:
#Activation: tanh, relu
#Solver: sgd, adam
#Alpha: 0.0001, 0.05, 0.01
#Learning Rate: constant, adaptive
#Hidden Layer Sizes: (50,50,50), 100,50,50 , 50,100,50 etc etc

#N-gram range as (min_n, max_n): (1, 5) extracts everything from single words up to 5-grams.
#Narrow or widen it (bigrams only, up to trigrams, ...) depending on the model and input data.
bow_vector = CountVectorizer(ngram_range=(1, 5))
#tfidf_vector = TfidfVectorizer
#*********************************************************************************
#            END SECTION TO CHANGE - THE REST OF CODE SHOULD BE RELATIVELY SIMILAR IN BETWEEN MODELS
#**********************************************************************************
             
              
        

#Now this is a pipeline that we built - you can add more functions
#For example, if you want to pre-process your data, such as if you wanted to remove
#stop words, you can add a function here that will do all of these things.
#It would be very easy to implement.
#For our purposes, we decided to do very minimal pre-processing (e.g not remove stop words)
pipe = Pipeline(steps=[
    ('vectorizer', bow_vector),
    ('classifier', classifier),
])


#10-fold cross-validation; set return_train_score to False if you don't want to see the
#training-set scores in the results table
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=3,
)

#Run the grid search on the training data
grid.fit(X_train, y_train)

#Best hyperparameter combination found by the search
print(grid.best_params_)

#The pipeline refit with that best combination
print(grid.best_estimator_)

#Full cross-validation results, one row per parameter combination
df_r = pd.DataFrame(grid.cv_results_)
df_r

In [None]:
#OK now we have the best parameters.  `grid` was created with refit=True, so it already holds
#the best model and can be used directly to predict on the held-out test file.
df = pd.read_json('final_data_test.json')
#Column names of the x axis and labels
X_AXIS = 'report'
Y_AXIS = 'final_location'
X_test = df[X_AXIS] # the features we want to analyze
y_test = df[Y_AXIS] # the labels, or answers, we want to test against

grid_predictions = grid.predict(X_test)

# print classification report (true labels first, predictions second)
print(metrics.classification_report(y_test, grid_predictions))
print(grid)

#Print all the answers the model got wrong.
#The loop variable is deliberately NOT named `input`: in a notebook that would rebind the
#builtin input() for the rest of the kernel session.
for report_text, prediction, label in zip(X_test, grid_predictions, y_test):
    if prediction != label:
        print(report_text, 'has been classified as ', prediction, 'and should be ', label, '\n')

In [None]:
from joblib import dump
#Now you can save your model, or train the model on the WHOLE DATASET for production use.
#dump(value, filename) serializes the fitted grid-search object to the given path; change
#"NAME.joblib" to the file name you want.  The saved model can later be restored with
#joblib.load("NAME.joblib") and used through its predict() method.
#NOTE: joblib files are pickle-based - only load model files that come from a trusted source.
dump(grid, "NAME.joblib")