## Recurrent Neuronetwork

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

In [2]:
df = pd.read_csv('./triagedata.csv')

In [4]:
# Create input and output vectors.
# Process the labels.
X = df.description
Y = df.medical_specialty
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)

In [8]:
#split data into train and test sets
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15)

In [9]:
# data process
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [10]:
# define model
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [11]:
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 50)           50000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)  

In [24]:
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Train on 422 samples, validate on 106 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x13637a1d0>

In [25]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [26]:
accr = model.evaluate(test_sequences_matrix,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: -3.162
  Accuracy: 0.553


## Gradient Boosting Trees

In [62]:
df_x = df.description
df_y = df.medical_specialty

In [69]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from tqdm import tqdm

import multiprocessing
import numpy as np

class Doc2VecTransformer(BaseEstimator):

    def __init__(self, vector_size=100, learning_rate=0.02, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() - 1

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(row.split(), [index]) for index, row in enumerate(df_x)]
        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers)

        for epoch in range(self.epochs):
            model.train(skl_utils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha

        self._model = model
        return self

    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(row.split())
                                     for index, row in enumerate(df_x)]))


In [70]:
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.model_selection import cross_val_score

pl_xgb = Pipeline(steps=[('doc2vec',Doc2VecTransformer()),
                         ('xgboost', xgb.XGBClassifier(objective='multi:softmax'))])
scores = cross_val_score(pl_xgb, df_x, df_y, cv=5)
print('Accuracy for XGBoost Classifier : ', scores.mean())

100%|██████████| 496/496 [00:00<00:00, 793733.23it/s]
100%|██████████| 496/496 [00:00<00:00, 1445708.68it/s]
100%|██████████| 496/496 [00:00<00:00, 1550204.76it/s]
100%|██████████| 496/496 [00:00<00:00, 1153201.10it/s]
100%|██████████| 496/496 [00:00<00:00, 1161571.63it/s]
100%|██████████| 496/496 [00:00<00:00, 2033601.94it/s]
100%|██████████| 496/496 [00:00<00:00, 1917396.11it/s]
100%|██████████| 496/496 [00:00<00:00, 1053354.32it/s]
100%|██████████| 496/496 [00:00<00:00, 1320034.76it/s]
100%|██████████| 496/496 [00:00<00:00, 1503160.97it/s]
100%|██████████| 496/496 [00:00<00:00, 1082964.49it/s]
100%|██████████| 496/496 [00:00<00:00, 1075685.00it/s]
100%|██████████| 496/496 [00:00<00:00, 970776.85it/s]
100%|██████████| 496/496 [00:00<00:00, 1255506.81it/s]
100%|██████████| 496/496 [00:00<00:00, 1886105.88it/s]
100%|██████████| 496/496 [00:00<00:00, 1053354.32it/s]
100%|██████████| 496/496 [00:00<00:00, 1966327.77it/s]
100%|██████████| 496/496 [00:00<00:00, 1156406.22it/s]
100%|███████

Accuracy for XGBoost Classifier :  0.5481210895058259


## Random Forest

In [72]:
from sklearn.ensemble import RandomForestClassifier

pl_xgb = Pipeline(steps=[('doc2vec',Doc2VecTransformer()),
                         ('random_forest', RandomForestClassifier())])
scores = cross_val_score(pl_xgb, df_x, df_y, cv=5)
print('Accuracy for XGBoost Classifier : ', scores.mean())

100%|██████████| 496/496 [00:00<00:00, 621564.02it/s]
100%|██████████| 496/496 [00:00<00:00, 1341311.92it/s]
100%|██████████| 496/496 [00:00<00:00, 1778098.11it/s]
100%|██████████| 496/496 [00:00<00:00, 1246479.80it/s]
100%|██████████| 496/496 [00:00<00:00, 2000360.37it/s]
100%|██████████| 496/496 [00:00<00:00, 1590500.60it/s]
100%|██████████| 496/496 [00:00<00:00, 1636801.56it/s]
100%|██████████| 496/496 [00:00<00:00, 2092932.38it/s]
100%|██████████| 496/496 [00:00<00:00, 1206713.91it/s]
100%|██████████| 496/496 [00:00<00:00, 923789.87it/s]
100%|██████████| 496/496 [00:00<00:00, 840555.47it/s]
100%|██████████| 496/496 [00:00<00:00, 1143063.07it/s]
100%|██████████| 496/496 [00:00<00:00, 139369.92it/s]
100%|██████████| 496/496 [00:00<00:00, 1233911.50it/s]
100%|██████████| 496/496 [00:00<00:00, 1584443.86it/s]
100%|██████████| 496/496 [00:00<00:00, 1393419.15it/s]
100%|██████████| 496/496 [00:00<00:00, 944766.02it/s]
100%|██████████| 496/496 [00:00<00:00, 1018792.74it/s]
100%|██████████

Accuracy for XGBoost Classifier :  0.5771564697213794




In [73]:
df

Unnamed: 0,description,medical_specialty
0,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary
1,2-D Echocardiogram,Cardiovascular / Pulmonary
2,2-D Echocardiogram,Cardiovascular / Pulmonary
3,Echocardiogram and Doppler,Cardiovascular / Pulmonary
4,"Normal left ventricle, moderate biatrial enla...",Cardiovascular / Pulmonary
5,Cerebral Angiogram - moyamoya disease.,Neurology
6,Surgical removal of completely bony impacted...,Dentistry
7,Neck exploration; tracheostomy; urgent flexib...,Cardiovascular / Pulmonary
8,EEG during wakefulness and light sleep is abn...,Neurology
9,A pleasant gentleman with a history of Wilson...,Neurology
