In [None]:
import warnings
warnings.simplefilter("ignore")

import pandas as pd
import numpy as np 
import time

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, chi2, f_regression, SelectFromModel, VarianceThreshold, RFE

from mlxtend.feature_selection import SequentialFeatureSelector

import operator

from sklearn.pipeline import Pipeline

#nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords 
from nltk import word_tokenize
stop_words = set(stopwords.words('english'))
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

# classifiers
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC,  LinearSVC
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold


# Model and Performance Metrics
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import precision_recall_fscore_support




In [2]:
mt_sample = pd.read_csv('mtsamples.csv')
mt_sample.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [3]:
print(mt_sample.shape)
mt_subsample = mt_sample[['transcription', 'medical_specialty']]
mt_subsample = mt_subsample.drop(mt_subsample[mt_subsample['transcription'].isna()].index)
mt_subsample.shape

(4999, 6)


(4966, 2)

In [4]:
mt_subsample["medical_specialty"].value_counts()

 Surgery                          1088
 Consult - History and Phy.        516
 Cardiovascular / Pulmonary        371
 Orthopedic                        355
 Radiology                         273
 General Medicine                  259
 Gastroenterology                  224
 Neurology                         223
 SOAP / Chart / Progress Notes     166
 Urology                           156
 Obstetrics / Gynecology           155
 Discharge Summary                 108
 ENT - Otolaryngology               96
 Neurosurgery                       94
 Hematology - Oncology              90
 Ophthalmology                      83
 Nephrology                         81
 Emergency Room Reports             75
 Pediatrics - Neonatal              70
 Pain Management                    61
 Psychiatry / Psychology            53
 Office Notes                       50
 Podiatry                           47
 Dermatology                        29
 Cosmetic / Plastic Surgery         27
 Dentistry               

In [5]:
counts = mt_subsample['medical_specialty'].value_counts()
mt_subsample_370 = mt_subsample[mt_subsample['medical_specialty'].isin(counts[counts > 370].index)]
mt_subsample_370['labels'] = LabelEncoder().fit_transform(mt_subsample_370['medical_specialty'])
mt_subsample_370['medical_specialty'].value_counts()

 Surgery                       1088
 Consult - History and Phy.     516
 Cardiovascular / Pulmonary     371
Name: medical_specialty, dtype: int64

In [6]:
mt_subsample_370 = pd.DataFrame(mt_subsample_370)
samp_cardio = mt_subsample_370[mt_subsample_370["labels"]==0].sample(370)
print(len(samp_cardio))
samp_consult = mt_subsample_370[mt_subsample_370["labels"]==1].sample(370)
print(len(samp_consult))
samp_surgery = mt_subsample_370[mt_subsample_370["labels"]==2].sample(370)
print(len(samp_surgery))
mt_train_test=pd.concat([samp_cardio , samp_consult , samp_surgery])
mt_train_test = mt_train_test.sample(frac=1)
print(mt_train_test.shape)

370
370
370
(1110, 3)


In [7]:
def preprocess(narratives, stem=False,lem=True):
    collection = []
    tokenizer=RegexpTokenizer(r"\w+")
    for row in narratives:
        tokens=tokenizer.tokenize(row)
       
        if lem==True:
            lemmatizer = WordNetLemmatizer()
            tokens = [lemmatizer.lemmatize(t,pos="v") for t in tokens]
        elif stem==True:
            stemmer = PorterStemmer()
            tokens = [stemmer.stem(t) for t in tokens]
        tokens = [word for word in tokens if word.isalpha()]#Just keep the words and removes numbers
        tokens = [w.lower() for w in tokens]#words to lower case
        tokens = [w for w in tokens if not w in stop_words]#Removing the stop words
        collection.append(" ".join(tokens))
    return collection


In [8]:
#Ref: https://stackoverflow.com/questions/25462407/fast-information-gain-computation/38582697#38582697
def information_gain(X, y):

    def _calIg():
        entropy_x_set = 0
        entropy_x_not_set = 0
        for c in classCnt:
            probs = classCnt[c] / float(featureTot)
            entropy_x_set = entropy_x_set - probs * np.log(probs)
            probs = (classTotCnt[c] - classCnt[c]) / float(tot - featureTot)
            entropy_x_not_set = entropy_x_not_set - probs * np.log(probs)
        for c in classTotCnt:
            if c not in classCnt:
                probs = classTotCnt[c] / float(tot - featureTot)
                entropy_x_not_set = entropy_x_not_set - probs * np.log(probs)
        return entropy_before - ((featureTot / float(tot)) * entropy_x_set
                             +  ((tot - featureTot) / float(tot)) * entropy_x_not_set)

    tot = X.shape[0]
    classTotCnt = {}
    entropy_before = 0
    for i in y:
        if i not in classTotCnt:
            classTotCnt[i] = 1
        else:
            classTotCnt[i] = classTotCnt[i] + 1
    for c in classTotCnt:
        probs = classTotCnt[c] / float(tot)
        entropy_before = entropy_before - probs * np.log(probs)

    nz = X.T.nonzero()
    pre = 0
    classCnt = {}
    featureTot = 0
    information_gain = []
    for i in range(0, len(nz[0])):
        if (i != 0 and nz[0][i] != pre):
            for notappear in range(pre+1, nz[0][i]):
                information_gain.append(0)
            ig = _calIg()
            information_gain.append(ig)
            pre = nz[0][i]
            classCnt = {}
            featureTot = 0
        featureTot = featureTot + 1
        yclass = y[nz[1][i]]
        if yclass not in classCnt:
            classCnt[yclass] = 1
        else:
            classCnt[yclass] = classCnt[yclass] + 1
    ig = _calIg()
    information_gain.append(ig)

    return np.asarray(information_gain)

In [9]:
mt_train_test_preprocessed = mt_train_test
print(mt_train_test.shape)
mt_train_test_preprocessed["transcription"] =  preprocess(mt_train_test_preprocessed["transcription"])
mt_train_test_preprocessed = mt_train_test_preprocessed.drop(mt_train_test_preprocessed[mt_train_test_preprocessed['transcription'].isna()].index)
print(mt_train_test_preprocessed.shape)
mt_train_test_preprocessed.head()
mt_train_test_preprocessed.to_csv("preprocessed.csv")

(1110, 3)
(1110, 3)


In [10]:
#Splitting Training and Test Dataset
X = mt_train_test_preprocessed["transcription"]
X=X.reset_index(drop=True)  #Ref: https://stackoverflow.com/questions/43307156/sklearns-kfold-generates-nan-values
y = mt_train_test_preprocessed["medical_specialty"]
y=y.reset_index(drop=True)
skf = StratifiedKFold(n_splits=5,random_state = 7, shuffle=True)
print("n_splits",skf.get_n_splits(X, y))
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
#     print(X_train, X_test)
    y_train, y_test = y[train_index], y[test_index]
print("Train: ", X_train.shape,y_train.shape)
print("Test: ", y_test.shape,y_test.shape)


X_train = pd.DataFrame(X_train).reset_index(drop=True)
#X_train = X_train.drop(X_train[X_train['transcription'].isna()].index)
print(X_train.shape)
# print(X_train.head())
X_test = pd.DataFrame(X_test).reset_index(drop=True)
y_train = pd.DataFrame(y_train).reset_index(drop=True)
# print(y_train.shape)
y_test = pd.DataFrame(y_test).reset_index(drop=True)
vect = TfidfVectorizer()
X_train_vec = vect.fit_transform(X_train["transcription"])
X_test_vec = vect.transform(X_test["transcription"])
print("X_train_vec",X_train_vec.shape)
print("X_test_vec",X_test_vec.shape)

n_splits 5
Train:  (888,) (888,)
Test:  (222,) (222,)
(888, 1)
X_train_vec (888, 12170)
X_test_vec (222, 12170)


In [11]:
#No Feature Selection

models_dic={"RF": RandomForestClassifier(random_state=1)}
for name, model in models_dic.items():
    clf = Pipeline([('vect', TfidfVectorizer()), ('model', model)])
    scores_tfidf = np.mean(cross_val_score(clf, mt_train_test_preprocessed["transcription"], 
                                           mt_train_test_preprocessed["medical_specialty"], cv=5))
    print("Accuracy for {} is {}".format(name,round(scores_tfidf,3)))


Accuracy for RF is 0.709


In [35]:
words_num =200
FS_methods = {"ELR" : SelectFromModel(LogisticRegression(), max_features = words_num), 
              "ExT": SelectFromModel(ExtraTreesClassifier(n_estimators=50), max_features = words_num),
              "Chi2" : SelectKBest(chi2, k=words_num),
              #"RFE": RFE(estimator=SVC(kernel="linear", C=1), n_features_to_select=words_num, step=1),
              "MuI": 0 ,
              "IG" : 0 ,
              "L1":  SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3)),
              "V": VarianceThreshold(0.0001999),
             } 
df_words = pd.DataFrame(columns = list(FS_methods.keys()))

for name, model in FS_methods.items():
    count_vect = CountVectorizer(input="transcription", analyzer="word")
    X_train_counts = count_vect.fit_transform(X_train["transcription"])
    #     print("features before fs:", X_train_counts.shape)
    tf_transformer = TfidfTransformer(use_idf=True)
    traindata = tf_transformer.fit_transform(X_train_counts)
    print("features before fs: ", traindata.shape) #report size of the training data
    if name == "MuI":
        res = dict(zip(count_vect.get_feature_names(),
                           mutual_info_classif(traindata, y_train["medical_specialty"], discrete_features=True)
                           ))
            #print(res)
        MI_ordered = sorted(res, key=res.get)
        MI_FS = MI_ordered [:words_num]
        df_words[name] =  MI_FS
    elif name == "IG":
        IG_dict = dict(zip(count_vect.get_feature_names(),
                       information_gain(traindata, y_train["medical_specialty"])
                       ))
        IG_ordered = sorted(IG_dict, key=IG_dict.get)
        IG_FS = IG_ordered [:words_num]
        df_words[name] = IG_FS

    else:   
        X_transform = model.fit_transform(traindata, y_train["medical_specialty"])
        print("reduced features for {}:{} ".format(name, X_transform.shape))
            #get the names of all features
        words = np.array(count_vect.get_feature_names())
          #         print(words[:10])
            #get the names of the important features using the boolean index from model 
        if X_transform.shape[1] < words_num:
            red_length = X_transform.shape[1]
            print(red_length)
            df_words[name] = words[model.get_support()][:red_length] 
        else:
            df_words[name] = words[model.get_support()][:words_num]
           

features before fs:  (888, 12170)
reduced features for ELR:(888, 200) 
features before fs:  (888, 12170)
reduced features for ExT:(888, 200) 
features before fs:  (888, 12170)
reduced features for Chi2:(888, 200) 
features before fs:  (888, 12170)
features before fs:  (888, 12170)
features before fs:  (888, 12170)
reduced features for L1:(888, 297) 
features before fs:  (888, 12170)
reduced features for V:(888, 1204) 


In [36]:
df_words.to_csv("word.csv")
df_words.head()

Unnamed: 0,ELR,ExT,Chi2,MuI,IG,L1,V
0,ago,abdomen,advance,abciximab,abandon,able,abdomen
1,alcohol,alcohol,age,accelerate,acetic,actually,abdominal
2,alert,alert,ago,acidophilus,adequacy,ago,able
3,allergies,allergies,alcohol,acidotic,adhesives,albuterol,abnormal
4,also,anesthesia,alert,acs,albeit,alert,abnormalities


In [44]:
words_count = round(0.10 * X_train_vec.shape[1])
FS_methods = {"ELR" : SelectFromModel(LogisticRegression(), max_features = words_count), 
              "ExT": SelectFromModel(ExtraTreesClassifier(n_estimators=50), max_features = words_count),
              "V": VarianceThreshold(0.0001999),
              "Chi2" : SelectKBest(chi2, k=words_count),
              "L1":  SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3)),
#               "RFE": RFE(estimator=SVC(kernel="linear", C=1), n_features_to_select=words_count, step=1),
#               "MuI": 0 ,
#               "IG" : 0 ,
             } 


models_dic={"RF": RandomForestClassifier(random_state=1)}
res_accuracy = pd.DataFrame()
for name, model in models_dic.items():
    for fs_name, fs_model in FS_methods.items():
        for j in [0.10, 0.20, 0.30, 0.40, 0.50]:
            words_count = round(j * X_train_vec.shape[1])
            if fs_name == "MuI":
                res = dict(zip(count_vect.get_feature_names(),
                                   mutual_info_classif(traindata, y_train["medical_specialty"], discrete_features=True)
                                   ))
                    #print(res)
                MI_ordered = sorted(res, key=res.get)
                MI_FS = MI_ordered[:words_count]
                clf = Pipeline([('vect', TfidfVectorizer()), ('feature_selection', fs_model), 
                            ('model', model)])
                scores_tfidf = np.mean(cross_val_score(clf, mt_train_test_preprocessed["transcription"], 
                                                   mt_train_test_preprocessed["medical_specialty"], cv=5))
                print("Accuracy for {}_{} is {}".format(name,fs_name,round(scores_tfidf,3)))    
            elif fs_name == "IG": 
                IG_dict = dict(zip(count_vect.get_feature_names(),
                               information_gain(traindata, y_train["medical_specialty"])
                               ))
                IG_ordered = sorted(IG_dict, key=IG_dict.get)
                IG_FS = IG_ordered [:words_count]
                clf = Pipeline([('vect', TfidfVectorizer()), ('feature_selection', fs_model), 
                            ('model', model)])
                scores_tfidf = np.mean(cross_val_score(clf, mt_train_test_preprocessed["transcription"], 
                                                   mt_train_test_preprocessed["medical_specialty"], cv=5))
                print("Accuracy for {}_{} is {}".format(name,fs_name,round(scores_tfidf,3)))
            else: 
                #print(words_count)
                clf = Pipeline([('vect', TfidfVectorizer()), ('feature_selection', fs_model), 
                            ('model', model)])
                scores_tfidf = np.mean(cross_val_score(clf, mt_train_test_preprocessed["transcription"], 
                                                   mt_train_test_preprocessed["medical_specialty"], cv=5))
                print("Accuracy for {}_{}_{} is {}".format(name,fs_name,j, round(scores_tfidf,3)))    



1217
Accuracy for RF_ELR_0.1 is 0.723
2434
Accuracy for RF_ELR_0.2 is 0.723
3651
Accuracy for RF_ELR_0.3 is 0.723
4868
Accuracy for RF_ELR_0.4 is 0.723
6085
Accuracy for RF_ELR_0.5 is 0.723
1217
Accuracy for RF_ExT_0.1 is 0.716
2434
Accuracy for RF_ExT_0.2 is 0.711
3651
Accuracy for RF_ExT_0.3 is 0.724
4868
Accuracy for RF_ExT_0.4 is 0.722
6085
Accuracy for RF_ExT_0.5 is 0.712
1217
Accuracy for RF_V_0.1 is 0.709
2434
Accuracy for RF_V_0.2 is 0.709
3651
Accuracy for RF_V_0.3 is 0.709
4868
Accuracy for RF_V_0.4 is 0.709
6085
Accuracy for RF_V_0.5 is 0.709
1217
Accuracy for RF_Chi2_0.1 is 0.717
2434
Accuracy for RF_Chi2_0.2 is 0.717
3651
Accuracy for RF_Chi2_0.3 is 0.717
4868
Accuracy for RF_Chi2_0.4 is 0.717
6085
Accuracy for RF_Chi2_0.5 is 0.717
1217
Accuracy for RF_L1_0.1 is 0.733
2434
Accuracy for RF_L1_0.2 is 0.737
3651
Accuracy for RF_L1_0.3 is 0.734
4868
Accuracy for RF_L1_0.4 is 0.737
6085
Accuracy for RF_L1_0.5 is 0.73


In [47]:
words_count = round(0.10 * X_train_vec.shape[1])
FS_methods = {"ELR" : SelectFromModel(LogisticRegression(), max_features = words_count), 
              "ExT": SelectFromModel(ExtraTreesClassifier(n_estimators=50), max_features = words_count),
              "V": VarianceThreshold(0.0001999),
              "Chi2" : SelectKBest(chi2, k=words_count),
              "L1":  SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3)),
              "RFE": RFE(estimator=SVC(kernel="linear", C=1), n_features_to_select=words_count, step=1),
#               "MuI": 0 ,
#               "IG" : 0 ,
             } 


models_dic={"RF": RandomForestClassifier(random_state=1)}
for name, model in models_dic.items():
    for fs_name, fs_model in FS_methods.items():
        for j in [ 0.20]:
            words_count = round(j * X_train_vec.shape[1])
            if fs_name == "MuI":
                res = dict(zip(count_vect.get_feature_names(),
                                   mutual_info_classif(traindata, y_train["medical_specialty"], discrete_features=True)
                                   ))
                    #print(res)
                MI_ordered = sorted(res, key=res.get)
                MI_FS = MI_ordered[:words_count]
                clf = Pipeline([('vect', TfidfVectorizer()), ('feature_selection', fs_model), 
                            ('model', model)])
                scores_tfidf = np.mean(cross_val_score(clf, mt_train_test_preprocessed["transcription"], 
                                                   mt_train_test_preprocessed["medical_specialty"], cv=5))
                print("Accuracy for {}_{} is {}".format(name,fs_name,round(scores_tfidf,3)))    
            elif fs_name == "IG": 
                IG_dict = dict(zip(count_vect.get_feature_names(),
                               information_gain(traindata, y_train["medical_specialty"])
                               ))
                IG_ordered = sorted(IG_dict, key=IG_dict.get)
                IG_FS = IG_ordered [:words_count]
                clf = Pipeline([('vect', TfidfVectorizer()), ('feature_selection', fs_model), 
                            ('model', model)])
                scores_tfidf = np.mean(cross_val_score(clf, mt_train_test_preprocessed["transcription"], 
                                                   mt_train_test_preprocessed["medical_specialty"], cv=5))
                print("Accuracy for {}_{} is {}".format(name,fs_name,round(scores_tfidf,3)))
            else: 
                print(words_count)
                clf = Pipeline([('vect', TfidfVectorizer()), ('feature_selection', fs_model), 
                            ('model', model)])
                scores_tfidf = np.mean(cross_val_score(clf, mt_train_test_preprocessed["transcription"], 
                                                   mt_train_test_preprocessed["medical_specialty"], cv=5))
                print("Accuracy for {}_{}_{} is {}".format(name,fs_name,j, round(scores_tfidf,3)))    



2434
Accuracy for RF_ELR_0.2 is 0.723
2434
Accuracy for RF_ExT_0.2 is 0.713
2434
Accuracy for RF_V_0.2 is 0.709
2434
Accuracy for RF_Chi2_0.2 is 0.717
2434
Accuracy for RF_L1_0.2 is 0.734
2434


KeyboardInterrupt: 