# Intact Medical Specialty Classification Model using NLP

### By: Daniyal, Hibah, Abhishek and Adam

### Step 1: Import libraries and read in the data

We'll add more libraries, as we move on.

In [84]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer
import nltk
import spacy
nltk.download('wordnet')
nltk.download('omw-1.4')

df = pd.read_csv("new_train.csv", index_col=0)
print("Test size: ", len(df))
df # print

[nltk_data] Downloading package wordnet to /Users/adamyeo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/adamyeo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Test size:  3969


Unnamed: 0,medical_specialty,transcription,labels
0,Emergency Room Reports,"REASON FOR THE VISIT:, Very high PT/INR.,HIST...",0
1,Surgery,"PREOPERATIVE DIAGNOSIS:, Acetabular fracture ...",1
2,Surgery,"NAME OF PROCEDURE,1. Selective coronary angio...",1
3,Radiology,"REFERRING DIAGNOSIS: , Motor neuron disease.,P...",2
4,Emergency Room Reports,"CHIEF COMPLAINT: , Dental pain.,HISTORY OF PRE...",0
...,...,...,...
3995,Neurology,"PROBLEMS AND ISSUES:,1. Headaches, nausea, an...",4
3996,Surgery,"PREOPERATIVE DIAGNOSIS: , Anemia.,PROCEDURE:, ...",1
3997,Surgery,"1. Odynophagia.,2. Dysphagia.,3. Gastroesop...",1
3998,Gastroenterology,The patient's abdomen was prepped and draped i...,5


### Step 2: Pre-process our data

I think this is the most important step here, the ML model is only as good as its dataset, so we gotta make sure it's squeaky clean.

All of the basic pre-processing is done by the CountVectorizer, these tasks include:
- Tokenize (divide words individually)
- Remove stop-words (remove "the, and, to, or, ..."; other special characters)
- Lemmatize (convert similar words into its base root; eating, eats, ate => eat)

In [51]:
# Create labels/target values
y = df.labels
print("Label size: ", len(y))
y

Label size:  3969


0       0
1       1
2       1
3       2
4       0
       ..
3995    4
3996    1
3997    1
3998    5
3999    1
Name: labels, Length: 3969, dtype: int64

In [72]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["transcription"], y, test_size=0.1, random_state=53)

# X_train: training data of features
print("X_train size: ", len(X_train))
# y_train: training data of label
print("y_train size: ", len(y_train))

# X_test: test data of features
print("X_test size: ", len(X_test))
# y_test: test data of label
print("y_test size: ", len(y_test))

# X_train
# y_train[:50]
# X_test
# y_test

X_train size:  3572
y_train size:  3572
X_test size:  397
y_test size:  397


In [99]:
# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
# we could try stemming as well

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
all_stopwords = nlp.Defaults.stop_words
My_text = "strawberries"
doc = nlp(My_text)
print(doc)

# Custom pre-processing function
def preprocess_data(text):
    text = text.lower()
    text = re.sub(r'\d+|_', '', text) # removes words with digits and '_'
    wordnet_lemmatizer.lemmatize(text)
    return text

# Initialize a CountVectorizer object
count_vectorizer = CountVectorizer(stop_words="english", preprocessor=preprocess_data) # work on more pre-processing

print(wordnet_lemmatizer.lemmatize("strawberries"))

strawberries
strawberry


### Step 3: Fit and Transform the Data

Specifically, we must fit AND transform the feature training data and only transform the feature test data.
This is a preliminary step.

In fit_transform(), what happens is that we calculate the mean and variance of the training data and standardize the entire dataset (hence, transform). We only need transform() for the test data because we are using the mean and variance of the training data to standardize the test data.

In [100]:
# Fit and transform the TRAINING data using only the 'transciption' column values
count_train = count_vectorizer.fit_transform(X_train.values)
# Transform the TEST data using only the 'transciption' column values
count_test = count_vectorizer.transform(X_test.values)


# Print number of words processing
print(len(count_vectorizer.get_feature_names_out()))
# Print the features (individual tokens) of the count_vectorizer
print(count_vectorizer.get_feature_names_out()[:500])

20049
['aa' 'ab' 'abadeedleedlebadle' 'abandoned' 'abandonment' 'abated'
 'abbott' 'abbreviated' 'abc' 'abcd' 'abcg' 'abciximab' 'abd' 'abdomen'
 'abdominal' 'abdominally' 'abdominis' 'abdominopelvic' 'abdominoplasty'
 'abdominosacrocolpopexy' 'abdominus' 'abds' 'abduct' 'abducted'
 'abduction' 'abducto' 'abductor' 'abductors' 'abductovalgus' 'abductus'
 'aberrant' 'aberration' 'abf' 'abg' 'abgs' 'abilify' 'abilities'
 'ability' 'ablate' 'ablated' 'ablation' 'ablative' 'able' 'abnormal'
 'abnormalities' 'abnormality' 'abnormally' 'abnormities' 'abo' 'abolish'
 'abort' 'aborted' 'abortion' 'abortions' 'abortive' 'abovementioned'
 'abraded' 'abrading' 'abraham' 'abrasion' 'abrasions' 'abraxane'
 'abreast' 'abrogated' 'abrogation' 'abrupt' 'abruptio' 'abruptly' 'abs'
 'abscess' 'abscessed' 'abscesses' 'absence' 'absent' 'absolute'
 'absolutely' 'absorb' 'absorbable' 'absorbables' 'absorbing' 'absorption'
 'absorptive' 'abstain' 'abstinence' 'abstract' 'abstraction'
 'abstractions' 'abunda

### Step 4: Train our models here

We used the Multinomial Naive Bayes to classify our labels

In [78]:
# Instantiate a Multinomial Naive Bayes classifier
nb_clf = MultinomialNB()
# Fit the classifier to the training data
nb_clf.fit(count_train, y_train)
# Create the predicted tags
pred = nb_clf.predict(count_test)

# Print the predictions for each row of the dataset (1001 rows)
print(pred[:100])

[30 16  9  6 20 18 15 13  6 10 18  7  6  7 20 16  1  1 21  1 15  6 16 27
  5 16 16  7  1  5 21 16  4 16 10  5 18  6  4  7  1 16  5  1  7  1  1  5
  6 13  1  1  1  2  1  1  9 20  7  7 18  6 16  6  2 10 21  1 10 27  1  9
  6 13  2 10  4  5 18 16  7  1 16  7 19  8  6  1  4  6 16  4 16 16  9 16
  2 16 16  7]


### Step 4: Evaluate the model

We will create an accuracy score and also a confusion matrix.

In [79]:
# Calculate the accuracy score
score = metrics.accuracy_score(y_test, pred)
# Calculate the confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, pred)

print(score)
print(conf_matrix)
print(classification_report(y_test, pred))

0.2972292191435768
[[ 0  0  0 ...  0  0  0]
 [ 0 27  0 ...  1  0  0]
 [ 0  0  5 ...  1  0  0]
 ...
 [ 0  1  0 ...  1  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  1]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.42      0.28      0.33        97
           2       0.28      0.20      0.23        25
           3       0.00      0.00      0.00         3
           4       0.33      0.32      0.32        19
           5       0.31      0.24      0.27        17
           6       0.22      0.29      0.25        24
           7       0.35      0.47      0.40        30
           8       0.00      0.00      0.00         5
           9       0.40      0.67      0.50         6
          10       0.13      0.24      0.17        17
          11       0.00      0.00      0.00         7
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00        12
          14  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
