# Intact Medical Specialty Classification Model using NLP

### By: Daniyal, Hibah, Abhishek and Adam

### Step 1: Import libraries and read in the data

We'll add more libraries as we move on.

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import _stop_words
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
import spacy
# Fetch the NLTK resources this notebook requests (no-op when already present)
for resource in ('wordnet', 'omw-1.4', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Load the labelled transcriptions; the first CSV column becomes the row index
df = pd.read_csv("new_train.csv", index_col=0)
# This is the size of the whole dataset -- the train/test split happens later
print("Dataset size: ", len(df))
df  # display the DataFrame in the notebook

: 

### Step 2: Pre-process our data

This is arguably the most important step: an ML model is only as good as its dataset, so we need to make sure the data is as clean as possible.

The basic pre-processing is handled by the CountVectorizer together with our custom preprocessor function. These tasks include:
- Tokenize (split the text into individual words)
- Remove stop-words and other special characters (e.g. "the", "and", "to", "or", ...)
- Lemmatize (reduce related word forms to their base form; e.g. eating, eats, ate => eat)

In [None]:
# Target values: the 'labels' column of the dataset
y = df["labels"]
print("Label size: ", len(y))
y  # display the labels in the notebook

: 

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df["transcription"],
    y,
    test_size=0.1,
    random_state=53,
)

# Report the size of each partition: features and labels, for train then test
for caption, part in (
    ("X_train size: ", X_train),
    ("y_train size: ", y_train),
    ("X_test size: ", X_test),
    ("y_test size: ", y_test),
):
    print(caption, len(part))

# To inspect a partition, evaluate it as the last expression of the cell,
# e.g. X_train, y_train[:50], X_test or y_test.

: 

In [None]:
# Lemmatizer used by the custom pre-processing step
# (stemming would be an alternative worth trying)
wordnet_lemmatizer = WordNetLemmatizer()

# Each word list has one entry per line; entries are trimmed and lower-cased
# so membership tests can be done case-insensitively.
with open("words_alpha.txt") as word_file:
    english_words = {entry.strip().lower() for entry in word_file}
with open("medical_terms.txt") as word_file:
    medical_words = {entry.strip().lower() for entry in word_file}
def is_english_word(word):
    """Return True when *word* (case-insensitive) is in either word list."""
    lowered = word.lower()
    if lowered in english_words:
        return True
    return lowered in medical_words

# Custom pre-processing function
def preprocess_data(text):
    """Clean one raw document before CountVectorizer tokenizes it.

    CountVectorizer passes its ``preprocessor`` the whole document string,
    not individual tokens. Checking the entire document against the word
    lists would therefore discard almost every document, so the text is
    split into word tokens here and each token is filtered and normalized
    on its own.

    Args:
        text: Raw document text (a single word is also accepted).

    Returns:
        The surviving, normalized tokens joined by single spaces, or ''
        when no token is recognized by ``is_english_word``.
    """
    kept_tokens = []
    for token in re.findall(r'\w+', text):
        # The dictionary check runs on the raw token, as it did before.
        if not is_english_word(token):
            continue
        # Strip digit and '_' characters (the characters, not the whole word).
        token = re.sub(r'\d+|_', '', token.lower())
        token = wordnet_lemmatizer.lemmatize(token)
        if token:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


# Bag-of-words vectorizer: built-in English stop-word list plus our custom
# preprocessor (work on more pre-processing)
count_vectorizer = CountVectorizer(
    preprocessor=preprocess_data,
    stop_words="english",
)

# Quick sanity check of the lemmatizer on a plural noun
print(wordnet_lemmatizer.lemmatize("strawberries"))

: 

### Step 3: Fit and Transform the Data

Specifically, we must fit AND transform the feature training data and only transform the feature test data.
This is a preliminary step.

In fit_transform(), the CountVectorizer learns its vocabulary (the set of tokens) from the training data and then converts each training document into a vector of token counts (hence, transform). We only need transform() for the test data because it must be encoded with the vocabulary learned from the training data; fitting on the test data would leak information from it into the model.

In [None]:
# Learn the vocabulary from the TRAINING transcriptions and turn them into count vectors
count_train = count_vectorizer.fit_transform(X_train.values)
# Encode the TEST transcriptions with the already-fitted vectorizer
count_test = count_vectorizer.transform(X_test.values)


# Features (individual tokens) of the fitted count_vectorizer
feature_names = count_vectorizer.get_feature_names_out()
# Print how many features there are
print(len(feature_names))
# Print the first 500 features
print(feature_names[:500])

: 

### Step 4: Train our models here

We use a Multinomial Naive Bayes classifier to predict the labels.

In [None]:
# Build a Multinomial Naive Bayes classifier and fit it on the training counts
# (fit() returns the fitted estimator, so the two steps can be chained)
nb_clf = MultinomialNB().fit(count_train, y_train)
# Predict a label for every document in the test set
pred = nb_clf.predict(count_test)

# Show the first 100 predicted labels
print(pred[:100])

: 

### Step 5: Evaluate the model

We will compute an accuracy score, a confusion matrix, and a per-class classification report.

In [None]:
# Accuracy of the predictions against the true test labels
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
# Confusion matrix of true labels vs. predicted labels
conf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=pred)

print(score)
print(conf_matrix)
# Per-class report
print(metrics.classification_report(y_true=y_test, y_pred=pred))

: 

: 