In [None]:
!pip install imbalanced-learn

In [5]:
import pandas as pd

In [6]:
df = pd.read_csv("/content/drive/MyDrive/cleaned_data.csv", index_col=0)

In [None]:
df.head()

Unnamed: 0,medical_specialty,transcription,labels,transcription_cleaned_simple
0,Emergency Room Reports,"REASON FOR THE VISIT:, Very high PT/INR.,HIST...",0,reason visit high pt inr history patient year ...
1,Surgery,"PREOPERATIVE DIAGNOSIS:, Acetabular fracture ...",1,preoperative diagnosis acetabular fracture lef...
2,Surgery,"NAME OF PROCEDURE,1. Selective coronary angio...",1,name procedure selective coronary angiography ...
3,Radiology,"REFERRING DIAGNOSIS: , Motor neuron disease.,P...",2,referring diagnosis motor neuron disease perti...
4,Emergency Room Reports,"CHIEF COMPLAINT: , Dental pain.,HISTORY OF PRE...",0,chief complaint dental pain history present il...


In [7]:
# Libraries 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


# Naive Bayes

In [29]:
# Train-Test Split
Train_X, Test_X, Train_y, Test_y = model_selection.train_test_split(df['transcription_cleaned_simple'], df['labels'].values, test_size=0.3, random_state=69)
Tfidf_vectorizer = TfidfVectorizer(max_features=10000)
Tfidf_vectorizer.fit(df['transcription_cleaned_simple'])
Train_X_vectorized = Tfidf_vectorizer.transform(Train_X)
Test_X_vectorized = Tfidf_vectorizer.transform(Test_X)

In [30]:
# Fit the training dataset on the Multi-NB classifier
MultiNB = naive_bayes.MultinomialNB(alpha=0.5)
MultiNB.fit(Train_X_vectorized, Train_y)
# Predict the labels on validation dataset
pred_y_nb = MultiNB.predict(Test_X_vectorized)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ", accuracy_score(pred_y_nb, Test_y)*100)
print(classification_report(Test_y, pred_y_nb))

Naive Bayes Accuracy Score ->  35.684298908480265
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.42      0.94      0.58       261
           2       0.46      0.19      0.27        70
           3       0.00      0.00      0.00        12
           4       0.36      0.30      0.33        44
           5       0.00      0.00      0.00        53
           6       0.33      0.10      0.15        94
           7       0.44      0.30      0.36        93
           8       0.00      0.00      0.00        20
           9       0.00      0.00      0.00        19
          10       0.00      0.00      0.00        61
          11       0.00      0.00      0.00        22
          12       0.00      0.00      0.00         7
          13       0.00      0.00      0.00        32
          14       0.00      0.00      0.00         3
          15       0.00      0.00      0.00        13
          16       0.26      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Oversampled Fitting on training data to mitigate imbalance.

In [60]:
# OverSampling
from imblearn.over_sampling import RandomOverSampler
Train_X, Test_X, Train_y, Test_y = model_selection.train_test_split(df['transcription_cleaned_simple'], df['labels'].values, test_size=0.2, random_state=69)
Tfidf_vectorizer = TfidfVectorizer(max_features=10000)
Tfidf_vectorizer.fit(df['transcription_cleaned_simple'])
Train_X_vectorized = Tfidf_vectorizer.transform(Train_X)
Test_X_vectorized = Tfidf_vectorizer.transform(Test_X)

In [61]:
Train_X_vectorized = Train_X_vectorized.toarray()
Test_X_vectorized = Test_X_vectorized.toarray()

In [62]:
ros = RandomOverSampler()
Train_X_vectorized_ros, Train_y_ros = ros.fit_resample(Train_X_vectorized, Train_y)

In [68]:
MultiNB = naive_bayes.MultinomialNB(alpha=0.5)
MultiNB.fit(Train_X_vectorized_ros, Train_y_ros)
# Predict the labels on validation datase5
pred_y_nb = MultiNB.predict(Test_X_vectorized)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ", accuracy_score(pred_y_nb, Test_y)*100)
print(classification_report(Test_y, pred_y_nb))

Naive Bayes Accuracy Score ->  32.241813602015114
              precision    recall  f1-score   support

           0       0.30      0.43      0.35        14
           1       0.26      0.07      0.11       171
           2       0.33      0.20      0.25        45
           3       0.24      0.56      0.33         9
           4       0.35      0.35      0.35        31
           5       0.40      0.46      0.43        37
           6       0.57      0.48      0.52        63
           7       0.42      0.33      0.37        67
           8       0.07      0.12      0.09         8
           9       0.33      0.83      0.48        12
          10       0.05      0.05      0.05        38
          11       0.20      0.18      0.19        17
          12       0.09      0.25      0.13         4
          13       0.15      0.20      0.17        20
          14       0.00      0.00      0.00         1
          15       0.63      1.00      0.77        12
          16       0.27      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# SVM

In [71]:
Train_X, Test_X, Train_y, Test_y = model_selection.train_test_split(df['transcription_cleaned_simple'], df['labels'].values, test_size=0.3, random_state=69)
Tfidf_vectorizer = TfidfVectorizer(max_features=10000)
Tfidf_vectorizer.fit(df['transcription_cleaned_simple'])
Train_X_vectorized = Tfidf_vectorizer.transform(Train_X)
Test_X_vectorized = Tfidf_vectorizer.transform(Test_X)

In [72]:
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
SVM.fit(Train_X_vectorized,Train_y)
# predict the labels on validation dataset
pred_y_svm = SVM.predict(Test_X_vectorized)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(pred_y_svm, Test_y)*100)
print(classification_report(Test_y, pred_y_svm))


SVM Accuracy Score ->  21.91435768261965
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.22      1.00      0.36       261
           2       0.00      0.00      0.00        70
           3       0.00      0.00      0.00        12
           4       0.00      0.00      0.00        44
           5       0.00      0.00      0.00        53
           6       0.00      0.00      0.00        94
           7       0.00      0.00      0.00        93
           8       0.00      0.00      0.00        20
           9       0.00      0.00      0.00        19
          10       0.00      0.00      0.00        61
          11       0.00      0.00      0.00        22
          12       0.00      0.00      0.00         7
          13       0.00      0.00      0.00        32
          14       0.00      0.00      0.00         3
          15       0.00      0.00      0.00        13
          16       0.00      0.00      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Logistic Regression

In [74]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(Train_X_vectorized,Train_y)
# predict the labels on validation dataset
pred_y_logreg = logreg.predict(Test_X_vectorized)


print('accuracy %s' % accuracy_score(pred_y_logreg, Test_y))
print(classification_report(Test_y, pred_y_logreg))

accuracy 0.3022670025188917
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.38      0.68      0.48       261
           2       0.35      0.34      0.35        70
           3       0.00      0.00      0.00        12
           4       0.18      0.16      0.17        44
           5       0.24      0.11      0.15        53
           6       0.20      0.16      0.18        94
           7       0.29      0.28      0.29        93
           8       0.00      0.00      0.00        20
           9       0.33      0.11      0.16        19
          10       0.13      0.08      0.10        61
          11       0.00      0.00      0.00        22
          12       0.00      0.00      0.00         7
          13       0.14      0.12      0.13        32
          14       0.00      0.00      0.00         3
          15       0.20      0.08      0.11        13
          16       0.28      0.58      0.38       122

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
