In [1]:
# install required dependencies
%pip install kagglehub
%pip install pandas
%pip install nltk
%pip install sklearn
%pip install tensorflow
%pip install matplotlib
%pip install tf-keras
%pip install imbalanced-learn

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [2]:
# import all the required dependencies
import kagglehub
import pandas as pd
import regex as re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Download dataset
path = kagglehub.dataset_download("tboyle10/medicaltranscriptions")
print("Path to dataset files:", path)

dataset = pd.read_csv(path + "/mtsamples.csv")
print("Head: ", dataset.head)

Path to dataset files: /kaggle/input/medicaltranscriptions
Head:  <bound method NDFrame.head of       Unnamed: 0                                        description  \
0              0   A 23-year-old white female presents with comp...   
1              1           Consult for laparoscopic gastric bypass.   
2              2           Consult for laparoscopic gastric bypass.   
3              3                             2-D M-Mode. Doppler.     
4              4                                 2-D Echocardiogram   
...          ...                                                ...   
4994        4994   Patient having severe sinusitis about two to ...   
4995        4995   This is a 14-month-old baby boy Caucasian who...   
4996        4996   A female for a complete physical and follow u...   
4997        4997   Mother states he has been wheezing and coughing.   
4998        4998   Acute allergic reaction, etiology uncertain, ...   

                medical_specialty                  

In [4]:
# Drop rows with missing values in specified columns
dataset.dropna(subset=['transcription', 'medical_specialty'], inplace=True)

# Keep only relevant columns
dataset = dataset[['transcription', 'medical_specialty']]

# Filter medical specialties with at least 50 occurrences
specialty_counts = dataset['medical_specialty'].value_counts()
valid_specialties = specialty_counts[specialty_counts >= 50].index
dataset = dataset[dataset['medical_specialty'].isin(valid_specialties)]

# Strip spaces in 'medical_specialty' column
dataset['medical_specialty'] = dataset['medical_specialty'].str.strip()

# Remove specific categories
excluded_specialties = [
    'Surgery',
    'SOAP / Chart / Progress Notes',
    'Office Notes',
    'Consult - History and Phy.',
    'Emergency Room Reports',
    'Discharge Summary',
    'Pain Management',
    'General Medicine',
    'Radiology',
]

dataset = dataset[~dataset['medical_specialty'].isin(excluded_specialties)]

# Define category mapping to merge similar categories
category_mapping = {
    'Neurosurgery': 'Neurology',
    'Nephrology': 'Urology',
}

# Apply category mapping
dataset['medical_specialty'] = dataset['medical_specialty'].replace(category_mapping)

# Display counts for each category
for i, (category_name, category) in enumerate(dataset.groupby("medical_specialty")):
    print(f"Category {i}: {category_name}: {len(category)}")

Category 0: Cardiovascular / Pulmonary: 371
Category 1: ENT - Otolaryngology: 96
Category 2: Gastroenterology: 224
Category 3: Hematology - Oncology: 90
Category 4: Neurology: 317
Category 5: Obstetrics / Gynecology: 155
Category 6: Ophthalmology: 83
Category 7: Orthopedic: 355
Category 8: Pediatrics - Neonatal: 70
Category 9: Psychiatry / Psychology: 53
Category 10: Urology: 237


In [5]:
from sklearn.model_selection import train_test_split

def clean_text(text):
    lemmatizer = WordNetLemmatizer()
    text = text.strip()
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(text) if word not in stopwords.words('english')])
    return text

dataset['processed_transcription'] = dataset['transcription'].apply(clean_text)

X_train, X_test, y_train, y_test = train_test_split(
    dataset['processed_transcription'], dataset['medical_specialty'], test_size=0.2, random_state=42, stratify=dataset['medical_specialty']
)

# Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

rf_pipeline = Pipeline([('tfidf', TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1, 5), max_df=0.7, max_features=2000)),
                        ('clf', RandomForestClassifier(n_estimators=150, random_state=30))])

rf_pipeline.fit(X_train, y_train)

rf_y_pred = rf_pipeline.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_y_pred)
print(f'Random Forest Accuracy: {rf_accuracy:.4f}')
print(classification_report(y_test, rf_y_pred))

Random Forest Accuracy: 0.7470
                            precision    recall  f1-score   support

Cardiovascular / Pulmonary       0.74      0.86      0.80        74
      ENT - Otolaryngology       0.88      0.79      0.83        19
          Gastroenterology       0.76      0.87      0.81        45
     Hematology - Oncology       0.50      0.11      0.18        18
                 Neurology       0.69      0.64      0.67        64
   Obstetrics / Gynecology       0.84      0.84      0.84        31
             Ophthalmology       0.84      0.94      0.89        17
                Orthopedic       0.68      0.80      0.74        71
     Pediatrics - Neonatal       0.50      0.36      0.42        14
   Psychiatry / Psychology       0.71      0.45      0.56        11
                   Urology       0.86      0.79      0.82        47

                  accuracy                           0.75       411
                 macro avg       0.73      0.68      0.69       411
              w

# Bi-LSTM

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import numpy as np

max_words = 50000
max_len = 2000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

y_train_cat = tf.keras.utils.to_categorical(y_train_enc)
y_test_cat = tf.keras.utils.to_categorical(y_test_enc)


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, GlobalMaxPooling1D, LayerNormalization

model = Sequential([Embedding(input_dim=max_words, output_dim=128, input_length=max_len), Bidirectional(LSTM(64, return_sequences=True, dropout=0.3)),
                    LayerNormalization(), GlobalMaxPooling1D(), Dropout(0.4),
                    Dense(64, activation='relu'), Dropout(0.4), Dense(y_train_cat.shape[1], activation='softmax')])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [9]:
history = model.fit(X_train_pad, y_train_cat, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 159ms/step - accuracy: 0.1227 - loss: 2.6762 - val_accuracy: 0.1159 - val_loss: 2.2922
Epoch 2/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 146ms/step - accuracy: 0.1756 - loss: 2.2515 - val_accuracy: 0.1890 - val_loss: 2.1731
Epoch 3/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 146ms/step - accuracy: 0.2734 - loss: 2.0866 - val_accuracy: 0.3537 - val_loss: 1.9931
Epoch 4/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 146ms/step - accuracy: 0.3569 - loss: 1.8814 - val_accuracy: 0.4360 - val_loss: 1.7582
Epoch 5/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 145ms/step - accuracy: 0.4751 - loss: 1.5888 - val_accuracy: 0.5183 - val_loss: 1.4772
Epoch 6/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 146ms/step - accuracy: 0.5531 - loss: 1.3687 - val_accuracy: 0.5640 - val_loss: 1.2929
Epoch 7/20
[1m41/41[0m [

In [10]:
loss, accuracy = model.evaluate(X_test_pad, y_test_cat, verbose=0)
print("BiLSTM Accuracy (test): ", accuracy)

y_pred_probs = model.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred_probs, axis=1)

y_true_classes = np.argmax(y_test_cat, axis=1)

# get original string names
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
y_true_labels = label_encoder.inverse_transform(y_true_classes)

from sklearn.metrics import classification_report
print(classification_report(y_true_labels, y_pred_labels))

BiLSTM Accuracy (test):  0.7493917346000671
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 86ms/step
                            precision    recall  f1-score   support

Cardiovascular / Pulmonary       0.82      0.82      0.82        74
      ENT - Otolaryngology       0.92      0.63      0.75        19
          Gastroenterology       0.81      0.84      0.83        45
     Hematology - Oncology       0.44      0.22      0.30        18
                 Neurology       0.70      0.69      0.69        64
   Obstetrics / Gynecology       0.82      0.90      0.86        31
             Ophthalmology       0.84      0.94      0.89        17
                Orthopedic       0.75      0.85      0.79        71
     Pediatrics - Neonatal       0.29      0.14      0.19        14
   Psychiatry / Psychology       0.64      0.64      0.64        11
                   Urology       0.76      0.87      0.81        47

                  accuracy                           0.76       

# GRU

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Bidirectional, Dense, Dropout
from tensorflow.keras.layers import LayerNormalization, GlobalMaxPooling1D

model = Sequential([Embedding(max_words, 128, input_length=max_len), Bidirectional(GRU(64, return_sequences=True)),
    LayerNormalization(), GlobalMaxPooling1D(), Dropout(0.4),
    Dense(64, activation='relu'), Dropout(0.4), Dense(y_train_cat.shape[1], activation='softmax')])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [12]:
history = model.fit(X_train_pad, y_train_cat, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 153ms/step - accuracy: 0.1260 - loss: 3.0601 - val_accuracy: 0.2744 - val_loss: 2.2469
Epoch 2/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 145ms/step - accuracy: 0.2469 - loss: 2.1942 - val_accuracy: 0.3628 - val_loss: 2.0314
Epoch 3/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 143ms/step - accuracy: 0.3052 - loss: 2.0082 - val_accuracy: 0.5213 - val_loss: 1.8194
Epoch 4/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 144ms/step - accuracy: 0.4499 - loss: 1.6881 - val_accuracy: 0.5671 - val_loss: 1.5807
Epoch 5/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 144ms/step - accuracy: 0.5008 - loss: 1.4438 - val_accuracy: 0.5915 - val_loss: 1.3958
Epoch 6/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 144ms/step - accuracy: 0.5951 - loss: 1.2286 - val_accuracy: 0.6128 - val_loss: 1.2678
Epoch 7/20
[1m41/41[0m [3

In [13]:
loss, accuracy = model.evaluate(X_test_pad, y_test_cat, verbose=0)
print("GRU Accuracy (test): ", accuracy)

y_pred_probs = model.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred_probs, axis=1)
y_true_classes = np.argmax(y_test_cat, axis=1)

# Decode to string labels
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
y_true_labels = label_encoder.inverse_transform(y_true_classes)

from sklearn.metrics import classification_report
print(classification_report(y_true_labels, y_pred_labels))

GRU Accuracy (test):  0.7591241002082825
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 77ms/step
                            precision    recall  f1-score   support

Cardiovascular / Pulmonary       0.83      0.80      0.81        74
      ENT - Otolaryngology       0.88      0.74      0.80        19
          Gastroenterology       0.71      0.82      0.76        45
     Hematology - Oncology       0.75      0.33      0.46        18
                 Neurology       0.68      0.66      0.67        64
   Obstetrics / Gynecology       0.84      0.87      0.86        31
             Ophthalmology       0.81      1.00      0.89        17
                Orthopedic       0.77      0.79      0.78        71
     Pediatrics - Neonatal       0.38      0.36      0.37        14
   Psychiatry / Psychology       0.88      0.64      0.74        11
                   Urology       0.76      0.89      0.82        47

                  accuracy                           0.76       411