In [1]:
# install required dependencies
%pip install kagglehub
%pip install pandas
%pip install nltk
%pip install sklearn
%pip install tensorflow
%pip install matplotlib
%pip install tf-keras
%pip install imbalanced-learn

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [2]:
# import all the required dependencies
import kagglehub
import pandas as pd
import regex as re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Download dataset
path = kagglehub.dataset_download("tboyle10/medicaltranscriptions")
print("Path to dataset files:", path)

dataset = pd.read_csv(path + "/mtsamples.csv")
print("Head: ", dataset.head)

Path to dataset files: /kaggle/input/medicaltranscriptions
Head:  <bound method NDFrame.head of       Unnamed: 0                                        description  \
0              0   A 23-year-old white female presents with comp...   
1              1           Consult for laparoscopic gastric bypass.   
2              2           Consult for laparoscopic gastric bypass.   
3              3                             2-D M-Mode. Doppler.     
4              4                                 2-D Echocardiogram   
...          ...                                                ...   
4994        4994   Patient having severe sinusitis about two to ...   
4995        4995   This is a 14-month-old baby boy Caucasian who...   
4996        4996   A female for a complete physical and follow u...   
4997        4997   Mother states he has been wheezing and coughing.   
4998        4998   Acute allergic reaction, etiology uncertain, ...   

                medical_specialty                  

In [4]:
# Drop rows with missing values in specified columns
dataset.dropna(subset=['transcription', 'medical_specialty'], inplace=True)

# Keep only relevant columns
dataset = dataset[['transcription', 'medical_specialty']]

# Filter medical specialties with at least 30 occurrences
specialty_counts = dataset['medical_specialty'].value_counts()
valid_specialties = specialty_counts[specialty_counts >= 30].index
dataset = dataset[dataset['medical_specialty'].isin(valid_specialties)]

# Strip spaces in 'medical_specialty' column
dataset['medical_specialty'] = dataset['medical_specialty'].str.strip()

# Remove specific categories
excluded_specialties = [
    'Surgery',
    'SOAP / Chart / Progress Notes',
    'Office Notes',
    'Consult - History and Phy.',
    'Emergency Room Reports',
    'Discharge Summary',
    'Pain Management',
    'General Medicine',
    'Radiology',
]

dataset = dataset[~dataset['medical_specialty'].isin(excluded_specialties)]

# Define category mapping to merge similar categories
category_mapping = {
    'Neurosurgery': 'Neurology',
    'Nephrology': 'Urology',
}

# Apply category mapping
dataset['medical_specialty'] = dataset['medical_specialty'].replace(category_mapping)

# Display counts for each category
for i, (category_name, category) in enumerate(dataset.groupby("medical_specialty")):
    print(f"Category {i}: {category_name}: {len(category)}")

Category 0: Cardiovascular / Pulmonary: 371
Category 1: ENT - Otolaryngology: 96
Category 2: Gastroenterology: 224
Category 3: Hematology - Oncology: 90
Category 4: Neurology: 317
Category 5: Obstetrics / Gynecology: 155
Category 6: Ophthalmology: 83
Category 7: Orthopedic: 355
Category 8: Pediatrics - Neonatal: 70
Category 9: Podiatry: 47
Category 10: Psychiatry / Psychology: 53
Category 11: Urology: 237


In [5]:
from sklearn.model_selection import train_test_split

def clean_text(text):
    lemmatizer = WordNetLemmatizer()
    text = text.strip()
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(text) if word not in stopwords.words('english')])
    return text

dataset['processed_transcription'] = dataset['transcription'].apply(clean_text)

X_train, X_test, y_train, y_test = train_test_split(
    dataset['processed_transcription'], dataset['medical_specialty'], test_size=0.2, random_state=42, stratify=dataset['medical_specialty']
)

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import numpy as np

max_words = 50000
max_len = 1000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

y_train_cat = tf.keras.utils.to_categorical(y_train_enc)
y_test_cat = tf.keras.utils.to_categorical(y_test_enc)


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Bidirectional, Dense, Dropout

model = Sequential([Embedding(max_words, 128, input_length=max_len),Bidirectional(GRU(64, return_sequences=False)), Dropout(0.5),
    Dense(64, activation='relu'), Dropout(0.5), Dense(y_train_cat.shape[1], activation='softmax')])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [8]:
history = model.fit(X_train_pad, y_train_cat, epochs=25, batch_size=64, validation_split=0.2, verbose=1)


Epoch 1/25
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 101ms/step - accuracy: 0.1288 - loss: 2.4531 - val_accuracy: 0.1696 - val_loss: 2.3469
Epoch 2/25
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 84ms/step - accuracy: 0.1692 - loss: 2.3365 - val_accuracy: 0.1875 - val_loss: 2.3068
Epoch 3/25
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 84ms/step - accuracy: 0.1834 - loss: 2.3055 - val_accuracy: 0.2232 - val_loss: 2.2875
Epoch 4/25
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 85ms/step - accuracy: 0.2239 - loss: 2.2257 - val_accuracy: 0.2054 - val_loss: 2.5752
Epoch 5/25
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 83ms/step - accuracy: 0.3263 - loss: 1.9816 - val_accuracy: 0.2768 - val_loss: 2.1000
Epoch 6/25
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 84ms/step - accuracy: 0.3831 - loss: 1.7448 - val_accuracy: 0.3363 - val_loss: 2.0300
Epoch 7/25
[1m21/21[0m [32m━━━

In [9]:
loss, accuracy = model.evaluate(X_test_pad, y_test_cat, verbose=0)
print("GRU Accuracy (test): ", accuracy)

y_pred_probs = model.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred_probs, axis=1)
y_true_classes = np.argmax(y_test_cat, axis=1)

# Decode to string labels
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
y_true_labels = label_encoder.inverse_transform(y_true_classes)

from sklearn.metrics import classification_report
print(classification_report(y_true_labels, y_pred_labels))

GRU Accuracy (test):  0.4761904776096344
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step
                            precision    recall  f1-score   support

Cardiovascular / Pulmonary       0.58      0.77      0.66        74
      ENT - Otolaryngology       0.40      0.32      0.35        19
          Gastroenterology       0.47      0.47      0.47        45
     Hematology - Oncology       0.23      0.17      0.19        18
                 Neurology       0.44      0.47      0.45        64
   Obstetrics / Gynecology       0.41      0.42      0.41        31
             Ophthalmology       0.48      0.65      0.55        17
                Orthopedic       0.51      0.48      0.49        71
     Pediatrics - Neonatal       0.00      0.00      0.00        14
                  Podiatry       0.00      0.00      0.00         9
   Psychiatry / Psychology       0.20      0.09      0.12        11
                   Urology       0.50      0.51      0.51        47


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
