In [None]:
%pip install kagglehub
%pip install pandas
%pip install nltk
%pip install sklearn
%pip install tensorflow
%pip install matplotlib
%pip install tf-keras
%pip install imbalanced-learn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [None]:
import kagglehub
import pandas as pd
import regex as re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
dataset = pd.read_csv("/content/mtsamples.csv")
print("Head: ", dataset.head)

Head:  <bound method NDFrame.head of       Unnamed: 0                                        description  \
0              0   A 23-year-old white female presents with comp...   
1              1           Consult for laparoscopic gastric bypass.   
2              2           Consult for laparoscopic gastric bypass.   
3              3                             2-D M-Mode. Doppler.     
4              4                                 2-D Echocardiogram   
...          ...                                                ...   
4994        4994   Patient having severe sinusitis about two to ...   
4995        4995   This is a 14-month-old baby boy Caucasian who...   
4996        4996   A female for a complete physical and follow u...   
4997        4997   Mother states he has been wheezing and coughing.   
4998        4998   Acute allergic reaction, etiology uncertain, ...   

                medical_specialty                                sample_name  \
0            Allergy / Immunol

## Data processing

- We drop every other column except transcription and medical_specialty.
- We also drop any rows with empty or null transcription or medical_specialty.

- Then we drop all the classes in the excluded specialties list below. We do this as these are general terms and don't specifically map to any specialty.
- We then merge the classes with large overlaps - e.g. Neurosurgery and neurology, Neurosurgery is a subset of neurology.

In [None]:

dataset.dropna(subset=['transcription', 'medical_specialty'], inplace=True)

dataset = dataset[['transcription', 'medical_specialty']]

specialty_counts = dataset['medical_specialty'].value_counts()
valid_specialties = specialty_counts[specialty_counts >= 30].index
dataset = dataset[dataset['medical_specialty'].isin(valid_specialties)]

dataset['medical_specialty'] = dataset['medical_specialty'].str.strip()

excluded_specialties = [
    'Surgery',
    'SOAP / Chart / Progress Notes',
    'Office Notes',
    'Consult - History and Phy.',
    'Emergency Room Reports',
    'Discharge Summary',
    'Pain Management',
    'General Medicine',
    'Radiology',
]

dataset = dataset[~dataset['medical_specialty'].isin(excluded_specialties)]

category_mapping = {
    'Neurosurgery': 'Neurology',
    'Nephrology': 'Urology',
}

dataset['medical_specialty'] = dataset['medical_specialty'].replace(category_mapping)

for i, (category_name, category) in enumerate(dataset.groupby("medical_specialty")):
    print(f"Category {i}: {category_name}: {len(category)}")


Category 0: Cardiovascular / Pulmonary: 371
Category 1: ENT - Otolaryngology: 96
Category 2: Gastroenterology: 224
Category 3: Hematology - Oncology: 90
Category 4: Neurology: 317
Category 5: Obstetrics / Gynecology: 155
Category 6: Ophthalmology: 83
Category 7: Orthopedic: 355
Category 8: Pediatrics - Neonatal: 70
Category 9: Podiatry: 47
Category 10: Psychiatry / Psychology: 53
Category 11: Urology: 237


## Data processing

- We clean the text and then tokenize it and then lemmatize all the words in it.

In [None]:
from sklearn.model_selection import train_test_split

def clean_text(text):
    lemmatizer = WordNetLemmatizer()
    text = text.strip()
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(text) if word not in stopwords.words('english')])
    return text

dataset['processed_transcription'] = dataset['transcription'].apply(clean_text)

X_train, X_test, y_train, y_test = train_test_split(
    dataset['processed_transcription'], dataset['medical_specialty'], test_size=0.2, random_state=42, stratify=dataset['medical_specialty']
)

BERT embedding

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import numpy as np
from tqdm import tqdm

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts):
    embeddings = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
        with torch.no_grad():
            outputs = bert(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
        embeddings.append(cls_embedding)
    return np.array(embeddings)

label_encoder = LabelEncoder()

y_encoded = label_encoder.fit_transform(y_train)

X_train_bert = get_bert_embeddings(X_train)
X_test_bert = get_bert_embeddings(X_test)

y_train_cat = to_categorical(y_encoded)
y_test_encoded_cat = label_encoder.transform(y_test)
y_test_cat = to_categorical(y_test_encoded_cat)

model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(768,)))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train_bert, y_train_cat, epochs=20, batch_size=32, validation_split=0.2)

y_pred_probs = model.predict(X_test_bert)
y_pred = np.argmax(y_pred_probs, axis=1)

from sklearn.metrics import classification_report
print(classification_report(y_test_encoded_cat, y_pred, digits=4))

100%|██████████| 1678/1678 [12:24<00:00,  2.25it/s]
100%|██████████| 420/420 [03:09<00:00,  2.22it/s]
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 81ms/step - accuracy: 0.1722 - loss: 2.3542 - val_accuracy: 0.2232 - val_loss: 2.1914
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2916 - loss: 2.1482 - val_accuracy: 0.3036 - val_loss: 2.0525
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3029 - loss: 2.0089 - val_accuracy: 0.3363 - val_loss: 1.8502
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3974 - loss: 1.7995 - val_accuracy: 0.4464 - val_loss: 1.6781
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4284 - loss: 1.6910 - val_accuracy: 0.4613 - val_loss: 1.6167
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4965 - loss: 1.5148 - val_accuracy: 0.4583 - val_loss: 1.5593
Epoch 7/20
[1m42/42[0m [32m━━━━━━━━━

LSTM

In [1]:
%pip install kagglehub
%pip install pandas
%pip install nltk
%pip install sklearn
%pip install tensorflow
%pip install matplotlib
%pip install tf-keras
%pip install imbalanced-learn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [3]:
import kagglehub
import pandas as pd
import regex as re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
path = kagglehub.dataset_download("tboyle10/medicaltranscriptions")
print("Path to dataset files:", path)

dataset = pd.read_csv(path + "/mtsamples.csv")
print("Head: ", dataset.head)

Downloading from https://www.kaggle.com/api/v1/datasets/download/tboyle10/medicaltranscriptions?dataset_version_number=1...


100%|██████████| 4.85M/4.85M [00:01<00:00, 3.76MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/tboyle10/medicaltranscriptions/versions/1
Head:  <bound method NDFrame.head of       Unnamed: 0                                        description  \
0              0   A 23-year-old white female presents with comp...   
1              1           Consult for laparoscopic gastric bypass.   
2              2           Consult for laparoscopic gastric bypass.   
3              3                             2-D M-Mode. Doppler.     
4              4                                 2-D Echocardiogram   
...          ...                                                ...   
4994        4994   Patient having severe sinusitis about two to ...   
4995        4995   This is a 14-month-old baby boy Caucasian who...   
4996        4996   A female for a complete physical and follow u...   
4997        4997   Mother states he has been wheezing and coughing.   
4998        4998   Acute allergic reaction, etiology uncertain, ...   

             

In [5]:
dataset.dropna(subset=['transcription', 'medical_specialty'], inplace=True)

dataset = dataset[['transcription', 'medical_specialty']]

specialty_counts = dataset['medical_specialty'].value_counts()
valid_specialties = specialty_counts[specialty_counts >= 50].index
dataset = dataset[dataset['medical_specialty'].isin(valid_specialties)]

dataset['medical_specialty'] = dataset['medical_specialty'].str.strip()

excluded_specialties = [
    'Surgery',
    'SOAP / Chart / Progress Notes',
    'Office Notes',
    'Consult - History and Phy.',
    'Emergency Room Reports',
    'Discharge Summary',
    'Pain Management',
    'General Medicine',
    'Radiology',
]

dataset = dataset[~dataset['medical_specialty'].isin(excluded_specialties)]

category_mapping = {
    'Neurosurgery': 'Neurology',
    'Nephrology': 'Urology',
}

dataset['medical_specialty'] = dataset['medical_specialty'].replace(category_mapping)

for i, (category_name, category) in enumerate(dataset.groupby("medical_specialty")):
    print(f"Category {i}: {category_name}: {len(category)}")

Category 0: Cardiovascular / Pulmonary: 371
Category 1: ENT - Otolaryngology: 96
Category 2: Gastroenterology: 224
Category 3: Hematology - Oncology: 90
Category 4: Neurology: 317
Category 5: Obstetrics / Gynecology: 155
Category 6: Ophthalmology: 83
Category 7: Orthopedic: 355
Category 8: Pediatrics - Neonatal: 70
Category 9: Psychiatry / Psychology: 53
Category 10: Urology: 237


In [6]:
from sklearn.model_selection import train_test_split

def clean_text(text):
    lemmatizer = WordNetLemmatizer()
    text = text.strip()
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(text) if word not in stopwords.words('english')])
    return text

dataset['processed_transcription'] = dataset['transcription'].apply(clean_text)

X_train, X_test, y_train, y_test = train_test_split(
    dataset['processed_transcription'], dataset['medical_specialty'], test_size=0.2, random_state=42, stratify=dataset['medical_specialty']
)

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalMaxPooling1D, LayerNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import numpy as np
import tensorflow as tf
# Import Tokenizer from tensorflow.keras.preprocessing.text
from tensorflow.keras.preprocessing.text import Tokenizer

max_words = 50000
max_len = 2000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

y_train_cat = tf.keras.utils.to_categorical(y_train_enc)
y_test_cat = tf.keras.utils.to_categorical(y_test_enc)

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(64, return_sequences=True, dropout=0.3))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.4))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(y_train_cat.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_train_pad, y_train_cat, epochs=70, batch_size=32, validation_split=0.2, verbose=1)

y_pred_probs = model.predict(X_test_pad)
y_pred = np.argmax(y_pred_probs, axis=1)

print("Classification Report:")
from sklearn.metrics import classification_report
print(classification_report(y_test_enc, y_pred, digits=4))

Epoch 1/70




[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 81ms/step - accuracy: 0.1438 - loss: 2.3555 - val_accuracy: 0.1159 - val_loss: 2.2748
Epoch 2/70
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 69ms/step - accuracy: 0.1868 - loss: 2.2242 - val_accuracy: 0.1159 - val_loss: 2.2551
Epoch 3/70
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 70ms/step - accuracy: 0.2248 - loss: 2.1804 - val_accuracy: 0.1616 - val_loss: 2.1489
Epoch 4/70
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 68ms/step - accuracy: 0.2779 - loss: 1.9747 - val_accuracy: 0.3079 - val_loss: 1.9448
Epoch 5/70
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 74ms/step - accuracy: 0.3524 - loss: 1.7451 - val_accuracy: 0.3476 - val_loss: 1.7133
Epoch 6/70
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 68ms/step - accuracy: 0.4875 - loss: 1.5303 - val_accuracy: 0.5061 - val_loss: 1.5092
Epoch 7/70
[1m41/41[0m [32m━━━━━━━━━━━━━━

In [10]:
model.save('model.h5')

