In [12]:
# install required dependencies
%pip install kagglehub
%pip install pandas
%pip install nltk
%pip install sklearn
%pip install tensorflow
%pip install matplotlib
%pip install tf-keras

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [1]:
# import all the required dependencies
import kagglehub
import random
import numpy as np
import pandas as pd
import regex as re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from multiprocessing import Pool, cpu_count

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
# Download dataset
path = kagglehub.dataset_download("tboyle10/medicaltranscriptions")
print("Path to dataset files:", path)

dataset = pd.read_csv(path + "/mtsamples.csv")
print("Head: ", dataset.head)

Downloading from https://www.kaggle.com/api/v1/datasets/download/tboyle10/medicaltranscriptions?dataset_version_number=1...


100%|██████████| 4.85M/4.85M [00:00<00:00, 201MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/tboyle10/medicaltranscriptions/versions/1





Head:  <bound method NDFrame.head of       Unnamed: 0                                        description  \
0              0   A 23-year-old white female presents with comp...   
1              1           Consult for laparoscopic gastric bypass.   
2              2           Consult for laparoscopic gastric bypass.   
3              3                             2-D M-Mode. Doppler.     
4              4                                 2-D Echocardiogram   
...          ...                                                ...   
4994        4994   Patient having severe sinusitis about two to ...   
4995        4995   This is a 14-month-old baby boy Caucasian who...   
4996        4996   A female for a complete physical and follow u...   
4997        4997   Mother states he has been wheezing and coughing.   
4998        4998   Acute allergic reaction, etiology uncertain, ...   

                medical_specialty                                sample_name  \
0            Allergy / Immunol

## Data processing

- We drop every other column except transcription and medical_specialty.
- We also drop any rows with empty or null transcription or medical_specialty.

- Then we drop all the classes in the excluded specialties list below. We do this as these are general terms and don't specifically map to any specialty.
- We then merge the classes with large overlaps - e.g. Neurosurgery and neurology, Neurosurgery is a subset of neurology.

In [3]:
# Drop rows with missing values in specified columns
dataset.dropna(subset=['transcription', 'medical_specialty'], inplace=True)

# Keep only relevant columns
dataset = dataset[['transcription', 'medical_specialty']]

# # Filter medical specialties with at least 30 occurrences
specialty_counts = dataset['medical_specialty'].value_counts()
# valid_specialties = specialty_counts[specialty_counts >= 50].index
# dataset = dataset[dataset['medical_specialty'].isin(valid_specialties)]

# Strip spaces in 'medical_specialty' column
dataset['medical_specialty'] = dataset['medical_specialty'].str.strip()

# Remove specific categories
excluded_specialties = [
    'Surgery',
    'SOAP / Chart / Progress Notes',
    'Office Notes',
    'Consult - History and Phy.',
    'Emergency Room Reports',
    'Discharge Summary',
    'Pain Management',
    'General Medicine',
    'Radiology',
]

dataset = dataset[~dataset['medical_specialty'].isin(excluded_specialties)]

# Define category mapping to merge similar categories
category_mapping = {
    'Neurosurgery': 'Neurology',
    'Nephrology': 'Urology',
}

# Apply category mapping
dataset['medical_specialty'] = dataset['medical_specialty'].replace(category_mapping)

# Display counts for each category
for i, (category_name, category) in enumerate(dataset.groupby("medical_specialty")):
    print(f"Category {i}: {category_name}: {len(category)}")

Category 0: Allergy / Immunology: 7
Category 1: Autopsy: 8
Category 2: Bariatrics: 18
Category 3: Cardiovascular / Pulmonary: 371
Category 4: Chiropractic: 14
Category 5: Cosmetic / Plastic Surgery: 27
Category 6: Dentistry: 27
Category 7: Dermatology: 29
Category 8: Diets and Nutritions: 10
Category 9: ENT - Otolaryngology: 96
Category 10: Endocrinology: 19
Category 11: Gastroenterology: 224
Category 12: Hematology - Oncology: 90
Category 13: Hospice - Palliative Care: 6
Category 14: IME-QME-Work Comp etc.: 16
Category 15: Lab Medicine - Pathology: 8
Category 16: Letters: 23
Category 17: Neurology: 317
Category 18: Obstetrics / Gynecology: 155
Category 19: Ophthalmology: 83
Category 20: Orthopedic: 355
Category 21: Pediatrics - Neonatal: 70
Category 22: Physical Medicine - Rehab: 21
Category 23: Podiatry: 47
Category 24: Psychiatry / Psychology: 53
Category 25: Rheumatology: 10
Category 26: Sleep Medicine: 20
Category 27: Speech - Language: 9
Category 28: Urology: 237


In [4]:
# Helper: Get synonyms from WordNet
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ')
            if synonym.lower() != word.lower():
                synonyms.add(synonym)
    return list(synonyms)

# Synonym Replacement with random n
def synonym_replacement(text, n=None):
    words = word_tokenize(text)
    new_words = words.copy()
    eligible_words = list(set([word for word in words if word.isalpha()]))
    random.shuffle(eligible_words)

    if n is None:
        n = random.randint(1, min(3, len(eligible_words)))

    num_replaced = 0
    for word in eligible_words:
        synonyms = get_synonyms(word)
        if synonyms:
            synonym = random.choice(synonyms)
            new_words = [synonym if w == word else w for w in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)

# Word Dropout
def word_dropout(text, dropout_prob=0.1):
    words = word_tokenize(text)
    new_words = [word for word in words if random.random() > dropout_prob]
    return ' '.join(new_words) if new_words else text

# Random Swap
def random_swap(text, n=1):
    words = word_tokenize(text)
    if len(words) < 2:
        return text
    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

# Apply 1–2 random augmentations
def augment_text_randomly(text):
    aug_functions = [
        synonym_replacement,
        word_dropout,
        random_swap
    ]
    num_augs = random.randint(1, 2)
    selected_augs = random.sample(aug_functions, num_augs)
    for aug in selected_augs:
        text = aug(text)
    return text

# Parallel augmentation per class
def augment_class(label_samples_tuple):
    label, samples_needed, class_samples = label_samples_tuple
    samples = class_samples.sample(n=samples_needed, replace=True)
    texts = samples['transcription'].tolist()

    with Pool(cpu_count()) as p:
        augmented_texts = p.map(augment_text_randomly, texts)

    return pd.DataFrame({
        'transcription': augmented_texts,
        'medical_specialty': label
    })

specialty_counts = dataset['medical_specialty'].value_counts()
max_count = specialty_counts.max()

# Prepare augmentation tasks
tasks = []
for label, count in specialty_counts.items():
    if count < max_count:
        samples_needed = max_count - count
        class_samples = dataset[dataset['medical_specialty'] == label]
        tasks.append((label, samples_needed, class_samples))

# Run augmentations
augmented_dfs = [augment_class(task) for task in tasks]

# Combine datasets
augmented_df = pd.concat(augmented_dfs, ignore_index=True)
dataset = pd.concat([dataset, augmented_df], ignore_index=True)


## Data processing

- We clean the text and then tokenize it and then lemmatize all the words in it.

In [6]:
from sklearn.model_selection import train_test_split

lemmatizer = None
stop_words = None

def init_worker():
    global lemmatizer, stop_words
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.strip().lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

def apply_multiprocessing(series, func, workers=None):
    with Pool(processes=workers or cpu_count(), initializer=init_worker) as pool:
        results = pool.map(func, series)
    return results

# Use multiprocessing to speed it up
dataset['processed_transcription'] = apply_multiprocessing(dataset['transcription'], clean_text)

X_train, X_test, y_train, y_test = train_test_split(
    dataset['processed_transcription'], dataset['medical_specialty'], test_size=0.2, random_state=42, stratify=dataset['medical_specialty']
)

## Naive Bayes model

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

pipeline = Pipeline([
    ('tfidataset', TfidfVectorizer(analyzer='word', stop_words='english',ngram_range=(1,4), max_df=0.8, use_idf=True, smooth_idf=True, max_features=2000)),
    ('clf', MultinomialNB())  # Train a Naive Bayes classifier
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.8448
                            precision    recall  f1-score   support

      Allergy / Immunology       0.85      1.00      0.92        74
                   Autopsy       1.00      0.95      0.97        74
                Bariatrics       0.92      0.82      0.87        74
Cardiovascular / Pulmonary       0.82      0.80      0.81        74
              Chiropractic       0.68      0.84      0.75        74
Cosmetic / Plastic Surgery       0.51      0.97      0.67        74
                 Dentistry       0.89      0.97      0.93        75
               Dermatology       0.98      0.85      0.91        75
      Diets and Nutritions       0.94      1.00      0.97        74
      ENT - Otolaryngology       0.90      0.64      0.75        74
             Endocrinology       0.92      0.92      0.92        75
          Gastroenterology       0.88      0.72      0.79        74
     Hematology - Oncology       0.84      0.69      0.76        74
 Hospice - Palliative Care    

## Logistic Regression Model

In [8]:
# Implement Logistic Regression model
from sklearn.linear_model import LogisticRegression

# Create a pipeline with TF-IDF and Logistic Regression
lr_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,4),
                             max_df=0.8, use_idf=True, smooth_idf=True, max_features=2000)),
    ('clf', LogisticRegression(max_iter=1000, C=1.0, solver='liblinear', multi_class='ovr'))
])

# Train the model
lr_pipeline.fit(X_train, y_train)

# Make predictions
lr_y_pred = lr_pipeline.predict(X_test)

# Evaluate the model
lr_accuracy = accuracy_score(y_test, lr_y_pred)
print(f'Logistic Regression Accuracy: {lr_accuracy:.4f}')
print(classification_report(y_test, lr_y_pred))



Logistic Regression Accuracy: 0.9010
                            precision    recall  f1-score   support

      Allergy / Immunology       0.99      1.00      0.99        74
                   Autopsy       1.00      0.96      0.98        74
                Bariatrics       0.92      0.91      0.91        74
Cardiovascular / Pulmonary       0.83      0.86      0.85        74
              Chiropractic       0.73      0.84      0.78        74
Cosmetic / Plastic Surgery       0.81      0.92      0.86        74
                 Dentistry       0.99      0.97      0.98        75
               Dermatology       0.97      0.95      0.96        75
      Diets and Nutritions       1.00      1.00      1.00        74
      ENT - Otolaryngology       0.90      0.82      0.86        74
             Endocrinology       0.93      1.00      0.96        75
          Gastroenterology       0.91      0.81      0.86        74
     Hematology - Oncology       0.79      0.85      0.82        74
 Hospice -

## CNN model

In [9]:
# %%
# Implement CNN model for text classification
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Parameters
max_features = 50000  # Max vocabulary size
maxlen = 2000  # Max sequence length
embedding_dims = 500  # Embedding dimension
batch_size = 128
epochs = 10

# Convert text to sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform length
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Convert to categorical
y_train_cat = tf.keras.utils.to_categorical(y_train_encoded)
y_test_cat = tf.keras.utils.to_categorical(y_test_encoded)

# Build the CNN model
model = tf.keras.Sequential([
    Embedding(max_features, embedding_dims, input_length=maxlen),
    Dropout(0.2),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(y_train_cat.shape[1], activation='softmax')
])

# Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Print model summary
model.summary()

# Train the model
history = model.fit(
    X_train_pad,
    y_train_cat,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    verbose=1
)

# Evaluate the model
loss, cnn_accuracy = model.evaluate(X_test_pad, y_test_cat, verbose=0)
print(f'CNN Model Accuracy: {cnn_accuracy:.4f}')

# Get predictions
y_pred_probs = model.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred_probs, axis=1)
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)

# Print classification report
print(classification_report(y_test, y_pred_labels))

# Compare with previous models
print(f'Naive Bayes Accuracy: {accuracy:.4f}')
print(f'Logistic Regression Accuracy: {lr_accuracy:.4f}')
print(f'CNN Accuracy: {cnn_accuracy:.4f}')



Epoch 1/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 381ms/step - accuracy: 0.2143 - loss: 3.1499 - val_accuracy: 0.6940 - val_loss: 1.7550
Epoch 2/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 200ms/step - accuracy: 0.7245 - loss: 1.3087 - val_accuracy: 0.8833 - val_loss: 0.6031
Epoch 3/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 207ms/step - accuracy: 0.8963 - loss: 0.4545 - val_accuracy: 0.8990 - val_loss: 0.4119
Epoch 4/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 209ms/step - accuracy: 0.9273 - loss: 0.2671 - val_accuracy: 0.8995 - val_loss: 0.4189
Epoch 5/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 207ms/step - accuracy: 0.9417 - loss: 0.1947 - val_accuracy: 0.8931 - val_loss: 0.4118
Epoch 6/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 209ms/step - accuracy: 0.9442 - loss: 0.1762 - val_accuracy: 0.8879 - val_loss: 0.4223
Epoch 7/10
[1m54/54[

## Transformers model

In [10]:
from sklearn.preprocessing import LabelEncoder
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
from tf_keras.optimizers.legacy import Adam
from tf_keras.losses import SparseCategoricalCrossentropy
from sklearn.metrics import classification_report
import numpy as np
from tf_keras import mixed_precision

mixed_precision.set_global_policy('mixed_float16')

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Parameters
max_len = 256  # Max sequence length
batch_size = 32
epochs = 6

# Load a pre-trained BERT model and tokenizer from Hugging Face
model_name = "medicalai/ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformer_model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_), from_pt=True)

def encode_texts(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=max_len, return_tensors="tf")

X_train_encoded = encode_texts(X_train)
X_test_encoded = encode_texts(X_test)

# Fine-tune the model
transformer_model.compile(optimizer=Adam(learning_rate=2e-5), loss=SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

# Train the model
history_transformer = transformer_model.fit(
    X_train_encoded['input_ids'], y_train_encoded,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    verbose=1
)

# Evaluate the model
loss, transformer_accuracy = transformer_model.evaluate(X_test_encoded['input_ids'], y_test_encoded, verbose=0)
print(f'Transformer Model Accuracy: {transformer_accuracy:.4f}')

y_pred_probs_transformer = transformer_model.predict(X_test_encoded['input_ids'])
y_pred_classes_transformer = np.argmax(y_pred_probs_transformer.logits, axis=1)
y_pred_labels_transformer = label_encoder.inverse_transform(y_pred_classes_transformer)

print(classification_report(y_test, y_pred_labels_transformer))

# Compare with previous models
print(f'Naive Bayes Accuracy: {accuracy:.4f}')
print(f'Logistic Regression Accuracy: {lr_accuracy:.4f}')
print(f'CNN Accuracy: {cnn_accuracy:.4f}')
print(f'Transformer Accuracy: {transformer_accuracy:.4f}')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'cla

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Transformer Model Accuracy: 0.8964
                            precision    recall  f1-score   support

      Allergy / Immunology       0.97      1.00      0.99        74
                   Autopsy       1.00      0.99      0.99        74
                Bariatrics       0.89      1.00      0.94        74
Cardiovascular / Pulmonary       0.77      0.89      0.82        74
              Chiropractic       0.80      0.93      0.86        74
Cosmetic / Plastic Surgery       0.91      0.72      0.80        74
                 Dentistry       0.96      1.00      0.98        75
               Dermatology       0.90      0.96      0.93        75
      Diets and Nutritions       1.00      1.00      1.00        74
      ENT - Otolaryngology       0.92      0.78      0.85        74
             Endocrinology       0.94      1.00      0.97        75
          Gastroenterology       0.88      0.85      0.86        74
     Hematology - On