In [None]:
# install required dependencies
%pip install kagglehub
%pip install pandas
%pip install nltk

In [None]:
# import all the required dependencies
import kagglehub
import pandas as pd
import regex as re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK "stopwords" corpus that stopwords.words('english') reads during text cleaning.
STOPWORDS_RESOURCE = 'stopwords'
nltk.download(STOPWORDS_RESOURCE)

In [None]:
# Download dataset; the returned value is used below as the local directory
# that contains the downloaded files.
path = kagglehub.dataset_download("tboyle10/medicaltranscriptions")
print("Path to dataset files:", path)

# Load the transcription samples into a DataFrame.
dataset = pd.read_csv(path + "/mtsamples.csv")
# `head` must be called: printing the bare attribute only shows the bound-method
# repr (and dumps the whole frame), not the first rows.
print("Head: ", dataset.head())

In [None]:
# Data cleaning: drop incomplete rows and normalise the text columns

# Rows lacking either the transcription text or its specialty label are unusable
# for classification, so remove them from the frame in place.
required_columns = ['transcription', 'medical_specialty']
dataset.dropna(subset=required_columns, inplace=True)

_ENGLISH_STOPWORDS = None  # lazily-built cache of NLTK's English stop words


def clean_text(text, stop_words=None):
    """Normalise a text string for modelling.

    Steps: strip surrounding whitespace, lowercase, delete every character that
    is not an ASCII letter, digit or whitespace, then drop stop words.

    Args:
        text: The string to clean.
        stop_words: Optional collection of words to remove. When omitted,
            NLTK's English stop-word list is used. It is fetched once and kept
            as a set; the previous version evaluated
            ``stopwords.words('english')`` again for every single token and
            searched the resulting list linearly.

    Returns:
        The cleaned text, with the remaining words joined by single spaces.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS

    text = text.strip().lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

# Run the same cleaning routine over both the free-text column and the label column.
for text_column in ('transcription', 'medical_specialty'):
    dataset[text_column] = dataset[text_column].apply(clean_text)

In [None]:
# Bucket the rows by their (cleaned) specialty label.
specialty_column = dataset["medical_specialty"]
categories = dataset.groupby(specialty_column)

# Report how many samples each specialty contains.
for i, (category_name, category) in enumerate(categories):
    print(f"Category {i}: {category_name}: {len(category)}")

# Print the rows of a single specialty as a spot check.
print(categories.get_group('bariatrics'))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Hold out 20% of the samples for testing, keeping the specialty distribution
# the same in both splits (stratify).
X_train, X_test, y_train, y_test = train_test_split(
    dataset['transcription'], dataset['medical_specialty'], test_size=0.2, random_state=42, stratify=dataset['medical_specialty']
)

# The vectorizer class imported above is TfidfVectorizer; the previous name
# (`TfidatasetVectorizer`) does not exist and raised NameError.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),  # Convert text to TF-IDF features
    ('clf', MultinomialNB())  # Train a Naive Bayes classifier
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, BatchNormalization, Dropout

# Map each specialty string to an integer class id and store it next to the text.
label_encoder = LabelEncoder()
label_encoder.fit(dataset['medical_specialty'])
dataset['encoded_label'] = label_encoder.transform(dataset['medical_specialty'])

# The model below is trained on the 'keywords' column, so rows without keywords are removed.
keyword_columns = ['keywords']
dataset.dropna(subset=keyword_columns, inplace=True)

def extract_keywords(text):
    """Return the top 10 single-word YAKE keywords of ``text`` as one space-separated string."""
    # yake is not imported in any cell of this notebook, so import it here;
    # without this the first call fails with NameError.
    import yake

    kw_extractor = yake.KeywordExtractor(n=1, top=10)  # Extract top 10 single-word keywords
    keywords = kw_extractor.extract_keywords(text)
    # The first element of each result is the keyword text; join those with spaces.
    return ' '.join(kw[0] for kw in keywords)
# Apply keyword extraction
# dataset['new_keywords'] = dataset['transcription'].apply(extract_keywords)

# 'keywords' holds one string per row (comma-separated in this dataset — confirm
# against the CSV). Calling ' '.join() directly on a string puts a space between
# every *character*, so the tokenizer would only ever see single letters. Split on
# commas and re-join the phrases instead; a string without commas passes through
# unchanged, which also makes this cell safe to re-run.
dataset['keywords'] = dataset['keywords'].apply(
    lambda kw: ' '.join(part.strip() for part in kw.split(',') if part.strip())
)
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(
    dataset['keywords'], dataset['encoded_label'], test_size=0.1, random_state=42, stratify=dataset['encoded_label']
)

max_words = 20000  # Increased vocabulary size
max_len = 300  # Increased sequence length
# Fit the vocabulary on the training split only, then convert both splits to
# fixed-length integer sequences.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)
# Build improved CNN model

model = Sequential([
    Embedding(input_dim=max_words, output_dim=256, input_length=max_len),
    Conv1D(256, 5, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dropout(0.3),
    # One output unit per specialty known to the label encoder.
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train model
# NOTE(review): the test split doubles as validation data, so the accuracy printed
# below is not measured on data unseen during training-time monitoring.
model.fit(X_train_seq, y_train, epochs=5, batch_size=64, validation_data=(X_test_seq, y_test))
# Evaluate model
loss, accuracy = model.evaluate(X_test_seq, y_test)
print(f'Accuracy: {accuracy:.4f}')

In [None]:
# Combine transcription and sample name for keyword extraction.
# No 'cleaned_transcription' / 'cleaned_sample_name' columns are created anywhere
# in this notebook (indexing them raises KeyError): 'transcription' was cleaned in
# place earlier, and 'sample_name' was never cleaned, so it is cleaned here.
dataset['combined_text'] = (
    dataset['transcription'] + " " + dataset['sample_name'].fillna('').apply(clean_text)
)

# YAKE Keyword Extraction
def extract_keywords(text, n=2, top=10):
    """Extract YAKE keywords from ``text`` and return them as one space-separated string.

    Args:
        text: The text to extract keywords from.
        n: Value passed to ``yake.KeywordExtractor(n=...)``; the default of 2 is
            what this cell originally hard-coded for bi-gram keywords.
        top: Number of keywords to keep (default 10, as originally hard-coded).

    Returns:
        The extracted keywords joined with single spaces.
    """
    # yake is not imported in any cell of this notebook, so import it here;
    # without this the first call fails with NameError.
    import yake

    kw_extractor = yake.KeywordExtractor(n=n, top=top)
    keywords = kw_extractor.extract_keywords(text)
    # The first element of each result is the keyword text; join those with spaces.
    return ' '.join(kw[0] for kw in keywords)

# `l2` is used by the model below but is not imported anywhere in the notebook;
# bring it into scope here so the layer definitions do not fail with NameError.
from tensorflow.keras.regularizers import l2

# Apply YAKE for keyword extraction (this overwrites the existing 'keywords' column)
dataset['keywords'] = dataset['combined_text'].apply(extract_keywords)
# Encode labels
label_encoder = LabelEncoder()
dataset['encoded_label'] = label_encoder.fit_transform(dataset['medical_specialty'])
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(
    dataset['keywords'], dataset['encoded_label'], test_size=0.2, random_state=42, stratify=dataset['encoded_label']
)

# Tokenization
max_words = 20000  # Increased vocabulary size
max_len = 50  # Reduced sequence length since keywords are shorter
# Fit the vocabulary on the training split only, then convert both splits to
# fixed-length integer sequences.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

# Build improved CNN model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=256, input_length=max_len),
    Conv1D(128, 5, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    # One output unit per specialty known to the label encoder.
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train model
# NOTE(review): the test split doubles as validation data here.
model.fit(X_train_seq, y_train, epochs=10, batch_size=128, validation_data=(X_test_seq, y_test))
# Evaluate model
loss, accuracy = model.evaluate(X_test_seq, y_test)
print(f'Accuracy: {accuracy:.4f}')

In [None]:

# `l2` is used by the model below but is not imported anywhere in the notebook;
# bring it into scope here so the layer definitions do not fail with NameError.
from tensorflow.keras.regularizers import l2

# Tokenization
# X_train / X_test / y_train / y_test are not defined in this cell; they are the
# splits left in the kernel by an earlier cell.
max_words = 20000  # Increased vocabulary size
max_len = 500  # Longer padded sequence length for this run
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

# Build improved CNN model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=256, input_length=max_len),
    Conv1D(128, 5, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    # One output unit per specialty known to the label encoder.
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train model
# NOTE(review): the test split doubles as validation data here.
model.fit(X_train_seq, y_train, epochs=20, batch_size=128, validation_data=(X_test_seq, y_test))
# Evaluate model
loss, accuracy = model.evaluate(X_test_seq, y_test)
print(f'Accuracy: {accuracy:.4f}')