In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import random
random.seed(42)

from nltk.corpus import stopwords
import nltk
import numpy as np
import re

In [None]:
# Load the raw MTSamples table (path is relative to the notebook) and preview it.
raw_path = "../data/mtsamples.csv"
df = pd.read_csv(raw_path)
print(df.shape[0])
df.head()

Because the specialty distribution has a long tail, we focus only on the top specialties, i.e. those with more than 100 samples. Note that the label semantics are somewhat inconsistent: visit types are mixed in with actual medical specialties. We'll take a look at that in the error analysis.

In [None]:
# Count samples per specialty, then keep only specialties with more than 100 samples.
specialty_counts = df["medical_specialty"].value_counts()
top_specialties = specialty_counts.loc[specialty_counts > 100]
top_specialties



In [None]:
# Restrict the dataset to the frequent specialties and attach one-hot targets.
classes = list(top_specialties.keys())
prev_count = len(df)
# .copy() yields an independent frame, so adding the "target" column below does
# not trigger SettingWithCopyWarning. reset_index makes row labels equal to row
# positions, which is how the train/test split and the feature matrix address rows.
df = df.loc[df.medical_specialty.isin(classes)].copy().reset_index(drop=True)
print(f"Went from {prev_count} samples to {len(df)} samples")

# Map each specialty name to its column in the one-hot target vector.
class_dict = {c: i for i, c in enumerate(classes)}


def one_hot_encode(specialty):
    """Return an int vector of length len(classes) with a 1 at the specialty's index.

    Raises:
        KeyError: if `specialty` is not a key of `class_dict`.
    """
    y = np.zeros(len(classes), dtype=int)
    y[class_dict[specialty]] = 1
    return y


df["target"] = df.medical_specialty.apply(one_hot_encode)
df.head()

In [None]:
# Train/test split over row positions (85% train, 15% test).
# A dedicated seeded generator is used because the notebook only seeds the
# stdlib `random` module, which has no effect on NumPy's shuffling; without a
# NumPy seed the split would differ on every kernel restart.
split_rng = np.random.default_rng(42)
indices = split_rng.permutation(len(df))

# Calculate split point (85% train, 15% test)
split_idx = int(0.85 * len(indices))
train_indices = indices[:split_idx]
test_indices = indices[split_idx:]

# Every row must land in exactly one of the two partitions.
assert len(train_indices) + len(test_indices) == len(df)

len(train_indices), len(test_indices)


# Pre-processing
- lower case
- remove punctuation and symbols
- remove stopwords

# Featurization
- BoW
- TF-IDF
- Embedding layer
- Pre-trained embedding

In [None]:
def _load_stop_words():
    """Return NLTK's English stopwords as a frozenset, downloading the corpus if it is missing."""
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    return frozenset(stopwords.words('english'))


# Built once up front instead of re-checking the corpus and rebuilding the set
# for every single document.
STOP_WORDS = _load_stop_words()


def preprocess_text(text, stop_words=STOP_WORDS):
    """Lowercase `text`, strip punctuation/symbols and drop stopwords.

    Args:
        text: Raw transcription. Missing values (None/NaN) are treated as an
            empty document rather than being stringified into the token "nan".
        stop_words: Collection of lowercase words to remove; defaults to the
            NLTK English stopword list.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if pd.isna(text):
        return ''
    text = str(text).lower()
    # Drop every character that is neither a word character nor whitespace.
    # Apostrophes are removed here, before stopword matching, so contractions
    # no longer match stopword entries that contain an apostrophe.
    text = re.sub(r'[^\w\s]', '', text)
    return ' '.join(word for word in text.split() if word not in stop_words)


df['processed_transcription'] = df['transcription'].apply(preprocess_text)

# Display a sample to verify the preprocessing
print("Original vs Processed example:")
sample_idx = 0
# str(...) keeps the slice valid even if the sampled transcription is missing (NaN).
print("Original:", str(df['transcription'].iloc[sample_idx])[:100], "...")
print("Processed:", df['processed_transcription'].iloc[sample_idx][:100], "...")
df.head()

In [None]:
# Corpus-wide token frequencies over the preprocessed transcriptions.
vocab = {}
for document in df["processed_transcription"]:
    for token in document.split():
        vocab[token] = vocab.get(token, 0) + 1
len(vocab)

In [None]:
# Word frequencies as a Series, most frequent first.
word_counts = pd.Series(vocab).sort_values(ascending=False)

# Bar chart of the 50 most frequent words.
fig_top, ax_top = plt.subplots(figsize=(12, 6))
word_counts.head(50).plot(kind='bar', ax=ax_top)
ax_top.set_title('Top 50 Words by Frequency')
ax_top.set_xlabel('Words')
ax_top.set_ylabel('Frequency')
ax_top.tick_params(axis='x', labelrotation=90)
fig_top.tight_layout()
plt.show()

# Rank-versus-frequency curve over the whole vocabulary, on log-log axes.
fig_rank, ax_rank = plt.subplots(figsize=(12, 6))
ax_rank.plot(range(len(word_counts)), word_counts.values)
ax_rank.set_title('Word Frequency Distribution')
ax_rank.set_xlabel('Word Rank')
ax_rank.set_ylabel('Frequency')
ax_rank.set_xscale('log')
ax_rank.set_yscale('log')
ax_rank.grid(True, alpha=0.3)
fig_rank.tight_layout()
plt.show()

# The 20 most frequent words as the cell's output.
word_counts.head(20)

In [None]:
# Keep only words that occur more than k times in the corpus.
k = 10
# Original note on this filter: "same order of magnitude as n".
vocab_filtered = [word for word, count in vocab.items() if count > k]
p = len(vocab_filtered)
p

In [None]:
from collections import Counter


def get_word_counts(s):
    """Return a Counter mapping each whitespace-separated token of `s` to its count."""
    return Counter(s.split())


n = len(df)

# Column index of every retained vocabulary word, so each document only touches
# the words it actually contains instead of scanning the whole vocabulary.
word_to_col = {word: j for j, word in enumerate(vocab_filtered)}

X = np.zeros((n, p))
y = np.zeros((n, len(classes)))

# Rows are addressed by position (enumerate), not by DataFrame label: after
# filtering, labels need not run 0..n-1. Each document fills exactly one row;
# a slice like X[i:] would overwrite every row from i onward.
for row_pos, (doc, target) in enumerate(zip(df["processed_transcription"], df["target"])):
    for word, count in get_word_counts(doc).items():
        col = word_to_col.get(word)
        if col is not None:
            X[row_pos, col] = count
    y[row_pos] = target

# Each document must carry exactly one class label.
assert (y.sum(axis=1) == 1).all()

X, y


In [None]:
# Sparsity: the share of entries in X that are exactly zero.
zero_entries = int((X == 0).sum())
sparsity = zero_entries / X.size
print(f"Sparsity of X: {sparsity:.4f} ({sparsity*100:.2f}%)")

# Density is the complement: the share of non-zero entries.
density = 1 - sparsity
print(f"Density of X: {density:.4f} ({density*100:.2f}%)")


In [None]:
# Preview the rows of the feature matrix X and label matrix y selected by train_indices.
X[train_indices], y[train_indices]

In [127]:
import torch
from torch.nn import Linear

class LogisticRegression(torch.nn.Module):
    """Multinomial logistic regression: a single affine layer returning raw logits.

    No softmax is applied in `forward`; it is not needed when the model is
    trained with CrossEntropyLoss.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the unnormalized class scores for the batch `x`."""
        logits = self.linear(x)
        return logits
    
class MLP(torch.nn.Module):
    """One-hidden-layer perceptron with ReLU activation that returns raw logits.

    No softmax is applied in `forward`; the output is the unnormalized score
    per class, matching LogisticRegression in this notebook.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden = Linear(input_dim, hidden_dim)
        self.linear = Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Apply hidden layer -> ReLU -> output layer and return the logits."""
        x = self.hidden(x)
        x = torch.nn.functional.relu(x)
        # Fixed: the original line `return x self.linear(x)` was a syntax error.
        return self.linear(x)

In [141]:
# Slice the feature and one-hot label matrices into train and test partitions.
X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]

# Features become float tensors; labels become integer class ids, taken as the
# argmax of each one-hot row.
X_train_tensor = torch.FloatTensor(X_train)
X_test_tensor = torch.FloatTensor(X_test)
y_train_tensor = torch.LongTensor(y_train.argmax(axis=1))
y_test_tensor = torch.LongTensor(y_test.argmax(axis=1))

In [None]:
# y_test_tensor already holds 1-D class indices (it is built from
# y_test.argmax(axis=1)), so torch.max(..., dim=1) would fail with a
# dimension-out-of-range error. The labels can be used directly.
true_labels = y_test_tensor

In [151]:
def train_and_evaluate(model_type, hidden_dim_factor, learning_rate, weight_decay,
                       batch_size, iterations, val_fraction=0.15):
    """Train one classifier with mini-batch SGD and report losses and test accuracy.

    Reads the notebook-level `X`, `y` (one-hot), `classes`, `train_indices` and
    `test_indices`, plus the `LogisticRegression` and `MLP` model classes.

    Args:
        model_type: "logistic" selects LogisticRegression; any other value
            selects the MLP.
        hidden_dim_factor: MLP hidden width as a fraction of the input
            dimension (ignored for "logistic"); the width is at least 1.
        learning_rate: SGD learning rate.
        weight_decay: SGD weight decay.
        batch_size: Number of training rows sampled per iteration.
        iterations: Number of SGD steps.
        val_fraction: Share of the training rows held out for validation and
            never used for gradient steps. 0 disables validation.

    Returns:
        dict with the hyperparameters, 'final_train_loss', 'final_val_loss'
        (each None when no such loss was recorded) and 'test_accuracy'.

    Raises:
        ValueError: if `val_fraction` is outside [0, 1).
    """
    if not 0.0 <= val_fraction < 1.0:
        raise ValueError("val_fraction must be in [0, 1)")

    input_dim = X.shape[1]
    num_classes = len(classes)

    if model_type == "logistic":
        model = LogisticRegression(input_dim, num_classes)
    else:
        # Clamp so that a small factor cannot produce a zero-width hidden layer.
        hidden_dim = max(1, int(input_dim * hidden_dim_factor))
        model = MLP(input_dim, hidden_dim, num_classes)
    loss_function = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Build every tensor from the same source arrays so the function does not
    # depend on tensors created in another cell.
    features = torch.FloatTensor(X[train_indices])
    labels = torch.LongTensor(y[train_indices].argmax(axis=1))
    test_features = torch.FloatTensor(X[test_indices])
    test_labels = torch.LongTensor(y[test_indices].argmax(axis=1))

    # Hold out a fixed validation subset. Drawing "validation" batches from the
    # rows being trained on would only produce a second training-loss estimate.
    num_val = int(val_fraction * len(features))
    order = torch.randperm(len(features))
    val_rows, fit_rows = order[:num_val], order[num_val:]
    X_fit, y_fit = features[fit_rows], labels[fit_rows]
    X_val, y_val = features[val_rows], labels[val_rows]

    train_losses, val_losses = [], []

    for _ in range(iterations):
        # Sample a random batch from the rows reserved for fitting.
        batch = torch.randperm(len(X_fit))[:batch_size]

        # Forward pass and loss on the batch.
        preds = model(X_fit[batch])
        loss = loss_function(preds, y_fit[batch])

        # Backpropagate and take one optimizer step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

        # Track the loss on the held-out rows, if there are any.
        if num_val > 0:
            with torch.no_grad():
                val_losses.append(loss_function(model(X_val), y_val).item())

    # Evaluate on the test set.
    with torch.no_grad():
        test_logits = model(test_features)
    predicted = test_logits.argmax(dim=1)
    accuracy = (predicted == test_labels).float().mean().item()

    return {
        'model_type': model_type,
        'hidden_dim_factor': hidden_dim_factor,
        'learning_rate': learning_rate,
        'weight_decay': weight_decay,
        'batch_size': batch_size,
        'iterations': iterations,
        # Guarded like the validation loss, so iterations == 0 does not raise.
        'final_train_loss': train_losses[-1] if train_losses else None,
        'final_val_loss': val_losses[-1] if val_losses else None,
        'test_accuracy': accuracy
        }


In [157]:
# Hyperparameter grid
param_grid = {
    'model_type': ['logistic', 'mlp'],
    'hidden_dim_factor': [0.1, 0.5, 1.0, 2.0],  # as a fraction of input_dim
    'learning_rate': [0.001, 0.01, 0.1],
    'weight_decay': [0.0, 0.001, 0.01, 0.1],
    'batch_size': [64, 128, 256],
    'iterations': [500, 1000]  # Reduced for faster execution
}

# Per-run result dicts of the most recent sweep; kept at module level so that
# later cells can inspect them.
results = []


def run_hyperparameter_sweep(param_grid, num_runs=5):
    """Train and evaluate a random subset of the hyperparameter grid.

    Args:
        param_grid: dict mapping each `train_and_evaluate` parameter name to
            the list of values to try.
        num_runs: Number of distinct combinations to evaluate; capped at the
            size of the full grid.

    Returns:
        DataFrame with one row per evaluated combination.

    Side effects:
        Empties the module-level `results` list and refills it with this
        sweep's runs, so repeated calls do not mix in stale runs.
    """
    from itertools import product

    all_combinations = list(product(*param_grid.values()))
    num_selected = max(0, min(num_runs, len(all_combinations)))
    # Sample with the stdlib `random` module, which the notebook seeds; NumPy's
    # global generator is never seeded here, so shuffling with it gave a
    # different subset on every kernel restart.
    param_combinations = random.sample(all_combinations, k=num_selected)

    results.clear()
    for i, params in enumerate(param_combinations):
        named_params = dict(zip(param_grid.keys(), params))
        print(f"Run {i+1}/{len(param_combinations)}: {named_params}")

        # Pass by keyword so the call does not depend on the grid's key order.
        result = train_and_evaluate(**named_params)
        results.append(result)

    # Convert results to DataFrame for easier analysis
    return pd.DataFrame(results)

In [None]:
# Evaluate 10 randomly chosen configurations, then list the 5 with the highest test accuracy.
results_df = run_hyperparameter_sweep(param_grid, num_runs=10)
top_models = results_df.sort_values(by='test_accuracy', ascending=False).head(5)
top_models

In [None]:
# Look up the recorded run with the highest test accuracy.
test_accuracies = [run["test_accuracy"] for run in results]
best_run_idx = np.argmax(test_accuracies)
results[best_run_idx]