In [1]:
import pandas as pd
import numpy as np
import random
random.seed(42)

from openai import OpenAI
import os
import time
from tqdm import tqdm

import sys
sys.path.append('../code')

In [2]:
# Read the API key from the environment so the secret never has to be typed
# into (and saved with) the notebook.  Falls back to the empty string, which is
# the value this cell used to hard-code.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
# Maximum number of transcriptions drawn per medical specialty.
SAMPLE_SIZE = 100

In [None]:
# Load the raw MTSamples transcriptions and report how many rows were read.
df = pd.read_csv("../data/mtsamples.csv")
print(len(df))

# Strip surrounding whitespace from the column names and the specialty labels so
# the exact-match filter against SPECIALTIES below can succeed.  `.str.strip()`
# passes missing labels through as NaN, whereas calling `x.strip()` on every
# element raises AttributeError on the first NaN.
df.columns = [c.strip() for c in df.columns]
df["medical_specialty"] = df["medical_specialty"].str.strip()

# Specialties kept for the analysis; rows of any other specialty are dropped.
SPECIALTIES = ["Surgery", "Cardiovascular / Pulmonary", "Orthopedic", "Radiology", "General Medicine"]
df = df.loc[df.medical_specialty.isin(SPECIALTIES)]
print(len(df))

# Stratified sampling: draw at most SAMPLE_SIZE rows from each specialty.
sampled_dfs = []
for specialty in SPECIALTIES:
    specialty_df = df[df.medical_specialty == specialty]
    if len(specialty_df) <= SAMPLE_SIZE:
        # Not more rows than requested: keep the whole specialty.
        sampled_dfs.append(specialty_df)
        print(f"{specialty}: using all {len(specialty_df)} available samples")
    else:
        # Fixed random_state makes the draw reproducible across runs.
        sampled_dfs.append(specialty_df.sample(n=SAMPLE_SIZE, random_state=42))
        print(f"{specialty}: sampled {SAMPLE_SIZE} from {len(specialty_df)} available samples")

# Combine the per-specialty samples; ignore_index=True renumbers the rows
# 0..n-1, so a row's label equals its position in the combined frame.
df = pd.concat(sampled_dfs, ignore_index=True)

# Display the distribution of specialties after sampling
print("\nDistribution after stratified sampling:")
print(df.medical_specialty.value_counts())


In [None]:
# Single OpenAI client, authenticated with OPENAI_API_KEY, that the embedding
# requests in this cell go through.
client = OpenAI(
    api_key=OPENAI_API_KEY,
)

# Function to get embeddings for a text
def get_embedding(text, model="text-embedding-3-large"):
    """Return the embedding vector for ``text``, or ``None`` if it cannot be obtained.

    Args:
        text: The text to embed. Anything that is not a non-blank string
            (for example a NaN from a missing transcription) is rejected
            locally without contacting the API.
        model: Name of the embedding model passed to the API. Defaults to the
            model this notebook has always used.

    Returns:
        ``response.data[0].embedding`` from the embeddings endpoint, or ``None``
        when the input is unusable or the request raises.
    """
    # A missing or blank transcription has nothing to embed; skip it here
    # instead of spending a request (and a rate-limit slot) on it.
    if not isinstance(text, str) or not text.strip():
        print("Error getting embedding: input is not a non-empty string")
        return None
    try:
        response = client.embeddings.create(input=text, model=model)
        return response.data[0].embedding
    except Exception as e:
        # Deliberately best-effort: one failed request must not abort the whole
        # run, so the failure is reported and signalled to the caller as None.
        print(f"Error getting embedding: {e}")
        return None

# Create embeddings for each transcription, in row order.
# tqdm wraps the list itself (not the enumerate() generator) so it knows the
# length and can show a total, percentage and ETA.
transcriptions = df['transcription'].tolist()

# Each entry is (row position, embedding-or-None); failures are kept as None so
# the position of every row is still recorded.
embeddings_with_indices = []
for idx, text in enumerate(tqdm(transcriptions, desc="Embedding transcriptions")):
    embedding = get_embedding(text)
    # Store both the embedding and its original index
    embeddings_with_indices.append((idx, embedding))
    time.sleep(0.1)  # Rate limiting

In [None]:
# Start from a column of None; rows whose embedding request failed keep it.
df['embedding'] = None

# Write every successfully generated vector back at the index recorded for it.
successful = [(row, vec) for row, vec in embeddings_with_indices if vec is not None]
for row, vec in successful:
    df.at[row, 'embedding'] = vec

# Report how many rows ended up without an embedding.
failed_embeddings = df['embedding'].isna().sum()
if failed_embeddings > 0:
    print(f"Warning: {failed_embeddings} embeddings failed to generate")

In [11]:
# Keep only the rows that received an embedding and cache them on disk.
# index=False is the documented way to omit the row index from the file
# (index=None only worked because None is falsy).
dfe = df.loc[df['embedding'].notna()]
dfe.to_csv("../data/mtsamples_with_embeddings_stratified.csv", index=False)

In [None]:
# Reload the cached frame (transcriptions plus embeddings) from disk.
df = pd.read_csv(
    "../data/mtsamples_with_embeddings_stratified.csv",
)

# Disabled alternative: restrict the frame to the k most frequent specialties.
# k = 5
# top_specialties = df.medical_specialty.value_counts().head(k).index.values
# df = df.loc[df.medical_specialty.isin(top_specialties)]
# top_specialties

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

import ast


def _parse_embedding(value):
    """Turn one cell of the ``embedding`` column into a numpy array.

    Args:
        value: A string holding a Python list literal (how the column looks
            after a CSV round trip), an already-parsed list/tuple/array, or
            anything else (e.g. NaN for a missing embedding).

    Returns:
        A numpy array for string and sequence inputs, otherwise ``None``.
    """
    if isinstance(value, str):
        # literal_eval accepts only Python literals, so a tampered CSV cannot
        # execute arbitrary code the way eval() would.
        return np.array(ast.literal_eval(value))
    if isinstance(value, (list, tuple, np.ndarray)):
        # Already parsed (e.g. this cell was run twice): keep the data rather
        # than discarding it as "not a string".
        return np.asarray(value)
    return None


# Convert the embedding strings to numpy arrays
df['embedding'] = df['embedding'].apply(_parse_embedding)

# Filter out any rows with missing embeddings
df_filtered = df.dropna(subset=['embedding'])

# Extract embeddings as a list of arrays
embeddings_list = df_filtered['embedding'].tolist()
if not embeddings_list:
    # np.vstack would raise a less helpful ValueError on an empty list.
    raise ValueError("No embeddings available: every row has a missing embedding")

# Convert list of embeddings to a 2D numpy array (one row per transcription)
embeddings_array = np.vstack(embeddings_list)

In [None]:
# Apply t-SNE to reduce dimensions to 2D
print("Applying t-SNE dimensionality reduction...")

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and later
# releases no longer accept the old name.  Pick whichever keyword the installed
# version exposes so the cell runs on both old and new releases.
tsne_iter_kwarg = "max_iter" if "max_iter" in TSNE().get_params() else "n_iter"

# TSNE requires perplexity < n_samples; clamp it so a small sample does not fail.
# With more than 30 samples this is the original value of 30.
tsne_perplexity = min(30, len(embeddings_array) - 1)

tsne = TSNE(
    n_components=2,
    random_state=42,  # fixed seed so the layout is reproducible
    perplexity=tsne_perplexity,
    **{tsne_iter_kwarg: 500},
)
embeddings_2d = tsne.fit_transform(embeddings_array)

# Create a DataFrame with the 2D coordinates and medical specialty
tsne_df = pd.DataFrame({
    'x': embeddings_2d[:, 0],
    'y': embeddings_2d[:, 1],
    'medical_specialty': df_filtered['medical_specialty'].values
})

# Plot the t-SNE visualization, one colour per specialty
plt.figure(figsize=(6, 6))
sns.scatterplot(
    data=tsne_df,
    x='x',
    y='y',
    hue='medical_specialty',
    # palette='bright',
    alpha=0.7,
    s=100
)

# Title and axis labels are intentionally left off; uncomment to add them.
# plt.title('Medical Transcriptions (text-embedding-3-large, t-SNE)', fontsize=16)
# plt.xlabel('Dimension 1', fontsize=12)
# plt.ylabel('Dimension 2', fontsize=12)
plt.legend(fontsize=8)
plt.tight_layout()
plt.show()

# Print the number of unique medical specialties
print(f"Number of unique medical specialties: {df_filtered['medical_specialty'].nunique()}")
print(f"Total number of samples visualized: {len(df_filtered)}")


In [None]:
# Project the embeddings onto their first two principal components.
from sklearn.decomposition import PCA

print("Applying PCA dimensionality reduction...")
pca = PCA(n_components=2, random_state=42)
embeddings_2d_pca = pca.fit_transform(embeddings_array)

# One row per transcription: the two component scores plus the specialty label.
specialty_labels = df_filtered['medical_specialty'].values
pca_df = pd.DataFrame({
    'x': embeddings_2d_pca[:, 0],
    'y': embeddings_2d_pca[:, 1],
    'medical_specialty': specialty_labels,
})

# Scatter plot of the projection, coloured by specialty.
plt.figure(figsize=(8, 6))
scatter_options = dict(
    x='x',
    y='y',
    hue='medical_specialty',
    palette='Set1',
    alpha=0.7,
    s=100,
)
sns.scatterplot(data=pca_df, **scatter_options)

plt.title('Medical Transcriptions (text-embedding-3-large)', fontsize=16)
plt.xlabel('Component 1', fontsize=12)
plt.ylabel('Component 2', fontsize=12)
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.05, 1), fontsize=10)
plt.tight_layout()
plt.show()

# How much of the total variance the two plotted components capture.
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total explained variance: {sum(pca.explained_variance_ratio_):.2f}")
