In [None]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from k_means_constrained import KMeansConstrained
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
import re
import os

In [None]:
# Set the TOKENIZERS_PARALLELISM environment variable to "false" before any tokenizer is created.
# NOTE(review): presumably this silences the Hugging Face `tokenizers` parallelism/fork warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Prefer the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Function to get embedding for a single text
def get_single_text_embedding(text, tokenizer, model, device):
    """Embed one string by mean-pooling the model's last hidden state.

    Every character that is neither a word character nor whitespace is removed
    first. The remaining text is tokenized (truncated to at most 128 tokens),
    moved to ``device`` and passed through ``model`` with gradients disabled.

    Args:
        text: The string to embed.
        tokenizer: Callable returning a batch object that supports ``.to(device)``
            and ``**`` unpacking into the model.
        model: Callable whose output exposes ``last_hidden_state``.
        device: Device the tokenized inputs are moved to.

    Returns:
        NumPy array with the hidden state averaged over axis 1; the leading
        batch axis of the model output is kept.
    """
    stripped = re.sub(r'[^\w\s]', '', text)
    encoded = tokenizer(
        stripped,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )
    encoded = encoded.to(device)
    # Inference only: no autograd graph is needed
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(axis=1)
    return pooled.cpu().numpy()


# Function to calculate cosine similarity
def calculate_cosine_similarity(embedding, fix_embedding):
    """Return the cosine similarity between two embedding vectors.

    The notebook never imports ``cosine_similarity``, so the previous body
    raised ``NameError`` on first use. This version relies only on NumPy and
    flattens both inputs, so ``(1, H)`` arrays such as those returned by
    ``get_single_text_embedding`` are accepted as well as plain 1-D vectors.

    Args:
        embedding: Array-like vector (any shape; flattened before use).
        fix_embedding: Array-like reference vector with the same number of elements.

    Returns:
        The cosine similarity as a float in [-1, 1]; ``0.0`` when either
        vector has zero length.

    Raises:
        ValueError: If the two inputs do not hold the same number of elements.
    """
    first = np.asarray(embedding, dtype=float).ravel()
    second = np.asarray(fix_embedding, dtype=float).ravel()
    if first.shape != second.shape:
        raise ValueError(
            f"Embedding sizes differ: {first.shape[0]} vs {second.shape[0]}"
        )
    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    # A zero vector has no direction; report "no similarity" instead of dividing by zero
    if norm_product == 0:
        return 0.0
    return float(np.dot(first, second) / norm_product)

In [None]:
# Load the BlueBERT tokenizer and encoder, then place the encoder on the device chosen earlier
model_name = 'bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

In [None]:
# Step 1: Load the Cleaned DataFrame and Ensure Correct Data Types
# (pandas and MongoClient are already imported in the notebook's first cell,
# so they are not re-imported here.)

# Load the cleaned DataFrame
df_loaded = pd.read_csv('df_cc-pram_icd-title_mapped_cleaned.csv')

# Ensure subject_id and stay_id are strings
df_loaded = df_loaded.astype({'subject_id': str, 'stay_id': str})

# Extract relevant IDs for fetching data from MongoDB
subject_ids = df_loaded['subject_id'].unique().tolist()
stay_ids = df_loaded['stay_id'].unique().tolist()

# Drop the unnamed column pandas creates when a CSV carries an unlabeled index column
# (presumably left over from an earlier to_csv(index=True) — confirm)
if 'Unnamed: 0' in df_loaded.columns:
    df_loaded = df_loaded.drop(columns=['Unnamed: 0'])

print("Loaded DataFrame with shape:", df_loaded.shape)


In [None]:
# Display the loaded DataFrame as the cell output
df_loaded

In [None]:
# Step 2: Fetch Corresponding Data from MongoDB

# Open the connection and select the database / collection
client = MongoClient('mongodb://localhost:27017/')  # Adjust the connection string as necessary
db = client['MIMIC-IV']  # Replace with your database name
collection = db['ED-VitalSigns']  # Replace with your collection name

# Match documents whose subject_id AND stay_id both appear in the loaded ID lists
query = {
    "subject_id": {"$in": subject_ids},
    "stay_id": {"$in": stay_ids},
}
# Exclude only the _id field, include all others
projection = {"_id": 0}

# Materialise the cursor into a list of documents, then into a DataFrame
data = [document for document in collection.find(query, projection)]
df_vitalsigns = pd.DataFrame(data)

print("Fetched data from MongoDB with shape:", df_vitalsigns.shape)


In [None]:
# Display the fetched vital-sign records as the cell output
df_vitalsigns

In [None]:
# Dropping rows with NaN values in specific columns.
# The cleaned frame is created *before* it is displayed; previously the display cell came
# first and raised NameError on a fresh kernel (Restart & Run All).
columns_to_check = ['charttime', 'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain']
df_vitalsigns_cleaned = df_vitalsigns.dropna(subset=columns_to_check)
df_vitalsigns_cleaned

In [None]:
# Report the shape of the cleaned frame (the old label was copy-pasted from the MongoDB fetch cell)
print("Shape after dropping rows with missing vital signs:", df_vitalsigns_cleaned.shape)

In [None]:
# Persist the cleaned vital signs; index=True also writes the row index as the first CSV column
df_vitalsigns_cleaned.to_csv('df_cc-pram_vitalsigns_cleaned.csv', index=True)

# Clustering for PRAM



In [None]:
# Keep one row per distinct `name`
# NOTE(review): `df_pyxis` is not defined anywhere in the visible notebook — this cell
# fails on a fresh kernel unless it is created elsewhere; confirm where it should be loaded.
df_sample = df_pyxis.drop_duplicates(subset='name')

# Number of rows left after de-duplication
sample_size = df_sample.shape[0]
sample_size

In [None]:
# Work on an explicit copy so that adding a column does not trigger SettingWithCopyWarning
df_sample = df_sample.copy()

# One embedding per name, computed on the device selected earlier in the notebook
df_sample['name_embedding'] = df_sample['name'].apply(
    lambda name: get_single_text_embedding(name, tokenizer, model, device)
)

# Stack the per-row embeddings into a single 2D array for clustering
embeddings = np.vstack(df_sample['name_embedding'].to_numpy())

In [None]:
# Apply PCA for dimensionality reduction and plot explained variance.
# PCA cannot return more components than min(n_samples, n_features), so the inspection size
# is capped rather than hard-coded to 100. random_state pins the result when scikit-learn
# selects its randomized SVD solver (same seed as the clustering step).
n_inspect_components = min(100, *embeddings.shape)
pca_inspect = PCA(n_components=n_inspect_components, random_state=1984)
reduced_embeddings_inspect = pca_inspect.fit_transform(embeddings)

# Cumulative explained variance over the first k components
explained_variance = np.cumsum(pca_inspect.explained_variance_ratio_)
plt.figure(figsize=(8, 6))
plt.plot(range(1, len(explained_variance) + 1), explained_variance, marker='o', linestyle='--')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by PCA Components')
plt.grid()
# Save before show(): show() may clear the current figure
plt.savefig('pca_explained_variance.png')
plt.show()

In [None]:
# Choose the number of components based on the explained-variance plot
# (e.g. the smallest n whose cumulative explained variance reaches about 80%).
n_components = 45
# random_state makes the reduction reproducible when scikit-learn selects its randomized
# SVD solver (same seed as the clustering step).
pca = PCA(n_components=n_components, random_state=1984)
reduced_embeddings = pca.fit_transform(embeddings)
# Store one reduced vector per row
df_sample['reduced_embeddings'] = list(reduced_embeddings)

In [None]:
# Normalize the PCA-reduced embeddings using the L2 norm.
# L2 normalization rescales every row vector to unit length, so only its direction matters.
# For unit vectors u and v, ||u - v||^2 = 2 * (1 - cos(u, v)): squared Euclidean distance is a
# monotonic function of cosine distance (it is not the cosine similarity itself).
# Distance-based clustering such as k-means on these vectors therefore groups points by direction.
normalized_embeddings = normalize(reduced_embeddings, norm='l2')


In [None]:
# Sanity check (this seemed to work when last run): preview the first 10 rows of df_sample
df_sample.head(10)


In [None]:

# Print the hint, the sample size, and the sample size divided by 10 and by 5, one per line
print(
    'Pick best_k within range of since "The product of size_max and n_clusters must be larger than or equal the number of samples (X):"',
    sample_size,
    sample_size / 10,
    sample_size / 5,
    sep='\n',
)

In [None]:
# Cluster-size bounds and number of clusters for the constrained k-means run
size_min, size_max = 5, 10
best_k = 50

# Fit constrained k-means on the normalized embeddings and keep the label of every row
kmeans_constrained = KMeansConstrained(
    n_clusters=best_k,
    size_min=size_min,
    size_max=size_max,
    init="k-means++",
    n_init=200,
    max_iter=1000,
    random_state=1984,
)
cosine_clusters = kmeans_constrained.fit_predict(normalized_embeddings)

# Attach the cluster label to each sampled name
df_sample['cosine_cluster_name'] = cosine_clusters

In [None]:
# Calculate the cosine distance of each point to its respective cluster center. The value serves
# as a distinct marker within a cluster, which can be used in PRAM (Post Randomization).
#
# The embeddings are unit-length, but a cluster centre generally is not, so a plain dot product
# is NOT a cosine similarity. Both norms are divided out explicitly.
assigned_centers = kmeans_constrained.cluster_centers_[cosine_clusters]
dot_products = np.sum(normalized_embeddings * assigned_centers, axis=1)
norm_products = (
    np.linalg.norm(normalized_embeddings, axis=1)
    * np.linalg.norm(assigned_centers, axis=1)
)
# A zero-length vector has no direction: treat its similarity as 0 (distance 1) instead of dividing by zero
cosine_similarities = np.divide(
    dot_products,
    norm_products,
    out=np.zeros_like(dot_products, dtype=float),
    where=norm_products > 0,
)
cosine_distances = 1 - cosine_similarities

df_sample['cosine_distance_to_center_name'] = cosine_distances

In [None]:
# Preview the first row of df_sample
df_sample.head(1)

In [None]:
# Preview the first row of df_pyxis
# NOTE(review): `df_pyxis` is not created anywhere in the visible notebook — confirm its origin.
df_pyxis.head(1)

In [None]:
# Build name -> cluster and name -> distance lookups from the clustered sample
_sample_by_name = df_sample.set_index('name')
cluster_mapping = _sample_by_name['cosine_cluster_name'].to_dict()
distance_mapping = _sample_by_name['cosine_distance_to_center_name'].to_dict()

# Carry the per-name results back onto a copy of the original frame
df_mapped = df_pyxis.copy()
_original_names = df_pyxis['name']
df_mapped['cosine_cluster_name'] = _original_names.map(cluster_mapping)
df_mapped['cosine_distance_to_center_name'] = _original_names.map(distance_mapping)

In [None]:
# Preview the first 5 rows of the mapped frame
df_mapped.head(5)

In [None]:
import numpy as np
import pandas as pd

# Per-cluster mean and standard deviation of the distance to the cluster center
cluster_stats = (
    df_mapped
    .groupby('cosine_cluster_name')['cosine_distance_to_center_name']
    .agg(['mean', 'std'])
)

# Flag rows lying far above their cluster's typical distance
def is_high_outlier(row):
    """Return whether a row's distance exceeds its cluster's mean + 3 * std.

    Reads the module-level ``cluster_stats`` frame (indexed by cluster label,
    with ``mean`` and ``std`` columns).

    Args:
        row: Mapping-like row with ``cosine_cluster_name`` and
            ``cosine_distance_to_center_name`` entries.

    Returns:
        True when the distance is strictly greater than the threshold.
    """
    stats = cluster_stats.loc[row['cosine_cluster_name']]
    threshold = stats['mean'] + 3 * stats['std']
    return row['cosine_distance_to_center_name'] > threshold

# A cluster is usable for PRAM only if it can fill at least a 3x3 matrix by default
def is_valid_cluster(cluster, df, min_unique_names=3, min_unique_distances=3):
    """Check that a cluster holds enough distinct names and distinct distances.

    Args:
        cluster: Cluster label to look up in ``df['cosine_cluster_name']``.
        df: Frame with ``cosine_cluster_name``, ``name`` and
            ``cosine_distance_to_center_name`` columns.
        min_unique_names: Minimum number of distinct ``name`` values required.
        min_unique_distances: Minimum number of distinct distance values required.

    Returns:
        True only when both minimums are met.
    """
    members = df.loc[df['cosine_cluster_name'] == cluster]
    if members['name'].nunique() < min_unique_names:
        return False
    return members['cosine_distance_to_center_name'].nunique() >= min_unique_distances

# Identify valid clusters
valid_clusters = [
    cluster_id
    for cluster_id in df_mapped['cosine_cluster_name'].unique()
    if is_valid_cluster(cluster_id, df_mapped)
]

# Filter df_mapped to include only valid clusters
df_valid = df_mapped[df_mapped['cosine_cluster_name'].isin(valid_clusters)].copy()

# Apply the outlier detection to valid clusters
df_valid['is_high_outlier'] = df_valid.apply(is_high_outlier, axis=1)

# Largest outlier distance per cluster, aligned to every row of df_valid. Non-outlier rows are
# masked to NaN first so they cannot contribute to the per-cluster maximum. Computing this
# column-wise avoids the row-by-row label lookup, which breaks on a non-unique index.
_outlier_distance = df_valid['cosine_distance_to_center_name'].where(df_valid['is_high_outlier'])
_cluster_outlier_max = _outlier_distance.groupby(df_valid['cosine_cluster_name']).transform('max')

# Highest outlier value per cluster, restricted to the outlier rows (same content as before)
highest_outliers = _cluster_outlier_max[df_valid['is_high_outlier']]

# Add 'to_be_removed' column to mark the highest outliers
df_valid['to_be_removed'] = df_valid['is_high_outlier'] & _outlier_distance.eq(_cluster_outlier_max)

# Create a cleaned DataFrame without the specific highest outliers and without the helper columns
df_mapped_cleaned = (
    df_valid.loc[~df_valid['to_be_removed']]
    .drop(columns=['is_high_outlier', 'to_be_removed'])
)

# Track affected clusters
affected_clusters = df_valid.loc[df_valid['to_be_removed'], 'cosine_cluster_name'].unique()

# Print affected clusters
print("Affected clusters with highest outlier removed:", affected_clusters)

# Verify that all clusters in df_mapped_cleaned have at least 3 unique `name` and `cosine_distance_to_center_name` values
cluster_validity_cleaned = df_mapped_cleaned.groupby('cosine_cluster_name').agg({
    'name': 'nunique',
    'cosine_distance_to_center_name': 'nunique'
})

invalid_clusters = cluster_validity_cleaned[
    (cluster_validity_cleaned['name'] < 3)
    | (cluster_validity_cleaned['cosine_distance_to_center_name'] < 3)
].index

if len(invalid_clusters) == 0:
    print("\nAll clusters in df_mapped_cleaned have at least 3 unique `name` and `cosine_distance_to_center_name` values.")
else:
    print("\nClusters violating the rule (less than 3 uniques):", list(invalid_clusters))

# Print change in size before and after cleaning. The baseline is df_mapped, the frame that was
# actually filtered here; df_loaded is a different table and gave a meaningless comparison.
print(f"\nSize before cleaning: {df_mapped.shape[0]}")
print(f"Size after cleaning and mapping: {df_mapped_cleaned.shape[0]}")


In [None]:
cluster = 14
# Rows of the chosen cluster, newest charttime first, keeping the first row per name
df_cluster_sorted = (
    df_mapped_cleaned
    .loc[lambda frame: frame['cosine_cluster_name'] == cluster]
    .sort_values(by='charttime', ascending=False)
    .drop_duplicates(subset='name')
)
df_cluster_sorted

In [None]:
# Export the original frame and the cleaned, cluster-annotated frame to CSV;
# index=True also writes the row index as the first column of each file
df_pyxis.to_csv('df_cc-pram_pyxisname-original_triage_sample.csv', index=True)
df_mapped_cleaned.to_csv('df_cc-pram_pyxisname_mapped_cleaned.csv', index=True)

In [None]:
# NOTE(review): looks like a pasted print-out of `affected_clusters` from an earlier run — confirm.
# Written as a valid list literal; the space-separated form was a SyntaxError.
[29, 59, 17, 24]

# Inspection

In [None]:
# NOTE(review): the clustering cell sets best_k = 50, so label 60 may not exist in the current run — confirm.
cluster = 60
# Rows of the chosen cluster before outlier removal, farthest from the center first
(
    df_mapped
    .loc[lambda frame: frame['cosine_cluster_name'] == cluster]
    .sort_values(by='cosine_distance_to_center_name', ascending=False)
)

In [None]:
# Same `cluster` label as set in the preceding inspection cell, viewed after outlier removal
# (the former `cluster = cluster` self-assignment was a no-op and has been dropped)
df_mapped_cleaned[df_mapped_cleaned['cosine_cluster_name'] == cluster].sort_values(by='cosine_distance_to_center_name', ascending=False)