In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from scipy.stats import norm
import seaborn as sns
import matplotlib.pyplot as plt
import openpyxl

In [2]:
# Load the per-phecode embedding tables, one pickle per embedding model.
# NOTE(review): the directory below is an absolute, machine-specific path;
# the notebook will not run elsewhere without editing it.
# NOTE(review): read_pickle can execute arbitrary code while unpickling,
# so only point these at trusted files.

# mistral
path_mistral = '/Users/ellenbowen/Desktop/Michigan/Siwo_lab/'
file_mistral = f'{path_mistral}phecode_mistral_embedding.pkl'
df_mistral = pd.read_pickle(file_mistral)

# gpt
path_gpt = '/Users/ellenbowen/Desktop/Michigan/Siwo_lab/'
file_gpt = f'{path_gpt}phecode_gpt_embedding.pkl'
df_gpt = pd.read_pickle(file_gpt)

# voyage
path_voyage = '/Users/ellenbowen/Desktop/Michigan/Siwo_lab/'
file_voyage = f'{path_voyage}phecode_voyage_embedding.pkl'
df_voyage = pd.read_pickle(file_voyage)

In [9]:
# List the distinct category labels in the GPT table; the recorded output
# includes nan, i.e. some rows have no category assigned.
df_gpt['category'].unique()

array(['infectious diseases', nan, 'neoplasms', 'endocrine/metabolic',
       'hematopoietic', 'mental disorders', 'neurological',
       'sense organs', 'circulatory system', 'respiratory', 'digestive',
       'genitourinary', 'pregnancy complications', 'dermatologic',
       'musculoskeletal', 'congenital anomalies', 'symptoms',
       'injuries & poisonings'], dtype=object)

In [5]:
# List the distinct category labels in the Voyage table; the recorded output
# includes nan, i.e. some rows have no category assigned.
df_voyage['category'].unique()

array(['infectious diseases', nan, 'neoplasms', 'endocrine/metabolic',
       'hematopoietic', 'mental disorders', 'neurological',
       'sense organs', 'circulatory system', 'respiratory', 'digestive',
       'genitourinary', 'pregnancy complications', 'dermatologic',
       'musculoskeletal', 'congenital anomalies', 'symptoms',
       'injuries & poisonings'], dtype=object)

# Mistral average embeddings

In [3]:
# Coerce each stored embedding to a NumPy array so the vectors can be stacked.
df_mistral['embeddings'] = df_mistral['embeddings'].apply(np.array)

# One mean vector per disease category: stack that category's embeddings
# row-wise and average down the rows.
# Rows whose category is NaN are excluded by groupby's default behaviour.
category_embeddings_mistral = (
    df_mistral
    .groupby('category')['embeddings']
    .apply(lambda vectors: np.stack(vectors).mean(axis=0))
    .reset_index()
)

# Label the two resulting columns explicitly.
category_embeddings_mistral.columns = ['category', 'avg_embedding']


In [28]:
# Persist the Mistral per-category averages as a pickle; the relative file
# name resolves against the current working directory.
pickle_file = 'category_avg_embeddings_mistral.pkl'
category_embeddings_mistral.to_pickle(path=pickle_file)

In [54]:
# category_embeddings_mistral['avg_embedding'] = category_embeddings_mistral['avg_embedding'].apply(lambda x: json.dumps(x.tolist()))
# category_embeddings_mistral.to_csv('category_avg_embeddings_mistral.csv', index=False)


# GPT average embeddings

In [4]:
# Coerce each stored embedding to a NumPy array so the vectors can be stacked.
df_gpt['embeddings'] = df_gpt['embeddings'].apply(np.array)

# One mean vector per disease category: stack that category's embeddings
# row-wise and average down the rows.
# Rows whose category is NaN are excluded by groupby's default behaviour.
category_embeddings_gpt = (
    df_gpt
    .groupby('category')['embeddings']
    .apply(lambda vectors: np.stack(vectors).mean(axis=0))
    .reset_index()
)

# Label the two resulting columns explicitly.
category_embeddings_gpt.columns = ['category', 'avg_embedding']

In [30]:
# Persist the GPT per-category averages as a pickle; the relative file
# name resolves against the current working directory.
pickle_file = 'category_avg_embeddings_gpt.pkl'
category_embeddings_gpt.to_pickle(path=pickle_file)

In [8]:
# Render each averaged embedding as one bracketed string with 10 fixed decimals.
# The strings go into a separate export frame so `category_embeddings_gpt`
# keeps its numeric arrays: overwriting the column in place made this cell
# fail when run a second time (iterating a string yields characters, which the
# float format spec rejects) and left text in the frame for any later
# numeric use.
category_embeddings_gpt_export = category_embeddings_gpt.assign(
    avg_embedding=category_embeddings_gpt['avg_embedding'].apply(
        lambda x: "[" + ", ".join(f"{v:.10f}" for v in x) + "]"
    )
)

# Export to Excel
# NOTE(review): Excel limits a cell to 32,767 characters; confirm the joined
# string stays below that for this embedding width.
category_embeddings_gpt_export.to_excel("category_avg_embeddings_gpt_full.xlsx", index=False)


In [11]:
# Display the per-category GPT table for a visual check.
category_embeddings_gpt

Unnamed: 0,category,avg_embedding
0,circulatory system,"[-0.02259593942670658, 0.0026998810491593674, ..."
1,congenital anomalies,"[-0.0012482956766949169, 0.01212603372096055, ..."
2,dermatologic,"[-0.003326331262845391, 0.005034657234773476, ..."
3,digestive,"[-0.0033964011176812418, 0.005431490945837535,..."
4,endocrine/metabolic,"[-0.01079635200606998, -0.004346215406408811, ..."
5,genitourinary,"[-0.010222831910442827, -0.003920748559030471,..."
6,hematopoietic,"[-0.017526577019335462, -0.010456552888846941,..."
7,infectious diseases,"[-0.0046266999373641, -0.0050466424931126285, ..."
8,injuries & poisonings,"[-0.004829939856165311, 0.00014071109535507276..."
9,mental disorders,"[-0.012789647945618, 0.006035329067871268, 0.0..."


# Voyage average embeddings

In [57]:
# Coerce each stored embedding to a NumPy array so the vectors can be stacked.
df_voyage['embeddings'] = df_voyage['embeddings'].apply(np.array)

# One mean vector per disease category: stack that category's embeddings
# row-wise and average down the rows.
# Rows whose category is NaN are excluded by groupby's default behaviour.
category_embeddings_voyage = (
    df_voyage
    .groupby('category')['embeddings']
    .apply(lambda vectors: np.stack(vectors).mean(axis=0))
    .reset_index()
)

# Label the two resulting columns explicitly.
category_embeddings_voyage.columns = ['category', 'avg_embedding']

In [32]:
# Persist the Voyage per-category averages as a pickle; the relative file
# name resolves against the current working directory.
pickle_file = 'category_avg_embeddings_voyage.pkl'
category_embeddings_voyage.to_pickle(path=pickle_file)

In [58]:
# Serialise each averaged embedding to a JSON list string for the CSV export.
# The strings go into a separate export frame so `category_embeddings_voyage`
# keeps its numeric arrays: overwriting the column in place made this cell
# fail when run a second time (str has no .tolist()) and left text in the
# frame for any later numeric use.
category_embeddings_voyage_export = category_embeddings_voyage.assign(
    avg_embedding=category_embeddings_voyage['avg_embedding'].apply(
        lambda x: json.dumps(x.tolist())
    )
)
category_embeddings_voyage_export.to_csv('category_avg_embeddings_voyage.csv', index=False)
