In [37]:
import pickle
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import sklearn
import umap.umap_ as umap
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    adjusted_mutual_info_score,
    adjusted_rand_score,
    precision_recall_fscore_support,
)
from sklearn.model_selection import train_test_split

# Pin both global random number generators (Python's `random` and NumPy's
# legacy global state) to the same fixed seed so reruns from a fresh kernel
# start from identical RNG state.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)

In [28]:
import importlib
import util

# Reload the project-local helper module so that edits made to it are picked
# up without restarting the kernel. `importlib.reload` returns the reloaded
# module, so rebinding `util` keeps the name pointing at the fresh version.
util = importlib.reload(util)

# The helper returns three values: the number of genes that have an
# embedding, the embedding dimensionality, and the embeddings themselves.
# NOTE(review): the container type of `gpt_embeddings` is not visible here;
# check util.retrieve_gpt_gene_embeddings before relying on it.
N_genes, N_dims, gpt_embeddings = util.retrieve_gpt_gene_embeddings()
print(f"(N_genes, N_dims) = {N_genes}, {N_dims}")

(N_genes, N_dims) = 93800, 1536


In [29]:
# Load the sampled aorta dataset from the project data directory.
aorta_h5ad_path = util.data_dir / "sample_aorta_data_updated.h5ad"
sampled_aorta_data = sc.read_h5ad(aorta_h5ad_path)

# AnnData's shape is (n_obs, n_vars), i.e. (cells, genes).
# NOTE: this rebinds `N_genes`, which the embedding-loading cell also assigns
# (there it is the number of genes with an embedding; here it is the number of
# genes in this dataset).
# NOTE(review): the recorded output of this cell shows a warning raised from
# `warn_names_duplicates("obs")`, so observation names are presumably not
# unique in this file -- confirm whether that matters downstream.
N_cells, N_genes = sampled_aorta_data.shape
print(f"(N_genes, N_cells) = {N_genes}, {N_cells}")

(N_genes, N_cells) = 23331, 9625


  utils.warn_names_duplicates("obs")


In [30]:
# Build one embedding row per cell from the expression data and the per-gene
# GPT embeddings. The heavy lifting is done by the project helper.
# NOTE(review): how genes are weighted/aggregated per cell is defined in
# util.generate_w_gpt_embeddings and is not visible from this cell.
sampled_aorta_data_gpt_embeddings = util.generate_w_gpt_embeddings(
    N_dims,
    sampled_aorta_data,
    gpt_embeddings,
)

# Displayed as the cell output: expected to be (number of cells, N_dims).
sampled_aorta_data_gpt_embeddings.shape



(9625, 1536)

In [34]:
# Hold out 20% of the cells for evaluation. Features are the per-cell GPT
# embeddings; the target is the `phenotype` column of the cell annotations.
# The split is seeded so that it is reproducible across runs.
phenotype_labels = sampled_aorta_data.obs["phenotype"]
X_train, X_test, y_train, y_test = train_test_split(
    sampled_aorta_data_gpt_embeddings,
    phenotype_labels,
    test_size=0.2,
    random_state=2023,
)

# Sanity-check the sizes of the two partitions.
n_train = len(X_train)
n_test = len(X_test)
print(f"Training set size (X_train): {n_train}")
print(f"Test set size (X_test): {n_test}")

Training set size (X_train): 7700
Test set size (X_test): 1925


In [35]:
# Baseline classifier: logistic regression predicting phenotype from the
# per-cell GPT embeddings, evaluated on the held-out split.
lr = LogisticRegression(max_iter=100)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Overall accuracy on the test set.
print(accuracy_score(y_test, y_pred_lr))

# Weighted precision / recall / F1. The original call emitted a warning from
# scikit-learn's `_warn_prf`, which is raised when a metric is ill-defined
# (e.g. a label with no predicted samples). `zero_division=0` keeps the same
# numeric convention (such terms count as 0) but makes the choice explicit
# instead of relying on the warn-and-zero default.
print(
    precision_recall_fscore_support(
        y_test,
        y_pred_lr,
        average='weighted',
        zero_division=0,
    )
)

0.5548051948051949
(0.43725785723992, 0.5548051948051949, 0.48580424271091543, None)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [40]:
# Cluster the cells in a 50-dimensional PCA projection of the GPT embeddings
# and compare the clusters against the `patient` annotation.
#
# PCA is seeded explicitly: for inputs of this size scikit-learn may select a
# randomized SVD solver, and without `random_state` the projection (and hence
# the clustering scores) would depend on the global NumPy RNG state, i.e. on
# which cells were executed before this one.
pca = PCA(n_components=50, random_state=2023)
embeddings_pca = pca.fit_transform(sampled_aorta_data_gpt_embeddings)

# NOTE(review): n_clusters=11 is hard-coded; confirm it matches the number of
# groups expected in the annotation being compared against.
kmeans = MiniBatchKMeans(n_clusters=11, random_state=2023, batch_size=20)
kmeans.fit(embeddings_pca)
annotation_label = sampled_aorta_data.obs.patient

# Agreement between cluster assignments and the annotation.
labels = kmeans.labels_
print('aRI', adjusted_rand_score(labels, annotation_label))
print('aMI', adjusted_mutual_info_score(labels, annotation_label))

aRI 0.17650274883496175
aMI 0.23870629577826807


In [43]:
# Cluster only the cells whose cell type is annotated (not 'Unknown'), directly
# in the full embedding space, and compare the clusters against `celltype`.

# Positional indices of the annotated cells, computed once and reused for both
# the embedding rows and the labels so the two stay aligned.
known_idx = np.flatnonzero((sampled_aorta_data.obs.celltype != 'Unknown').to_numpy())

# NOTE(review): n_clusters=11 is hard-coded; confirm it matches the number of
# annotated cell types.
kmeans = MiniBatchKMeans(n_clusters=11, random_state=2023, batch_size=20)
kmeans.fit(sampled_aorta_data_gpt_embeddings[known_idx])

# Use `.iloc` for positional selection. The original `series[int_array]` form
# relies on pandas falling back from label-based to positional lookup, which
# produced a warning on this line in the recorded output.
annotation_label = sampled_aorta_data.obs.celltype.iloc[known_idx]

# Agreement between cluster assignments and the cell-type annotation.
labels = kmeans.labels_
print('aRI', adjusted_rand_score(labels, annotation_label))
print('aMI', adjusted_mutual_info_score(labels, annotation_label))

aRI 0.47623302555857844
aMI 0.5504476114142288


  annotation_label = sampled_aorta_data.obs.celltype[np.where(sampled_aorta_data.obs.celltype!='Unknown')[0]]
