In [None]:
import pandas as pd
import numpy as np
from ChromaVDB.chroma import ChromaFramework
from DeepGraphDB import DeepGraphDB
from tqdm.notebook import tqdm
import torch

# Locations of the persisted artefacts read by this notebook.
GRAPH_PATH = "/home/cc/PHD/dglframework/DeepKG/DeepGraphDB/graphs/primekg.bin"
CHROMA_DIR = "./ChromaVDB/chroma_db"
CLINICAL_XLSX = 'data/2025_03_29.xlsx'

# Graph database: load the serialized graph from disk.
gdb = DeepGraphDB()
gdb.load_graph(GRAPH_PATH)

# Vector database: fetch every stored record once.
vdb = ChromaFramework(persist_directory=CHROMA_DIR)
records = vdb.list_records()

# Split the records into three parallel lists (same position == same record).
names, embs, ids = [], [], []
for record in records:
    names.append(record['name'])
    embs.append(record['embeddings'])
    ids.append(record['id'])

# Clinical / molecular spreadsheet.
# TODO: also try using stadio-avanzato (advanced stage), IPI and Log10hGE.
data = pd.read_excel(CLINICAL_XLSX)

In [None]:
# Panel selection. A column is kept when its name contains BOTH substrings.
# Alternative values that can be swapped in:
# gene_set = "LNF"
# gene_measure = "MUT"
gene_set = "plasma"
gene_measure = "VAF"

selected_columns = [
    column
    for column in data.columns
    if gene_set in column and gene_measure in column
]
gene_data = data[selected_columns]

In [None]:
# The gene symbol is whatever precedes the first underscore of each selected column.
# dict.fromkeys de-duplicates while preserving column order, so the order of
# final_columns / embeddings / record_ids is the same on every run (a plain set()
# of strings is not order-stable across kernel restarts).
genes = list(dict.fromkeys(column.split('_')[0] for column in gene_data.columns))

# Position of the FIRST record carrying each name (same semantics as list.index),
# built once instead of scanning `names` twice per gene.
first_index_by_name = {}
for position, name in enumerate(names):
    first_index_by_name.setdefault(name, position)

final_columns = []   # columns of genes that have a vector-DB record
embeddings = []      # the matching record embeddings, aligned with final_columns
record_ids = []      # the matching record ids, aligned with final_columns

for gene in genes:
    position = first_index_by_name.get(gene)
    if position is None:
        # No record with this name: report the gene and leave its column out.
        print(gene)
        continue
    final_columns.append(gene+"_"+gene_set+"_"+gene_measure)
    embeddings.append(embs[position])
    record_ids.append(ids[position])

print(len(final_columns))

# .copy() gives an independent frame, so adding the label column below does not
# write into a slice of another DataFrame (SettingWithCopyWarning).
gene_data = gene_data[final_columns].copy()
gene_data['pfs'] = data['PFS_Cens_updated']
# Alternative to the zero-fill below: gene_data = gene_data.dropna()
# NOTE(review): fillna(0) also turns a missing 'pfs' label into 0 - confirm this is intended.
gene_data = gene_data.fillna(0)

In [None]:
def encode_patient(vdb, gdb, record_ids, gene_data):
    """Pool graph embeddings around one patient's altered genes into a single vector.

    Steps:
      1. Map each gene's vector-DB record id back to its graph node id.
      2. Expand the genes whose value is > 0 by one hop over geneprotein->geneprotein
         edge types, then drop the panel genes whose value is <= 0.
      3. Expand the remaining nodes by one hop over every other edge type that
         touches a geneprotein node.
      4. Average the 'graph' embeddings of all nodes collected in step 3.

    Args:
        vdb: vector store exposing ``global_to_vids_mapping`` (graph node id ->
            record id) and ``read_record(ids, include_embeddings=True)``.
        gdb: graph store exposing ``graph.canonical_etypes`` and
            ``get_k_hop_neighbors(node_ids, k, edge_types=...)``.
        record_ids: record ids of the panel genes, in the same order as the gene
            entries of ``gene_data`` (they are selected with a boolean mask built
            from those values).
        gene_data: one patient's row (pandas Series) holding the gene values plus
            a ``'pfs'`` entry.

    Returns:
        tuple: ``(embedding, label)`` where ``embedding`` is the mean over the
        retrieved 'graph' embeddings and ``label`` is ``int(gene_data['pfs'])``.
        When nothing is retrieved the mean is taken over zero rows, so callers
        should check the result for NaN.

    Raises:
        KeyError: if a record id or a collected node id is missing from
            ``vdb.global_to_vids_mapping``.
    """
    label = int(gene_data['pfs'])
    gene_values = gene_data.drop(labels=['pfs']).values

    # Invert the mapping: record id -> graph node id.
    vid_to_global = {vid: global_id for global_id, vid in vdb.global_to_vids_mapping.items()}
    gene_node_ids = np.array([vid_to_global[record_id] for record_id in record_ids])

    altered_nodes = gene_node_ids[gene_values > 0]
    unaltered_nodes = gene_node_ids[gene_values <= 0]

    etypes = gdb.graph.canonical_etypes
    gene_gene_etypes = [
        etype for etype in etypes
        if etype[0] == "geneprotein" and etype[2] == "geneprotein"
    ]

    # Hop 1: neighbourhood of the altered genes over gene-gene edges.
    hop1 = gdb.get_k_hop_neighbors(altered_nodes, 1, edge_types=gene_gene_etypes)
    hop1_ids = np.array([node for group in hop1.values() for node in group])

    # Panel genes that are not altered in this patient are excluded again,
    # even if they were reached as neighbours (setdiff1d also de-duplicates).
    start_ids = np.setdiff1d(hop1_ids, unaltered_nodes)
    print(f"Genes included in graph: {start_ids.shape}")

    # Hop 2: every edge type touching a geneprotein node except protein-protein.
    # Filtering (rather than list.remove) also works on a graph that has no
    # protein_protein edge type, where remove() would raise ValueError.
    ppi_etype = ('geneprotein', 'protein_protein', 'geneprotein')
    context_etypes = [
        etype for etype in etypes
        if (etype[0] == "geneprotein" or etype[2] == "geneprotein")
        and tuple(etype) != ppi_etype
    ]

    hop2 = gdb.get_k_hop_neighbors(start_ids, 1, edge_types=context_etypes)
    flat_ids_final = np.array([node for group in hop2.values() for node in group])
    print(f"Total nodes to embed: {flat_ids_final.shape}")

    # Translate graph node ids to record ids and fetch their embeddings.
    ids_to_search = [vdb.global_to_vids_mapping[node_id] for node_id in flat_ids_final]
    retrieved_records = vdb.read_record(ids_to_search, include_embeddings=True)

    graph_embeddings = np.array([record['embeddings']['graph'] for record in retrieved_records])
    return torch.tensor(graph_embeddings).mean(dim=0), label

In [None]:
patient_embs = []
labels = []

# Encode every patient row. `total` lets tqdm show real progress (iterrows() is a
# generator with no length), and the row yielded by iterrows() is passed directly
# instead of being looked up again by label, which would return a DataFrame rather
# than a single row if the index contained duplicates.
for i, row in tqdm(gene_data.iterrows(), total=len(gene_data)):
    print(f"--- Patient {i} ---")
    p_emb, label = encode_patient(vdb, gdb, record_ids, row)

    # Keep only patients whose pooled embedding contains no NaN.
    if not torch.any(p_emb.isnan()):
        patient_embs.append(p_emb)
        labels.append(label)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import torch

def visualize_embeddings_tsne(embeddings, labels, perplexity=30, n_iter=1000, random_state=42):
    """
    Project embeddings to 2-D with t-SNE and scatter-plot them coloured by binary label.

    Args:
        embeddings: List of tensors or numpy arrays, or a single tensor/array
        labels: List of binary labels (0s and 1s); points with any other label
            value are not drawn
        perplexity: t-SNE perplexity parameter (default: 30). scikit-learn
            requires it to be smaller than the number of samples, so it is
            lowered to n_samples - 1 (with a printed notice) when it is too large
        n_iter: Number of iterations for t-SNE (default: 1000)
        random_state: Random state for reproducibility (default: 42)

    Returns:
        tuple: (embeddings_2d, tsne) - the array returned by ``fit_transform``
        and the fitted TSNE estimator.
    """
    # Local import: only needed to detect which iteration keyword TSNE accepts.
    import inspect

    # Convert tensors to numpy if needed
    if isinstance(embeddings, list):
        if torch.is_tensor(embeddings[0]):
            # List of tensors -> one stacked array
            embs_np = torch.stack(embeddings).detach().cpu().numpy()
        else:
            # Assume list of numpy arrays
            embs_np = np.array(embeddings)
    elif torch.is_tensor(embeddings):
        # Single tensor
        embs_np = embeddings.detach().cpu().numpy()
    else:
        # Assume numpy array
        embs_np = embeddings

    # Flatten each embedding so the input is (n_samples, n_features)
    if len(embs_np.shape) > 2:
        embs_np = embs_np.reshape(embs_np.shape[0], -1)

    labels_np = np.array(labels)

    print(f"Embedding shape: {embs_np.shape}")
    print(f"Labels shape: {labels_np.shape}")
    print(f"Unique labels: {np.unique(labels_np)}")

    # scikit-learn rejects perplexity >= n_samples; lower it rather than fail.
    n_samples = embs_np.shape[0]
    if n_samples > 1 and perplexity >= n_samples:
        print(f"perplexity={perplexity} is not smaller than n_samples={n_samples}; using {n_samples - 1}")
        perplexity = n_samples - 1

    # Apply t-SNE
    print("Applying t-SNE...")
    # Newer scikit-learn releases name the iteration count `max_iter` (the older
    # `n_iter` keyword is deprecated and later removed); use whichever exists.
    iter_kwarg = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"
    tsne = TSNE(
        n_components=2,
        init='pca',
        perplexity=perplexity,
        random_state=random_state,
        **{iter_kwarg: n_iter},
    )
    # Linear alternative: tsne = PCA(n_components=2)
    embeddings_2d = tsne.fit_transform(embs_np)

    # Create the plot
    fig, ax = plt.subplots(figsize=(10, 8))

    # One scatter call per label value so each gets its own colour and legend entry
    colors = ['red', 'blue']
    labels_text = ['Label 0', 'Label 1']

    for color, text, value in zip(colors, labels_text, (0, 1)):
        mask = labels_np == value
        if np.any(mask):  # Only plot if this label exists
            ax.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
                       c=color, label=text, alpha=0.7, s=50)

    ax.set_title('t-SNE Visualization of Embeddings', fontsize=16)
    ax.set_xlabel('t-SNE Component 1', fontsize=12)
    ax.set_ylabel('t-SNE Component 2', fontsize=12)
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()

    # Show the plot
    plt.show()

    return embeddings_2d, tsne

In [None]:
# Project the per-patient embeddings to 2-D and plot them coloured by label.
embeddings_2d, tsne_model = visualize_embeddings_tsne(
    embeddings=patient_embs,
    labels=labels,
    perplexity=25,
)

print(f"Final 2D embeddings shape: {embeddings_2d.shape}")