In [2]:
import os
import numpy as np
import pandas as pd
import torch
import h5py
from sklearn.metrics.pairwise import cosine_similarity, paired_distances
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [3]:
class EmbeddingGraph:
    '''k-nearest-neighbour graph built from one embedding vector per node.

    Intended call order: ``create_distance_matrix`` -> ``knn_create_adj_matrix``
    -> ``norm_adj_matrix`` / ``save_h5_file``. ``preprocess`` is optional and
    only affects distances that are computed after it has run.

    Attributes:
        nodes: object array with the node identifiers given to the constructor.
        nodes_num: number of nodes.
        embedding_matrix: the embeddings; row ``i`` is used as the vector of node ``i``.
        adj_matrix: (nodes_num, nodes_num) float matrix, all zeros until
            ``knn_create_adj_matrix`` has been called.
        distance_matrix: (nodes_num, nodes_num) float matrix, all zeros until
            ``create_distance_matrix`` has been called.
        degrees: row sums of ``adj_matrix``.
    '''

    def __init__(self, nodes, embedding_matrix):
        '''Store the node ids and embeddings and allocate empty graph matrices.'''
        self.nodes = np.array(nodes, dtype=object)
        self.nodes_num = len(nodes)
        self.embedding_matrix = embedding_matrix
        self.adj_matrix = np.zeros(shape=(self.nodes_num, self.nodes_num), dtype=float)
        self.distance_matrix = np.zeros_like(self.adj_matrix)
        self.degrees = self.adj_matrix.sum(1)

    def preprocess(self, n_components=32):
        '''Reduce the embeddings with PCA, then standardise every component.

        The result overwrites ``self.embedding_matrix``, so calling this twice
        re-applies PCA to the already reduced data.

        Args:
            n_components: number of principal components to keep (default 32).

        Returns:
            The transformed embedding matrix.
        '''
        reduced = PCA(n_components=n_components).fit_transform(self.embedding_matrix)
        self.embedding_matrix = StandardScaler().fit_transform(reduced)
        return self.embedding_matrix

    def cal_eucli_distance(self, node_idx1, node_idx2):
        '''calculate the euclidean distance between two given nodes' index'''
        difference = self.embedding_matrix[node_idx1] - self.embedding_matrix[node_idx2]
        return np.linalg.norm(difference)

    def cal_cosine_distance(self, node_idx1, node_idx2):
        '''calculate the cosine distance between two given nodes' index

        Returns the output of ``paired_distances`` unchanged, i.e. an array
        holding a single value rather than a plain scalar.
        '''
        embedding1 = self.embedding_matrix[node_idx1].reshape(1, -1)
        embedding2 = self.embedding_matrix[node_idx2].reshape(1, -1)
        return paired_distances(embedding1, embedding2, metric='cosine')

    def create_distance_matrix(self, mode='cosine'):
        '''create distance matrix according to specified standard

        Args:
            mode: ``'cosine'`` or ``'euclidean'``. The historical misspelling
                ``'eculidean'`` is still accepted for existing callers.

        Returns:
            The symmetric (nodes_num, nodes_num) distance matrix, which is also
            stored in ``self.distance_matrix``.
        '''
        assert mode in ['eculidean', 'euclidean', 'cosine'], \
            "mode must be 'euclidean' or 'cosine'"
        pair_distance = self.cal_cosine_distance if mode == 'cosine' else self.cal_eucli_distance
        for i in range(self.nodes_num):
            for j in range(i, self.nodes_num):
                # The cosine helper returns a one-element array; unwrap it
                # explicitly because newer NumPy deprecates implicit
                # array -> scalar conversion on item assignment.
                dist = float(np.asarray(pair_distance(i, j)).reshape(-1)[0])
                self.distance_matrix[i, j] = dist
                self.distance_matrix[j, i] = dist
        return self.distance_matrix

    def knn_create_adj_matrix(self, k):
        '''create adj matrix according to k nearest neighbours

        For every node the ``k`` smallest entries of its distance row are
        connected, and the result is symmetrised. The node's own entry takes
        part in that selection, so whenever the self-distance is among the
        row's ``k`` smallest values the node receives a self-loop and only
        ``k - 1`` other neighbours.

        Args:
            k: number of nearest entries per row, at least 1. Values larger
                than the number of nodes connect every pair of nodes.

        Returns:
            The 0/1 adjacency matrix, also stored in ``self.adj_matrix``.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        '''
        # need to create distance matrix first, call class.create_distance_matrix()
        assert self.distance_matrix.sum() != 0, \
            'distance matrix is empty, call create_distance_matrix() first'
        if k < 1:
            raise ValueError('k must be at least 1, got {!r}'.format(k))
        n = self.nodes_num
        k = min(k, n)
        adj = np.zeros(shape=(n, n), dtype=float)
        for i in range(n):
            if k < n:
                nearest = self.distance_matrix[i].argpartition(k)[:k]
            else:
                # argpartition rejects kth == n; every node is a neighbour here.
                nearest = np.arange(n)
            adj[i, nearest] = 1
            adj[nearest, i] = 1
        self.adj_matrix = adj
        self.degrees = adj.sum(1)
        return self.adj_matrix

    def embedding_out(self, name, mode='csv'):
        '''Write the embedding matrix to ``name`` as comma- or tab-separated text.

        Args:
            name: output file path.
            mode: ``'csv'`` (comma separated) or ``'tsv'`` (tab separated).
        '''
        assert mode in ['csv', 'tsv']    # output the embedding matrix as a csv file or tsv file
        delimiter = ',' if mode == 'csv' else '\t'
        np.savetxt(name, self.embedding_matrix, delimiter=delimiter)

    def norm_adj_matrix(self):
        '''return laplacian-normed adj matrix, D^-1/2 . A . D^-1/2

        Also refreshes ``self.degrees`` from the current adjacency matrix.
        Rows and columns of nodes without any edge stay zero instead of
        turning into inf/nan.
        '''
        # need to create distance matrix first, call class.create_distance_matrix()
        assert self.distance_matrix.sum() != 0, \
            'distance matrix is empty, call create_distance_matrix() first'
        # need to create adj matrix first, call class.knn_create_adj_matrix()
        assert self.adj_matrix.sum() != 0, \
            'adjacency matrix is empty, call knn_create_adj_matrix() first'
        degree = np.array(self.adj_matrix.sum(1))
        self.degrees = degree
        inv_sqrt_degree = np.zeros(degree.shape, dtype=float)
        connected = degree > 0
        inv_sqrt_degree[connected] = np.power(degree[connected], -0.5)
        # Scaling rows then columns equals diag(d) . A . diag(d) without
        # materialising the dense diagonal matrices.
        return inv_sqrt_degree[:, None] * self.adj_matrix * inv_sqrt_degree[None, :]

    def save_h5_file(self, name):
        '''Write nodes, embeddings and all graph matrices to the HDF5 file ``name``.

        An existing file with the same name is replaced. The normalised
        adjacency matrix is computed before the old file is touched, so a graph
        that has not been built yet fails without destroying previous output.
        '''
        normed_adj_matrix = self.norm_adj_matrix()
        if os.path.exists(name):   # replace the old h5 file
            os.remove(name)
        dt_str = h5py.special_dtype(vlen=str)
        # The context manager closes the file even if a dataset write fails.
        with h5py.File(name, mode='w') as f:
            f.create_dataset('nodes', data=self.nodes, dtype=dt_str)
            f.create_dataset('embedding_matrix', data=self.embedding_matrix, dtype=float)
            f.create_dataset('adj_matrix', data=self.adj_matrix, dtype=float)
            f.create_dataset('distance_matrix', data=self.distance_matrix, dtype=float)
            f.create_dataset('normed_adj_matrix', data=np.array(normed_adj_matrix), dtype=float)

In [6]:
# create adj table for gnn
# Collect one embedding per patient (train, validation and test, in that order)
# and wrap them in an EmbeddingGraph.
# NOTE: both paths below are machine-specific and need to be checked on your own device.
patho_gene_embedding_dir = '/amax/data/ruijin/embedding/ruijin_63/splita/'
split_file_dir = '/home/xieyuzhang/mtmcat/dataset/survival/ruijin_63/ruijin_63_new_splits/incomplete_dataset.csv'
split_file = pd.read_csv(split_file_dir)
# The split columns can have different lengths, so the padding NaNs are dropped per column.
train_patient_list = list(split_file['train'].dropna())
val_patient_list = list(split_file['validation'].dropna())
test_patient_list = list(split_file['test'].dropna())
case_ids = np.array(train_patient_list + val_patient_list + test_patient_list)
patho_gene_embeddings = []
for case_id in case_ids:
    # map_location='cpu' lets tensors that were saved from a GPU load on CPU-only machines.
    # NOTE(review): torch.load unpickles the file - only point this at trusted .pt files.
    tensor = torch.load(os.path.join(patho_gene_embedding_dir, f'{case_id}.pt'), map_location='cpu')
    # detach() keeps the NumPy conversion working for tensors that still require grad.
    embedding = tensor.detach().squeeze().numpy().astype(float)
    patho_gene_embeddings.append(embedding)
patho_gene_embeddings = np.array(patho_gene_embeddings)
person_graph = EmbeddingGraph(case_ids, patho_gene_embeddings)

In [7]:
# Build the patient graph from the loaded embeddings and persist it as HDF5.
# preprocess() stays disabled here, so the embeddings are used as loaded.
person_graph = EmbeddingGraph(nodes=case_ids, embedding_matrix=patho_gene_embeddings)
person_graph.create_distance_matrix('cosine')
person_graph.knn_create_adj_matrix(k=4)
adj = person_graph.norm_adj_matrix()
person_graph.save_h5_file(
    name='/home/xieyuzhang/mtmcat/dataset/survival/ruijin_63/inputs/embedding/incomplete_dataset/incomplete_dataset_person.h5'
)

In [8]:
# Per-node degree, i.e. the row sums of the adjacency matrix built above.
person_graph.degrees

array([5., 4., 4., 8., 4., 5., 4., 5., 5., 5., 4., 7., 6., 5., 4., 4., 6.,
       4., 4., 5., 4., 4., 4., 4., 5., 4., 4., 4., 4., 5., 5., 4., 4., 4.,
       6., 9., 6., 6., 4., 6., 5., 5., 5., 5., 4., 4., 4., 5., 8., 5., 5.,
       5., 5., 4., 5., 5., 5., 6., 6., 5., 5., 5., 4.])

In [23]:
# Dimensions of the embedding matrix held by the graph (one row per node).
person_graph.embedding_matrix.shape

(63, 64)