In [None]:
import os
import pandas as pd 
import numpy as np
import torch
import subprocess
import torch.nn.functional as F
from tqdm import tqdm
# Print the GPU status as a sanity check. This call is purely diagnostic, so a
# missing or failing `nvidia-smi` is reported as text rather than raised here.
try:
    nvidia_smi_output = subprocess.check_output(['nvidia-smi']).decode('utf-8')
except (OSError, subprocess.CalledProcessError) as err:
    nvidia_smi_output = f'nvidia-smi unavailable: {err}'
print(nvidia_smi_output)


In [4]:
# Input locations: node tables are read from data_dir, embedding CSVs from emb_dir.
data_dir = '../data/'
emb_dir = os.path.join(data_dir, 'embedding/emb_RGCN')

# Output location for the rank tables written by the later cells.
# NOTE(review): the notebook never recorded this path; '../result/' is an
# assumption -- point it at the intended results folder before running.
result_dir = '../result/'
os.makedirs(result_dir, exist_ok=True)

# Number of repeats = number of 'emb_adr_rep<k>.csv' files. Counting the files
# the loop actually reads avoids being thrown off by unrelated entries in
# emb_dir (hidden files, checkpoints, extra embedding types).
n_file = sum(
    1
    for fname in os.listdir(emb_dir)
    if fname.startswith('emb_adr_rep') and fname.endswith('.csv')
)

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [6]:
# Load the node tables and the disease-symptom table from data_dir.
# All four files are comma-separated and are read with identical options.
adr_node, dp_node, gene_node, disease_symptom = (
    pd.read_csv(os.path.join(data_dir, table_file), sep=',')
    for table_file in (
        'adr_node.csv',
        'dp_node.csv',
        'gene_node.csv',
        'disease_symptom.csv',
    )
)


In [7]:
def cosine_sim(emb1, emb2, device):
    """Pairwise cosine similarity between the rows of two embedding tables.

    Args:
        emb1: DataFrame of shape (n1, m), one embedding per row.
        emb2: DataFrame of shape (n2, m), one embedding per row.
        device: torch device on which the tensors are placed and multiplied.

    Returns:
        Tensor of shape (n1, n2) on `device`; entry (i, j) is the cosine
        similarity between row i of `emb1` and row j of `emb2`.
    """
    # Each row is scaled to unit L2 length. The norm is floored at 1e-6, so an
    # all-zero row yields similarity 0 instead of a division by zero.
    unit_rows_1 = F.normalize(torch.tensor(emb1.values).to(device), p=2, dim=1, eps=1e-6)
    unit_rows_2 = F.normalize(torch.tensor(emb2.values).to(device), p=2, dim=1, eps=1e-6)

    # Dot products of unit vectors are the cosine similarities.
    return unit_rows_1 @ unit_rows_2.T

In [None]:
def _descending_rank(similarity):
    """Return 1-based ranks within each row, where rank 1 is the largest value."""
    # The first argsort (on negated scores) orders columns from highest to
    # lowest similarity; the second converts that ordering into each column's
    # position in it.
    return (-similarity).argsort(dim=1).argsort(dim=1) + 1


if n_file == 0:
    raise RuntimeError('No embedding repeats found in ' + emb_dir)

# Running sums of the per-repeat ranks, one matrix per pair of node types.
rank_sum_adr_gene = torch.zeros([adr_node.shape[0], gene_node.shape[0]]).to(device)
rank_sum_dp_gene = torch.zeros([dp_node.shape[0], gene_node.shape[0]]).to(device)
rank_sum_adr_dp = torch.zeros([adr_node.shape[0], dp_node.shape[0]]).to(device)

for i in tqdm(range(n_file)):

    # Read the embeddings for repeat i (files are numbered from 1)
    rep_suffix = '_rep' + str(i + 1) + '.csv'
    embedding_adr = pd.read_csv(os.path.join(emb_dir, 'emb_adr' + rep_suffix), header=None)
    embedding_gene = pd.read_csv(os.path.join(emb_dir, 'emb_gene' + rep_suffix), header=None)
    embedding_dp = pd.read_csv(os.path.join(emb_dir, 'emb_dp' + rep_suffix), header=None)

    # Label each embedding row with the first column of its node table; the
    # cells that save the results reuse these indexes as row/column labels.
    embedding_adr.index = adr_node.iloc[:, 0]
    embedding_dp.index = dp_node.iloc[:, 0]
    embedding_gene.index = gene_node.iloc[:, 0]

    # Compute cosine similarity
    adr_gene = cosine_sim(embedding_adr, embedding_gene, device=device)
    dp_gene = cosine_sim(embedding_dp, embedding_gene, device=device)
    adr_dp = cosine_sim(embedding_adr, embedding_dp, device=device)

    # Rank within each row (1 = most similar)
    rank_adr_gene = _descending_rank(adr_gene)
    rank_dp_gene = _descending_rank(dp_gene)
    rank_adr_dp = _descending_rank(adr_dp)

    # Accumulate the ranks over repeats
    rank_sum_adr_gene = rank_sum_adr_gene + rank_adr_gene
    rank_sum_dp_gene = rank_sum_dp_gene + rank_dp_gene
    rank_sum_adr_dp = rank_sum_adr_dp + rank_adr_dp

# Mean rank over all repeats
rank_mean_adr_gene = rank_sum_adr_gene / n_file
rank_mean_dp_gene = rank_sum_dp_gene / n_file
rank_mean_adr_dp = rank_sum_adr_dp / n_file

In [17]:
# Move the mean ADR-gene ranks to host memory and wrap them in a labelled
# table: rows carry embedding_adr.index, columns carry embedding_gene.index.
rank_mean_adr_gene_df = pd.DataFrame(
    rank_mean_adr_gene.cpu().numpy(),
    index=embedding_adr.index,
    columns=embedding_gene.index,
)

# Write the table as comma-separated text into result_dir.
adr_gene_path = os.path.join(result_dir, 'adr_gene.csv')
rank_mean_adr_gene_df.to_csv(adr_gene_path, sep=',')

In [18]:
# Move the mean DP-gene ranks to host memory and wrap them in a labelled
# table: rows carry embedding_dp.index, columns carry embedding_gene.index.
rank_mean_dp_gene_df = pd.DataFrame(
    rank_mean_dp_gene.cpu().numpy(),
    index=embedding_dp.index,
    columns=embedding_gene.index,
)

# Write the table as comma-separated text into result_dir.
dp_gene_path = os.path.join(result_dir, 'dp_gene.csv')
rank_mean_dp_gene_df.to_csv(dp_gene_path, sep=',')

In [None]:
# Move the mean ADR-DP ranks to host memory and wrap them in a labelled
# table: rows carry embedding_adr.index, columns carry embedding_dp.index.
rank_mean_adr_dp_df = pd.DataFrame(
    rank_mean_adr_dp.cpu().numpy(),
    index=embedding_adr.index,
    columns=embedding_dp.index,
)

# Write the table as comma-separated text into result_dir.
adr_dp_path = os.path.join(result_dir, 'adr_dp.csv')
rank_mean_adr_dp_df.to_csv(adr_dp_path, sep=',')