In [1]:
import subprocess
import os

import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from collections import Counter

In [None]:
# Environment sanity check: active conda environment, GPU status table,
# and whether torch can see a CUDA device.
active_conda_env = os.environ.get('CONDA_DEFAULT_ENV')
print(active_conda_env)

# check_output raises if `nvidia-smi` is missing or exits with an error.
raw_smi_bytes = subprocess.check_output(['nvidia-smi'])
nvidia_smi_output = raw_smi_bytes.decode('utf-8')
print(nvidia_smi_output)

# Last expression of the cell, so the notebook displays its value.
torch.cuda.is_available()

In [3]:
# Paths (relative to the notebook's working directory).
data_dir = '../data/'
emb_dir = os.path.join(data_dir, 'embedding/emb_GraphConv')

# Number of embedding files to process. The accumulation loop reads
# '1.csv' ... '<n_file>.csv', so only '*.csv' entries are counted: stray
# entries in the folder (e.g. '.ipynb_checkpoints', '.DS_Store') would
# otherwise make the loop ask for a file that does not exist.
n_file = sum(1 for entry in os.listdir(emb_dir) if entry.endswith('.csv'))

# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs (slowly) on a machine without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [5]:
def cosine_sim(emb1, emb2, device):
    """Pairwise cosine similarity between the rows of two embedding tables.

    Parameters
    ----------
    emb1 : pandas.DataFrame
        (n1 x m) table, one embedding per row.
    emb2 : pandas.DataFrame
        (n2 x m) table, one embedding per row.
    device : torch.device
        Device on which the computation is carried out.

    Returns
    -------
    torch.Tensor
        (n1 x n2) matrix on `device`; entry [i, j] is the cosine similarity of
        row i of `emb1` and row j of `emb2`. Rows whose L2 norm is below 1e-6
        (e.g. all-zero rows) are divided by 1e-6 instead of their norm, so an
        all-zero row yields similarity 0 with everything.
    """

    def _unit_rows(frame):
        # Move the values to the device and scale every row to unit L2 length;
        # the clamp guards against division by zero.
        rows = torch.tensor(frame.values).to(device)
        row_lengths = rows.norm(p=2, dim=1, keepdim=True)
        return rows / row_lengths.clamp(min=1e-6)

    left_unit = _unit_rows(emb1)
    right_unit = _unit_rows(emb2)

    # Dot products of unit vectors are cosine similarities.
    return torch.mm(left_unit, right_unit.T)


In [6]:
# Conversion table; the cells below use its 'Names', 'Nodes' and 'Symbole' columns.
conv_table = pd.read_csv(os.path.join(data_dir, "conversionTable.csv"))

# Split the table by the identifier prefix found in 'Names'.
# regex=False: the '.' is meant literally; as a regular expression it would
# match any character (e.g. 'hpo.' would also match 'hpox').
# na=False: rows with a missing name are excluded instead of breaking the mask.
# NOTE(review): presumably meddra = ADRs, hpo = DPs, entrez = genes, going by
# the variable names -- confirm against the table.
conv_adr = conv_table[conv_table['Names'].str.contains('meddra.', regex=False, na=False)]
conv_dp = conv_table[conv_table['Names'].str.contains('hpo.', regex=False, na=False)]
conv_gene = conv_table[conv_table['Names'].str.contains('entrez.', regex=False, na=False)]

In [7]:
# Fixed, ascending node order for each node type. Every embedding file is
# reindexed onto these lists so that matrices from different files line up.
universal_index_adr, universal_index_gene, universal_index_dp = (
    table['Nodes'].sort_values().to_list()
    for table in (conv_adr, conv_gene, conv_dp)
)

In [None]:
def _zeros_on_device(n_rows, n_cols):
    """Return an (n_rows x n_cols) zero matrix allocated directly on `device`."""
    return torch.zeros((n_rows, n_cols), device=device)


def _align_embeddings(embeddings, universal_index):
    """Put one embedding file onto a fixed node order.

    Returns ``(aligned, missing)``: `aligned` has one row per entry of
    `universal_index` with every NaN replaced by 0; `missing` is a boolean
    Series flagging the rows that were entirely NaN after reindexing (e.g.
    nodes that have no row in this file).
    """
    aligned = embeddings.reindex(index=universal_index)
    missing = aligned.isna().all(axis=1)
    return aligned.fillna(0), missing


def _rank_descending(sim, per_row=True):
    """1-based ranks of the similarities, largest similarity = rank 1.

    With ``per_row=True`` each row is ranked on its own; with ``per_row=False``
    all entries of the matrix are ranked together.
    """
    if per_row:
        return (-sim).argsort(dim=1).argsort(dim=1) + 1
    return ((-sim).flatten().argsort().argsort() + 1).reshape(sim.shape)


def _presence_indicator(missing_rows, missing_cols, dtype):
    """Matrix that is 1 where both the row node and the column node are present, else 0."""
    present = (~missing_rows).unsqueeze(1) & (~missing_cols).unsqueeze(0)
    return present.to(dtype)


n_adr, n_dp, n_gene = conv_adr.shape[0], conv_dp.shape[0], conv_gene.shape[0]

# Running sums of ranks over all embedding files.
rank_sum_adr_gene = _zeros_on_device(n_adr, n_gene)
rank_sum_dp_gene = _zeros_on_device(n_dp, n_gene)
rank_sum_adr_dp = _zeros_on_device(n_adr, n_dp)
# The adr_adr / dp_dp accumulators are allocated but never updated in this
# cell; they are kept in case other cells rely on them.
rank_sum_adr_adr = _zeros_on_device(n_adr, n_adr)
rank_sum_dp_dp = _zeros_on_device(n_dp, n_dp)

# Number of files in which each pair was present (denominator of the mean rank).
index_adr_gene = _zeros_on_device(n_adr, n_gene)
index_dp_gene = _zeros_on_device(n_dp, n_gene)
index_adr_dp = _zeros_on_device(n_adr, n_dp)
index_adr_adr = _zeros_on_device(n_adr, n_adr)
index_dp_dp = _zeros_on_device(n_dp, n_dp)

for i in tqdm(range(n_file)):

    # Files are named 1.csv ... n_file.csv and have no header row, so the row
    # labels are the 0-based row positions.
    # NOTE(review): this assumes 'Nodes' values are 0-based row positions in
    # the embedding files -- confirm.
    df = pd.read_csv(os.path.join(emb_dir, str(i + 1) + '.csv'), header=None)

    # Separate ADRs, DPs and genes and align each onto its universal node
    # order. Reindexing alone selects the wanted rows, because the universal
    # indices are the sorted 'Nodes' of the corresponding conversion table.
    embedding_adr_aligned, I_narows_adr = _align_embeddings(df, universal_index_adr)
    embedding_dp_aligned, I_narows_dp = _align_embeddings(df, universal_index_dp)
    embedding_gene_aligned, I_narows_gene = _align_embeddings(df, universal_index_gene)

    # Boolean masks on the device, used in place of Python lists of bools.
    missing_adr = torch.tensor(I_narows_adr.to_numpy(), device=device)
    missing_dp = torch.tensor(I_narows_dp.to_numpy(), device=device)
    missing_gene = torch.tensor(I_narows_gene.to_numpy(), device=device)

    # Cosine similarity for every pair of node types of interest.
    adr_gene = cosine_sim(embedding_adr_aligned, embedding_gene_aligned, device=device)
    dp_gene = cosine_sim(embedding_dp_aligned, embedding_gene_aligned, device=device)
    adr_dp = cosine_sim(embedding_adr_aligned, embedding_dp_aligned, device=device)

    # Ranks (1 = most similar).
    # NOTE(review): missing nodes have all-zero embeddings, so they get
    # similarity 0 and still take part in the ranking; only their own ranks are
    # zeroed out below. Confirm this is the intended treatment.
    # NOTE(review): adr_dp is ranked over the whole matrix while the other two
    # are ranked per row -- kept as in the original; confirm it is intentional.
    rank_adr_gene = _rank_descending(adr_gene)
    rank_dp_gene = _rank_descending(dp_gene)
    rank_adr_dp = _rank_descending(adr_dp, per_row=False)

    # 1 where both nodes of a pair exist in this file, 0 otherwise.
    ind_adr_gene = _presence_indicator(missing_adr, missing_gene, rank_adr_gene.dtype)
    ind_dp_gene = _presence_indicator(missing_dp, missing_gene, rank_dp_gene.dtype)
    ind_adr_dp = _presence_indicator(missing_adr, missing_dp, rank_adr_dp.dtype)

    # Pairs with a missing node contribute neither a rank nor a count.
    rank_adr_gene = rank_adr_gene * ind_adr_gene
    rank_dp_gene = rank_dp_gene * ind_dp_gene
    rank_adr_dp = rank_adr_dp * ind_adr_dp

    # Accumulate rank sums and presence counts in place.
    rank_sum_adr_gene += rank_adr_gene
    rank_sum_dp_gene += rank_dp_gene
    rank_sum_adr_dp += rank_adr_dp

    index_adr_gene += ind_adr_gene
    index_dp_gene += ind_dp_gene
    index_adr_dp += ind_adr_dp


In [10]:
# Mean rank per pair = summed rank / number of files in which the pair was
# present. The clamp keeps the division defined for pairs that were never
# present (count 0); their rank sum is 0 as well, so the mean comes out as 0.
rank_mean_adr_gene, rank_mean_dp_gene, rank_mean_adr_dp = (
    rank_total / n_present.clamp(min=1e-6)
    for rank_total, n_present in (
        (rank_sum_adr_gene, index_adr_gene),
        (rank_sum_dp_gene, index_dp_gene),
        (rank_sum_adr_dp, index_adr_dp),
    )
)

In [11]:
# Move the mean-rank matrices to host memory and label rows/columns with node
# ids. The labels come from the universal indices, which is the order every
# embedding file was aligned to in the loop, rather than from the
# `embedding_*_aligned` frames left over from the loop's last iteration
# (those do not exist if the loop ran zero times).
rank_mean_adr_gene_df = pd.DataFrame(
    rank_mean_adr_gene.cpu().numpy(),
    index=universal_index_adr,
    columns=universal_index_gene,
)

rank_mean_dp_gene_df = pd.DataFrame(
    rank_mean_dp_gene.cpu().numpy(),
    index=universal_index_dp,
    columns=universal_index_gene,
)

rank_mean_adr_dp_df = pd.DataFrame(
    rank_mean_adr_dp.cpu().numpy(),
    index=universal_index_adr,
    columns=universal_index_dp,
)

In [12]:
# Lookup dictionaries from node id ('Nodes') to the 'Symbole' column of the
# conversion table, one per node type.
index_mapping_adr, index_mapping_dp, index_mapping_gene = (
    table.set_index('Nodes')['Symbole'].to_dict()
    for table in (conv_adr, conv_dp, conv_gene)
)

In [13]:
# Replace node ids by their symbols on both axes.
# `rename` is used for rows as well as columns: labels that are not keys of
# the mapping are left untouched, so re-running this cell is harmless, whereas
# `Index.map` would turn already-renamed labels into NaN on a second run.
rank_mean_adr_gene_df = rank_mean_adr_gene_df.rename(
    index=index_mapping_adr, columns=index_mapping_gene
)
rank_mean_dp_gene_df = rank_mean_dp_gene_df.rename(
    index=index_mapping_dp, columns=index_mapping_gene
)
rank_mean_adr_dp_df = rank_mean_adr_dp_df.rename(
    index=index_mapping_adr, columns=index_mapping_dp
)

In [14]:
# `result_dir` was used here without being defined in any earlier cell, which
# raises NameError on Restart & Run All.
# NOTE(review): the location below is a placeholder default -- point it at the
# intended output folder.
result_dir = os.path.join(data_dir, 'result')
os.makedirs(result_dir, exist_ok=True)  # to_csv does not create missing folders

# One CSV per node-type pair: rows and columns are labelled, values are mean ranks.
rank_mean_adr_gene_df.to_csv(os.path.join(result_dir, 'adr_gene.csv'))
rank_mean_dp_gene_df.to_csv(os.path.join(result_dir, 'dp_gene.csv'))
rank_mean_adr_dp_df.to_csv(os.path.join(result_dir, 'adr_dp.csv'))