In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from scipy.stats import pearsonr
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

import pymde
import hdbscan
from sklearn.preprocessing import StandardScaler

In [2]:
# Test-set metrics for every (network, sample split, random init) combination.
# After the np.array(...) conversion at the end of this cell each of these has
# shape (n_networks, 10, 10) = (network, split `rs`, repeat `rep`):
#   p_*      Pearson r between Y_test and the prediction
#   loss_*   MSE between Y_test and the prediction
#   *_net_*  same metric restricted to test genes that appear in the DAG
#   bool_nw  True where the DAG-restricted Pearson r is not NaN
p_nw = []
p_net_nw = []
loss_nw = []
loss_net_nw = []
bool_nw = []

# Genes that appear as either endpoint of an edge in the DAGMA graph.
# These two files do not depend on the split or the repeat, so they are read
# once here instead of once per split.
# NOTE(review): the edge-list path is hard-coded rather than derived from
# `network_name`; revisit if more networks are added to the loop below.
dag = pd.read_csv("../../result/network_perturb_phyloP/DAGMA_thresholdAdaptive.tsv", sep="\t", header=None)
id2genes = pd.read_csv("../../result/network_perturb_phyloP/valid_genes", sep="\t").set_index("ID")['genes'].to_dict()
dag[0] = dag[0].map(id2genes)
dag[1] = dag[1].map(id2genes)
dag_genes = list(set.union(set(dag[0]), set(dag[1])))

for network_name in ["DAGMA_thresholdAdaptive"]:

    # measurements across 10 random sample splits
    p_rs = []
    p_net_rs = []
    loss_rs = []
    loss_net_rs = []
    bool_rs = []

    for rs in range(10):
        # read data (the Y_test file is parsed once and used for both the
        # target values and the gene index)
        input_dir = "../../result/input_perturb_phyloP/%d" % rs
        X_train = pd.read_csv(input_dir + "/X_train_stratified", sep="\t", index_col=0).values
        X_valid = pd.read_csv(input_dir + "/X_valid_stratified", sep="\t", index_col=0).values
        X_test = pd.read_csv(input_dir + "/X_test_stratified", sep="\t", index_col=0).values
        Y_test_df = pd.read_csv(input_dir + "/Y_test_stratified", sep="\t", index_col=0)
        Y_test = Y_test_df.values.reshape(-1)
        Y_test_gene = Y_test_df.index

        # Selects the test rows of `out`; presumably `out` stacks the
        # train, valid and test samples in that order -- TODO confirm
        # against the training script.
        test_mask = np.concatenate([[False] * len(X_train), [False] * len(X_valid), [True] * len(X_test)])

        # mask of whether a test node is in the DAGMA graph
        test_inDAG = Y_test_gene.isin(dag_genes)

        # measurements across 10 random init repeats
        p_rep = []
        p_net_rep = []
        loss_rep = []
        loss_net_rep = []

        for rep in range(10):
            prefix = "../../result/model_perturb_phyloP/%s/%d/model%d" % (network_name, rs, rep)
            # CAUTION: pickle.load can execute arbitrary code; only load
            # .para files produced by this project.
            with open(prefix+".para", "rb") as f:
                best_params, X1, out, loss1, loss2 = pickle.load(f)
            Y_pred = out[test_mask].reshape(-1)

            loss_rep.append(F.mse_loss(torch.tensor(Y_test), torch.tensor(Y_pred)).item())
            loss_net_rep.append(F.mse_loss(torch.tensor(Y_test[test_inDAG]), torch.tensor(Y_pred[test_inDAG])).item())
            p_rep.append(pearsonr(Y_test, Y_pred)[0])
            p_net_rep.append(pearsonr(Y_test[test_inDAG], Y_pred[test_inDAG])[0])
        # flag the repeats whose DAG-restricted correlation is defined
        bool_rep = ~np.isnan(np.array(p_net_rep))

        p_rs.append(p_rep)
        p_net_rs.append(p_net_rep)
        loss_rs.append(loss_rep)
        loss_net_rs.append(loss_net_rep)
        bool_rs.append(bool_rep)

    p_nw.append(p_rs)
    p_net_nw.append(p_net_rs)
    loss_nw.append(loss_rs)
    loss_net_nw.append(loss_net_rs)
    bool_nw.append(bool_rs)

p_nw = np.array(p_nw)
p_net_nw = np.array(p_net_nw)
loss_nw = np.array(loss_nw)
loss_net_nw = np.array(loss_net_nw)
bool_nw = np.array(bool_nw)



In [3]:
# Flat (row-major) index of the lowest non-NaN test MSE over the
# (split, repeat) grid of the first network.
np.nanargmin(loss_nw[0], axis=None)

89

In [4]:
network_name = "DAGMA_thresholdAdaptive"

# Reload the (split, repeat) pair with the lowest test MSE. The pair is derived
# from `loss_nw` instead of being typed in by hand, so it cannot go stale when
# the results on disk change; a flat index of 89 on the 10 x 10 grid
# corresponds to rs = 8, rep = 9.
rs, rep = (int(i) for i in np.unravel_index(np.nanargmin(loss_nw[0]), loss_nw[0].shape))

# read data (the Y_test file is parsed once and used for both the target
# values and the gene index)
input_dir = "../../result/input_perturb_phyloP/%d" % rs
X_train = pd.read_csv(input_dir + "/X_train_stratified", sep="\t", index_col=0).values
X_valid = pd.read_csv(input_dir + "/X_valid_stratified", sep="\t", index_col=0).values
X_test = pd.read_csv(input_dir + "/X_test_stratified", sep="\t", index_col=0).values
Y_test_df = pd.read_csv(input_dir + "/Y_test_stratified", sep="\t", index_col=0)
Y_test = Y_test_df.values.reshape(-1)
Y_test_gene = Y_test_df.index

# Selects the test rows of `out`; presumably `out` stacks the train, valid and
# test samples in that order -- TODO confirm against the training script.
test_mask = np.concatenate([[False] * len(X_train), [False] * len(X_valid), [True] * len(X_test)])

# mask of whether a test node is in the DAGMA graph
dag = pd.read_csv("../../result/network_perturb_phyloP/DAGMA_thresholdAdaptive.tsv", sep="\t", header=None)
id2genes = pd.read_csv("../../result/network_perturb_phyloP/valid_genes", sep="\t").set_index("ID")['genes'].to_dict()
dag[0] = dag[0].map(id2genes)
dag[1] = dag[1].map(id2genes)
dag_genes = list(set.union(set(dag[0]), set(dag[1])))
test_inDAG = Y_test_gene.isin(dag_genes)

prefix = "../../result/model_perturb_phyloP/%s/%d/model%d" % (network_name, rs, rep)
# CAUTION: pickle.load can execute arbitrary code; only load .para files
# produced by this project.
with open(prefix+".para", "rb") as f:
    best_params, X1, out, loss1, loss2 = pickle.load(f)
Y_pred = out[test_mask].reshape(-1)

In [5]:
# z-norm expression profile from given data
X = X1
X_norm = StandardScaler().fit_transform(X)

# mde embed
# The embedding is stochastic; fix the seed so that a re-run reproduces the
# same embedding and therefore the same clusters.
pymde.seed(0)
mde = pymde.preserve_neighbors(X_norm, embedding_dim=20, n_neighbors=7, repulsive_fraction=5)
X_norm_se_mde = mde.embed()

# embed() returns a torch tensor; convert it explicitly so pandas and hdbscan
# receive a plain NumPy array regardless of library version or device.
X_emb_np = X_norm_se_mde.detach().cpu().numpy()

# make sure the output directory exists before writing into it
emb_dir = "../../result/emb"
os.makedirs(emb_dir, exist_ok=True)
pd.DataFrame(X_emb_np).to_csv("../../result/emb/mde_dagma.txt", sep="\t", header=False, index=False)

# clustering
clusterer = hdbscan.HDBSCAN(metric='euclidean', min_cluster_size=10, min_samples=10, cluster_selection_method='leaf')
cluster_label_X = clusterer.fit_predict(X_emb_np)
# NOTE: this label-only file is overwritten by the last cell of the notebook,
# which writes gene name + label to the same path.
pd.DataFrame(cluster_label_X).to_csv("../../result/emb/hdbscan_dagma.txt", sep="\t", header=False, index=False)

In [6]:
# Gene list for split `rs`, read without a header row so the first column is
# labelled 0.
# NOTE(review): the file name suggests train, valid, test order -- confirm it
# matches the row order of X1.
genes_path = "../../result/input_perturb_phyloP/%d/genes_train_valid_test_stratified" % rs
genes = pd.read_csv(genes_path, header=None)

In [7]:
# Store the HDBSCAN label of each row as column 1, next to the gene in column 0.
# Assumes the rows of `genes` are in the same order as the rows of X1 that were
# embedded and clustered -- TODO confirm.
genes[1] = cluster_label_X

In [8]:
# Number of distinct labels in column 1. This count includes the -1 label
# (hdbscan's marker for noise points), so the number of real clusters is one
# less whenever any point is labelled -1.
genes[1].nunique(dropna=True)

83

In [9]:
# Size of every label group, largest first.
genes[1].value_counts(sort=True, ascending=False)

1
-1     550
 41     56
 61     48
 10     48
 31     39
      ... 
 60     11
 23     11
 45     11
 81     10
 80     10
Name: count, Length: 83, dtype: int64

In [10]:
# Write gene + cluster label as a headerless two-column TSV. This replaces the
# label-only file that the clustering cell wrote to the same path.
genes.to_csv(
    "../../result/emb/hdbscan_dagma.txt",
    sep="\t",
    header=False,
    index=False,
)