In [1]:
import glob, h5py, os, re, sys, time
from pathlib import Path
from filelock import FileLock

import numpy as np
import pandas as pd
from multiprocess import Pool
from tqdm.notebook import tqdm
from six.moves import cPickle
import sh
from tqdm import tqdm

from acme.kmer import kmer_featurization
from acme import interval
from acme import utils

2022-10-07 08:37:29.735676: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR.joinpath("data/atac/cell_line_testsets")
evaluation_path = BASE_DIR.joinpath("data/atac/atac_model_pearson.csv")
saliency_dir = BASE_DIR.joinpath("data/atac/saliency_repo")
saliency_subset_dir = BASE_DIR.joinpath("data/atac/saliency_subset")

# cell line paths
cell_line_dict = {
    "A549": f"{DATA_DIR}/cell_line_8.h5",
    "HCT116": f"{DATA_DIR}/cell_line_9.h5",
    "GM12878": f"{DATA_DIR}/cell_line_7.h5",
    "K562": f"{DATA_DIR}/cell_line_5.h5",
    "PC-3": f"{DATA_DIR}/cell_line_13.h5",
    "HepG2": f"{DATA_DIR}/cell_line_2.h5"
}

model_dict = {
    "new_models_Residual_32_task_Exp": "residual_32_task_exp",
    "new_models_CNN_1_all_Exp": "cnn_base_all_exp",
    "new_models_Residual_32_all_Exp": "residual_32_all_exp",
    "new_models_CNN_32_all_Exp": "cnn_32_all_exp",
    "new_models_Residual_32_task_ReLU": "residual_32_task_relu",
    "binary_basenji_binary_exp": "binary_basenji_exp",
    "bpnet_augmentation_48": "bpnet",
    "binary_residual_binary": "binary_residual_relu",
    "new_models_Residual_1_task_Exp": "residual_base_task_exp",
    "binary_basenji_binary": "binary_basenji_relu",
    "new_models_CNN_32_task_Exp": "cnn_32_task_exp",
    "new_models_Residual_1_all_Exp": "residual_base_all_exp",
    "binary_residual_binary_exp": "binary_residual_exp",
    "new_models_CNN_1_task_ReLU": "cnn_base_task_relu",
    "binary_conv_binary_exp": "binary_cnn_exp",
    "binary_basset_exp": "binary_basset_exp",
    "new_models_Residual_1_all_ReLU": "residual_base_all_relu",
    "binary_basset": "binary_basset_relu",
    "new_models_CNN_32_task_ReLU": "cnn_32_task_relu",
    "new_models_CNN_32_all_ReLU": "cnn_32_all_relu",
    "new_models_CNN_1_task_Exp": "cnn_base_task_exp",
    "new_models_Residual_1_task_ReLU": "residual_base_task_relu",
    "basenji_v2_binloss_basenji_v2": "basenji_v2_binloss_relu",
    "new_models_Residual_32_all_ReLU": "residual_32_all_relu",
    "new_models_CNN_1_all_ReLU": "cnn_base_all_relu",
    "binary_conv_binary": "binary_cnn_relu"
}

print(BASE_DIR, evaluation_path, DATA_DIR, saliency_dir)

/shared/data00/acme /shared/data00/acme/data/atac/atac_model_pearson.csv /shared/data00/acme/data/atac/cell_line_testsets /shared/data00/acme/data/atac/saliency_repo


In [3]:
# df = utils.make_directories_atac(out_dir=out_dir, source_paths=saliency_dir)

In [3]:
df = utils.get_model_info(saliency_dir=saliency_subset_dir)

In [5]:
df

Unnamed: 0,model,cell_line,cell_line_dir,attr_map_path,task_type,activation
0,binary_basset_relu,GM12878,/shared/data00/acme/data/atac/cell_line_testse...,/shared/data00/acme/data/atac/saliency_subset/...,binary,relu
1,binary_residual_relu,GM12878,/shared/data00/acme/data/atac/cell_line_testse...,/shared/data00/acme/data/atac/saliency_subset/...,binary,relu
2,binary_cnn_relu,GM12878,/shared/data00/acme/data/atac/cell_line_testse...,/shared/data00/acme/data/atac/saliency_subset/...,binary,relu
3,binary_basenji_relu,GM12878,/shared/data00/acme/data/atac/cell_line_testse...,/shared/data00/acme/data/atac/saliency_subset/...,binary,relu
4,binary_cnn_exp,GM12878,/shared/data00/acme/data/atac/cell_line_testse...,/shared/data00/acme/data/atac/saliency_subset/...,binary,exp
5,binary_basenji_exp,GM12878,/shared/data00/acme/data/atac/cell_line_testse...,/shared/data00/acme/data/atac/saliency_subset/...,binary,exp
6,binary_basset_exp,GM12878,/shared/data00/acme/data/atac/cell_line_testse...,/shared/data00/acme/data/atac/saliency_subset/...,binary,exp
7,binary_residual_exp,GM12878,/shared/data00/acme/data/atac/cell_line_testse...,/shared/data00/acme/data/atac/saliency_subset/...,binary,exp
8,cnn_32_all_relu,GM12878,/shared/data00/acme/data/atac/cell_line_testse...,/shared/data00/acme/data/atac/saliency_subset/...,quantitative,relu
9,cnn_32_task_relu,GM12878,/shared/data00/acme/data/atac/cell_line_testse...,/shared/data00/acme/data/atac/saliency_subset/...,quantitative,relu


In [6]:
out_dir = BASE_DIR.joinpath("data/atac/saliency_subset")

cell_subset = ["GM12878"]


for i in range(len(df)):


    model = df.iloc[i]['model']
    model = model_dict[model]
    attr_map_path = df.iloc[i]['attr_map_path']
    cell_line_dir = cell_line_dict[df.iloc[i]['cell_line']]
    cell_line = df.iloc[i]['cell_line']

    task_type = df.iloc[i]['task_type']
    activation = df.iloc[i]['activation']


    if(cell_line in cell_subset):

            out_path = f"{out_dir}/{cell_line}/{task_type}/{activation}" #/{model}"
            Path(f"{out_path}").mkdir(parents=True, exist_ok=True)

            # copy the files to the subset path
#             sh.cp(attr_map_path, f"{out_path}/{model}.pickle")

# Evaluate consistency

In [7]:
cell_subset = ["GM12878"]


for i in tqdm(range(len(df))):


    model = df.iloc[i]['model']
    model = model_dict[model]
    attr_map_path = df.iloc[i]['attr_map_path']
    cell_line_dir = cell_line_dict[df.iloc[i]['cell_line']]
    cell_line = df.iloc[i]['cell_line']

    task_type = df.iloc[i]['task_type']
    activation = df.iloc[i]['activation']


    if(cell_line in cell_subset):

            new_path = f"{out_dir}/{cell_line}/{task_type}/{activation}/{model}.pickle"
            
            a1, _, _ = utils.load_data(attr_map_path=attr_map_path, cell_line_dir=cell_line_dir)
            a2, _, _ = utils.load_data(attr_map_path=new_path, cell_line_dir=cell_line_dir)
            
            if not (np.array_equal(a1, a2)):
                print(f"Not equal for model: {attr_map_path}")

        
            



  attr_map = attr_map / np.sqrt(np.sum(np.sum(np.square(attr_map), axis=-1, keepdims=True), axis=-2, keepdims=True))
 60%|█████████████████████████████████████████████████████████████████████▎                                             | 94/156 [00:20<00:14,  4.31it/s]

Not equal for model: /shared/data00/chandana/acme/data/atac/saliency_repo/binary_basset_exp_GM12878.pickle


 66%|███████████████████████████████████████████████████████████████████████████▎                                      | 103/156 [00:23<00:14,  3.71it/s]

Not equal for model: /shared/data00/chandana/acme/data/atac/saliency_repo/binary_basset_GM12878.pickle


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 156/156 [00:33<00:00,  4.64it/s]


In [None]:
# This is because these models contain NaNs!

# Rename models in the evaluation results

In [6]:
df = pd.read_csv("/shared/data00/acme/data/atac/atac_model_pearson.csv")
df

Unnamed: 0.1,Unnamed: 0,model_name,A549,HCT116,GM12878,K562,PC-3,HepG2
0,0,binary_basenji_binary_exp,0.631905,0.634136,0.556909,0.610139,0.498113,0.612955
1,1,binary_basset,0.644551,0.623511,0.540928,0.592135,0.492168,0.586155
2,2,binary_conv_binary,0.64881,0.6225,0.521219,0.605254,0.505533,0.598627
3,3,binary_basset_exp,0.631002,0.62031,0.51473,0.598176,0.500083,0.569723
4,4,binary_residual_binary_exp,0.63987,0.645721,0.606411,0.606989,0.578999,0.635519
5,5,binary_residual_binary,0.638054,0.651104,0.606609,0.612077,0.578736,0.645325
6,6,binary_basenji_binary,0.592449,0.592355,0.496773,0.51407,0.450311,0.550426
7,7,binary_conv_binary_exp,0.641775,0.64346,0.580139,0.607368,0.552579,0.621994
8,8,new_models_Residual_1_task_Exp,0.754242,0.719093,0.66957,0.687969,0.584345,0.673214
9,9,new_models_Residual_1_task_ReLU,0.735452,0.704582,0.645654,0.683041,0.538408,0.667204


In [8]:
new_model_names = [model_dict[i] for i in df["model_name"]]

In [9]:
df.insert(1, "new_model_name", new_model_names)
df

Unnamed: 0.1,Unnamed: 0,new_model_name,model_name,A549,HCT116,GM12878,K562,PC-3,HepG2
0,0,binary_basenji_exp,binary_basenji_binary_exp,0.631905,0.634136,0.556909,0.610139,0.498113,0.612955
1,1,binary_basset_relu,binary_basset,0.644551,0.623511,0.540928,0.592135,0.492168,0.586155
2,2,binary_cnn_relu,binary_conv_binary,0.64881,0.6225,0.521219,0.605254,0.505533,0.598627
3,3,binary_basset_exp,binary_basset_exp,0.631002,0.62031,0.51473,0.598176,0.500083,0.569723
4,4,binary_residual_exp,binary_residual_binary_exp,0.63987,0.645721,0.606411,0.606989,0.578999,0.635519
5,5,binary_residual_relu,binary_residual_binary,0.638054,0.651104,0.606609,0.612077,0.578736,0.645325
6,6,binary_basenji_relu,binary_basenji_binary,0.592449,0.592355,0.496773,0.51407,0.450311,0.550426
7,7,binary_cnn_exp,binary_conv_binary_exp,0.641775,0.64346,0.580139,0.607368,0.552579,0.621994
8,8,residual_base_task_exp,new_models_Residual_1_task_Exp,0.754242,0.719093,0.66957,0.687969,0.584345,0.673214
9,9,residual_base_task_relu,new_models_Residual_1_task_ReLU,0.735452,0.704582,0.645654,0.683041,0.538408,0.667204


In [10]:
# Save to csv
# df.to_csv("/shared/data00/acme/data/atac/atac_model_pearson_copy.csv", index=None)

In [11]:
# double check for accuracy
pd.read_csv("/shared/data00/acme/data/atac/atac_model_pearson_copy.csv")

Unnamed: 0.1,Unnamed: 0,new_model_name,model_name,A549,HCT116,GM12878,K562,PC-3,HepG2
0,0,binary_basenji_exp,binary_basenji_binary_exp,0.631905,0.634136,0.556909,0.610139,0.498113,0.612955
1,1,binary_basset_relu,binary_basset,0.644551,0.623511,0.540928,0.592135,0.492168,0.586155
2,2,binary_cnn_relu,binary_conv_binary,0.64881,0.6225,0.521219,0.605254,0.505533,0.598627
3,3,binary_basset_exp,binary_basset_exp,0.631002,0.62031,0.51473,0.598176,0.500083,0.569723
4,4,binary_residual_exp,binary_residual_binary_exp,0.63987,0.645721,0.606411,0.606989,0.578999,0.635519
5,5,binary_residual_relu,binary_residual_binary,0.638054,0.651104,0.606609,0.612077,0.578736,0.645325
6,6,binary_basenji_relu,binary_basenji_binary,0.592449,0.592355,0.496773,0.51407,0.450311,0.550426
7,7,binary_cnn_exp,binary_conv_binary_exp,0.641775,0.64346,0.580139,0.607368,0.552579,0.621994
8,8,residual_base_task_exp,new_models_Residual_1_task_Exp,0.754242,0.719093,0.66957,0.687969,0.584345,0.673214
9,9,residual_base_task_relu,new_models_Residual_1_task_ReLU,0.735452,0.704582,0.645654,0.683041,0.538408,0.667204
