In [31]:
import os
import csv
import json
import pickle

import torch
import numpy as np
import pandas as pd
import hickle as hkl
from tqdm import tqdm
import deepchem as dc
from rdkit import Chem
import scipy.sparse as sp
from torch_geometric.utils import dense_to_sparse

In [2]:
TCGA_label_set = [
    "ALL","BLCA","BRCA","CESC","DLBC","LIHC","LUAD",
    "ESCA","GBM","HNSC","KIRC","LAML","LCML","LGG",
    "LUSC","MESO","MM","NB","OV","PAAD","SCLC","SKCM",
    "STAD","THCA",'COAD/READ'
    ]

In [3]:
BASE_DIR = os.path.join("..", "data", "source")

drug_info_file = os.path.join(BASE_DIR, "GDSC", "1.Drug_listMon Jun 24 09_00_55 2019.csv")
cell_line_info_file = os.path.join(BASE_DIR, "CCLE", "Cell_lines_annotations_20181226.txt")
drug_feature_dir = os.path.join(BASE_DIR, "GDSC", "drug_graph_feat")
genomic_mutation_file = os.path.join(BASE_DIR, "CCLE", "genomic_mutation_34673_demap_features.csv")
cancer_response_exp_file = os.path.join(BASE_DIR, "CCLE", "GDSC_IC50.csv")
gene_expression_file = os.path.join(BASE_DIR, "CCLE", "genomic_expression_561celllines_697genes_demap_features.csv")
methylation_file = os.path.join(BASE_DIR, "CCLE", "genomic_methylation_561celllines_808genes_demap_features.csv")
IC50_thred_file = os.path.join(BASE_DIR, "CCLE", "IC50_thred.txt")

max_atoms = 100

## Drug to pubchem id

In [4]:
reader = csv.reader(open(drug_info_file, "r"))

rows = [row for row in reader]

drug2pubchemid = {item[0]: item[5] for item in rows if item[5].isdigit()}
name2pubchemid = {item[1]: item[5] for item in rows if item[5].isdigit()}
drug_names = {item.strip() for item in open(IC50_thred_file).readlines()[0].split("\t")}
ic50_threds = {float(item.strip()) for item in open(IC50_thred_file).readlines()[1].split("\t")}

# pubchemid as keys
drug2thred = {name2pubchemid[a]: b for a, b in zip(drug_names, ic50_threds) if a in name2pubchemid.keys()}

## cell line to cancer type

In [5]:
cellline2cancertype = {}

for line in open(cell_line_info_file).readlines()[1:]:
    celline_id = line.split("\t")[1].strip()
    TCGA_label = line.split("\t")[-1].strip()

    cellline2cancertype[celline_id] = TCGA_label

In [6]:
use_treshold = True

mutation_feature = pd.read_csv(genomic_mutation_file, sep=",",header=0, index_col=[0])

drug_pubchem_id_set = []
drug_feature = {}
for graph_file in os.listdir(drug_feature_dir):
    pubchem_id = graph_file.split(".")[0]
    
    drug_pubchem_id_set.append(pubchem_id)
    feat_mat, adj_list, degree_list = hkl.load(os.path.join(drug_feature_dir, graph_file))

    drug_feature[pubchem_id] = [feat_mat, adj_list, degree_list]

gene_expression_feature = pd.read_csv(gene_expression_file, sep=",", header=0, index_col=[0])

# use gene expression cell lines as reference and only extract mutation entries with existing mutation entries
mutation_feature = mutation_feature.loc[list(gene_expression_feature.index)]

methylation_feature = pd.read_csv(methylation_file, sep=",", header=0, index_col=[0])

assert len(gene_expression_feature) == len(mutation_feature) == len(methylation_feature)

# experiment data is the interaction between drugs and cell lines with the IC50 values per drug-cell line pair
experiment_data = pd.read_csv(cancer_response_exp_file, sep=",", header=0, index_col=[0])

# valid drugs are drugs with pubchem ids
valid_drugs = [d for d in experiment_data.index if d.split(":")[-1] in drug2pubchemid.keys()]
experiment_data = experiment_data.loc[valid_drugs]

data_idx = []
for drug in experiment_data.index:
    drug_id = drug.split(":")[-1]
    
    for cellline in experiment_data.columns:
        pubchem_id = drug2pubchemid[drug_id]
        if str(pubchem_id) in drug_pubchem_id_set and cellline in mutation_feature.index:
            ln_ic50 = experiment_data.loc[drug, cellline]
            if not np.isnan(ln_ic50) and cellline in cellline2cancertype.keys():
                ln_ic50 = float(ln_ic50)

                if use_treshold and pubchem_id in drug2thred.keys():
                    label = 1 if ln_ic50 < drug2thred[pubchem_id] else 0

                else:
                    label = 1 if ln_ic50 < -2 else 0

                # cellline, pubchemid of drug, label, ic50, cancer code
                data_idx.append((cellline, pubchem_id, label, ln_ic50, cellline2cancertype[cellline]))

nb_celllines = len(set([item[0] for item in data_idx]))
nb_drugs = len(set([item[1] for item in data_idx]))
print('%d instances across %d cell lines and %d drugs were generated.'%(len(data_idx),nb_celllines,nb_drugs))



107446 instances across 561 cell lines and 223 drugs were generated.


In [7]:
mutation_feature.head()

Unnamed: 0,TNFRSF14.1:2488170,TNFRSF14.1:2489805,TNFRSF14.1:2489824,TNFRSF14.1:2489856,TNFRSF14.1:2489868,TNFRSF14.1:2491328,TNFRSF14.1:2491335,TNFRSF14.1:2491373,TNFRSF14.1:2491403,TNFRSF14.1:2492117,...,MTCP1.23:154293902,MTCP1.23:154293909,MTCP1.23:154293951,MTCP1.23:154293978,MTCP1.23:154294034,MTCP1.23:154294203,MTCP1.23:154294207,MTCP1.23:154294278,MTCP1.23:154294279,MTCP1.23:154298967
ACH-000828,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACH-000568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACH-000560,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACH-000561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACH-000562,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
set([c.split(".")[-1].split(":")[0] for c in mutation_feature.columns])

{'1',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '2',
 '20',
 '21',
 '22',
 '23',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9'}

In [9]:
drug_feature.keys()

dict_keys(['10113978', '560326', '24785538', '46907787', '5291', '387447', '42642645', '56965967', '24894414', '3385', '644215', '11844351', '46885626', '4993', '11373846', '17755052', '5289247', '3218', '9938202', '10077147', '9863776', '9943465', '5494449', '10172943', '5384616', '44450571', '3796', '9956119', '11713159', '16038120', '11707110', '46883536', '176158', '11433190', '10384072', '31703', '24825971', '44632017', '11625818', '216239', '300471', '9868037', '176167', '78243717', '11327430', '54685215', '20635522', '3062316', '11634725', '11754511', '10109823', '5460769', '6914657', '5278396', '9967941', '9956222', '24180719', '462382', '9907093', '11485656', '25022668', '6918454', '44462760', '3463933', '6852167', '66577006', '46844147', '84691', '9903786', '85668777', '208908', '444795', '5394', '11364421', '49836027', '53302361', '5311497', '9874913', '159324', '5328940', '49806720', '5746', '7251185', '24951314', '10096043', '9810884', '9914412', '10184653', '2375', '44637

In [10]:
gene_expression_feature.head()

Unnamed: 0,LASP1,HOXA11,CREBBP,ETV1,GAS7,CD79B,PAX7,BTK,BRCA1,WAS,...,NCKIPSD,MTCP1,DDX3X,FANCG,SSX2,ETV5,CEBPA,LSM14A,CUX1,C15orf65
ACH-000828,9.393476,0.042644,3.93546,0.871844,0.070389,0.084064,0.0,0.056584,3.339137,0.15056,...,4.071248,3.119356,6.849374,4.355439,0.0,0.137504,1.769772,6.501598,4.700994,2.295723
ACH-000568,7.638074,0.056584,3.427606,0.201634,1.794936,0.739848,0.042644,0.333424,3.193772,0.815575,...,4.084064,4.634593,5.671576,5.525443,0.056584,2.195348,0.124328,5.811214,3.590961,1.550901
ACH-000560,5.728193,6.001352,5.032542,5.018812,0.432959,0.250962,0.0,0.263034,4.678635,1.292782,...,5.05745,3.468583,6.617798,6.425761,0.0,5.203201,1.922198,7.581351,5.320124,1.438293
ACH-000561,6.037163,1.565597,4.262283,0.790772,1.257011,0.028569,0.056584,0.042644,3.44228,0.286881,...,3.400538,3.407353,6.154211,4.794936,0.0,3.984589,1.028569,6.533719,5.132166,2.144046
ACH-000562,7.050502,0.014355,3.360364,0.879706,0.084064,0.137504,0.0,0.042644,4.939227,0.286881,...,4.125982,4.047015,6.281884,5.853497,0.056584,3.757023,0.056584,5.912171,4.877744,0.815575


In [11]:
methylation_feature.head()

Unnamed: 0,SKI_1_2159133_2160133,TNFRSF14_1_2486803_2487803,PRDM16_1_2984741_2985741,RPL22_1_6259679_6260679,CAMTA1_1_6844383_6845383,MTOR_1_11322608_11323608,PRDM2_1_14025734_14026734,PRDM2_1_14074875_14075875,CASP9_1_15850940_15851940,CASP9_1_15851285_15852285,...,BCORL1_X_129138163_129139163,ELF4_X_129244475_129245475,ELF4_X_129244688_129245688,GPC3_X_133119673_133120673,ATP2B3_X_152800579_152801579,FLNA_X_153603006_153604006,RPL10_X_153625405_153626405,RPL10_X_153625712_153626712,RPL10_X_153625837_153626837,MTCP1_X_154299547_154300547
ACH-000698,0.0,0.00739,0.04799,0.14687,0.00553,0.54138,0.0716,0.0004,0.01836,0.03461,...,1.0,0.0,0.0,0.02007,0.01706,0.16926,0.00145,0.00242,0.00242,0.01924
ACH-000489,0.0,0.00363,0.64891,1.0,0.00538,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.32058,0.72527,0.37572,0.0,0.0,0.0,0.0
ACH-000522,0.00143,0.47465,0.27715,0.19441,0.00858,0.40555,0.21625,0.01367,0.00338,0.00338,...,1.0,0.18059,0.22562,0.054325,0.6083,0.65268,0.82345,0.59039,0.59039,0.07546
ACH-000613,0.0,0.0,0.05112,0.10686,0.00268,0.28618,0.07706,0.0,0.0,0.0,...,0.01066,0.0,0.0,0.01066,0.53086,0.00918,0.0,0.00448,0.00448,0.0
ACH-000614,0.00118,0.00471,0.15165,0.07407,0.0105,0.51629,0.03292,0.00405,0.0,0.0,...,1.0,0.07273,0.07273,0.10699,0.69472,0.28947,0.05354,0.04306,0.04306,0.00114


In [12]:
data_idx

[('ACH-000070', '176870', 0, 0.693305, 'ALL'),
 ('ACH-000137', '176870', 0, 2.580268, 'GBM'),
 ('ACH-000008', '176870', 0, 2.557837, 'SKCM'),
 ('ACH-000740', '176870', 0, 0.290013, 'HNSC'),
 ('ACH-000697', '176870', 0, 1.11025, 'DLBC'),
 ('ACH-000157', '176870', 0, 2.772212, 'DLBC'),
 ('ACH-000105', '176870', 0, 1.738326, 'ALL'),
 ('ACH-000269', '176870', 0, 3.510624, 'GBM'),
 ('ACH-000838', '176870', 0, 2.784917, 'MM'),
 ('ACH-000245', '176870', 0, 1.163577, 'DLBC'),
 ('ACH-000927', '176870', 0, 3.106225, 'BRCA'),
 ('ACH-000432', '176870', 0, 1.435513, 'LCML'),
 ('ACH-000009', '176870', 0, 1.729286, 'COAD/READ'),
 ('ACH-000440', '176870', 0, 3.076768, 'DLBC'),
 ('ACH-000264', '176870', 0, 3.109938, 'LUAD'),
 ('ACH-000464', '176870', 0, 3.586981, 'GBM'),
 ('ACH-000136', '176870', 0, 3.070973, 'NB'),
 ('ACH-000249', '176870', 0, 3.886257, 'COAD/READ'),
 ('ACH-000641', '176870', 0, 2.719367, 'LAML'),
 ('ACH-000803', '176870', 0, 1.804919, 'SCLC'),
 ('ACH-000864', '176870', 0, 3.64169, 'U

In [None]:
def normalize_adj(adj):
    adj = adj + np.eye(adj.shape[0])
    degree = sp.diags(np.power(np.array(adj.sum(1)), -0.5).flatten(), 0).toarray()
    adj_norm = adj.dot(degree).transpose().dot(degree)

    return adj_norm


def calculate_graph_feat(feat_mat, adj_list):
    assert feat_mat.shape[0] == len(adj_list)
    feat = np.zeros((max_atoms, feat_mat.shape[-1]), dtype="float32")
    adj_mat = np.zeros((max_atoms, max_atoms), dtype="float32")
    feat[:feat_mat.shape[0], :] = feat_mat

    for idx in range(len(adj_list)):
        nodes = adj_list[idx]

        for neighbor in nodes:
            adj_mat[idx, int(neighbor)] = 1

    assert np.allclose(adj_mat, adj_mat.T)
    adj_main = adj_mat[:len(adj_list), :len(adj_list)]
    adj_complement = adj_mat[len(adj_list):, len(adj_list):]

    norm_adj_main = normalize_adj(adj_main)
    norm_adj_complement = normalize_adj(adj_complement)

    adj_mat[:len(adj_list), :len(adj_list)] = norm_adj_main
    adj_mat[len(adj_list):, len(adj_list):] = norm_adj_complement
    graph_features = {"feature": feat, "adj_mat": adj_mat}
    
    return graph_features


def feature_extract(
    data_idx,
    drug_feature,
    mutation_feature,
    gene_expression_feature,
    methylation_feature
    ):

    nb_instance = len(data_idx)
    nb_mutation_feature = mutation_feature.shape[1]
    nb_gene_expression_feature = gene_expression_feature.shape[1]
    nb_methylation_feature = methylation_feature.shape[1]
    
    instance_ids = []
    drug_data = [[] for _ in range(nb_instance)]
    mutation_data = np.zeros((nb_instance, 1, nb_mutation_feature), dtype="float32")
    gene_expression_data = np.zeros((nb_instance, nb_gene_expression_feature), dtype="float32")
    methylation_data = np.zeros((nb_instance, nb_methylation_feature), dtype="float32")

    discrete_targets = np.zeros(nb_instance, dtype="int16")
    continuous_targets = np.zeros(nb_instance, dtype="float32")

    for idx in tqdm(range(nb_instance), desc="Extracting Features"):
        cell_line_id, pubchem_id, label, ic50, cancer_type = data_idx[idx]
        # instance_ids.append(f"cell_line_{cell_line_id}-cancer_type_{cancer_type}-pubchem_{pubchem_id}-ic50_{ic50:.4f}-label_{label}")
        instance_ids.append(f"cell_line_{cell_line_id}-cancer_type_{cancer_type.replace('/', '-')}-pubchem_{pubchem_id}")

        feat_mat, adj_list, _ = drug_feature[str(pubchem_id)]
        
        drug_data[idx] = calculate_graph_feat(feat_mat, adj_list)
        mutation_data[idx, 0, :] = mutation_feature.loc[cell_line_id].values
        gene_expression_data[idx, :] = gene_expression_feature.loc[cell_line_id].values
        methylation_data[idx, :] = methylation_feature.loc[cell_line_id].values

        discrete_targets[idx] = label
        continuous_targets[idx] = ic50

    return instance_ids, drug_data, mutation_data, gene_expression_data, methylation_data, discrete_targets, continuous_targets

In [14]:
instance_ids, drug_data, mutation_data, gene_expression_data, methylation_data, discrete_targets, continuous_targets = feature_extract(
    data_idx,
    drug_feature,
    mutation_feature,
    gene_expression_feature,
    methylation_feature
)

Extracting Features: 100%|██████████| 107446/107446 [01:04<00:00, 1661.11it/s]


In [23]:
test = drug_data[0]["adj_mat"]

test

array([[0.5, 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0.5, 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0.5, ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 1. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 1. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 1. ]],
      shape=(100, 100), dtype=float32)

In [15]:
instance_ids

['cell_line_ACH-000070-cancer_type_ALL-pubchem_176870',
 'cell_line_ACH-000137-cancer_type_GBM-pubchem_176870',
 'cell_line_ACH-000008-cancer_type_SKCM-pubchem_176870',
 'cell_line_ACH-000740-cancer_type_HNSC-pubchem_176870',
 'cell_line_ACH-000697-cancer_type_DLBC-pubchem_176870',
 'cell_line_ACH-000157-cancer_type_DLBC-pubchem_176870',
 'cell_line_ACH-000105-cancer_type_ALL-pubchem_176870',
 'cell_line_ACH-000269-cancer_type_GBM-pubchem_176870',
 'cell_line_ACH-000838-cancer_type_MM-pubchem_176870',
 'cell_line_ACH-000245-cancer_type_DLBC-pubchem_176870',
 'cell_line_ACH-000927-cancer_type_BRCA-pubchem_176870',
 'cell_line_ACH-000432-cancer_type_LCML-pubchem_176870',
 'cell_line_ACH-000009-cancer_type_COAD-READ-pubchem_176870',
 'cell_line_ACH-000440-cancer_type_DLBC-pubchem_176870',
 'cell_line_ACH-000264-cancer_type_LUAD-pubchem_176870',
 'cell_line_ACH-000464-cancer_type_GBM-pubchem_176870',
 'cell_line_ACH-000136-cancer_type_NB-pubchem_176870',
 'cell_line_ACH-000249-cancer_type_

In [16]:
idx = 0
instance_ids[idx]

'cell_line_ACH-000070-cancer_type_ALL-pubchem_176870'

In [17]:
drug_data[idx]

{'feature': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], shape=(100, 75), dtype=float32),
 'adj_mat': array([[0.5, 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0.5, 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0.5, ..., 0. , 0. , 0. ],
        ...,
        [0. , 0. , 0. , ..., 1. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 1. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 1. ]],
       shape=(100, 100), dtype=float32)}

In [18]:
type(drug_data[idx]["adj_mat"])

numpy.ndarray

In [19]:
type(mutation_data[idx])

numpy.ndarray

In [20]:
type(gene_expression_data[idx])

numpy.ndarray

In [21]:
type(methylation_data[idx])

numpy.ndarray

In [22]:
discrete_targets[idx]

np.int16(0)

In [23]:
continuous_targets[idx]

np.float32(0.693305)

In [24]:
discrete_target_dict = {identifier: target for identifier, target in zip(instance_ids, discrete_targets.tolist())}
continuous_target_dict = {identifier: target for identifier, target in zip(instance_ids, continuous_targets.tolist())}

In [25]:
discrete_target_dict

{'cell_line_ACH-000070-cancer_type_ALL-pubchem_176870': 0,
 'cell_line_ACH-000137-cancer_type_GBM-pubchem_176870': 0,
 'cell_line_ACH-000008-cancer_type_SKCM-pubchem_176870': 0,
 'cell_line_ACH-000740-cancer_type_HNSC-pubchem_176870': 1,
 'cell_line_ACH-000697-cancer_type_DLBC-pubchem_176870': 0,
 'cell_line_ACH-000157-cancer_type_DLBC-pubchem_176870': 0,
 'cell_line_ACH-000105-cancer_type_ALL-pubchem_176870': 0,
 'cell_line_ACH-000269-cancer_type_GBM-pubchem_176870': 0,
 'cell_line_ACH-000838-cancer_type_MM-pubchem_176870': 0,
 'cell_line_ACH-000245-cancer_type_DLBC-pubchem_176870': 0,
 'cell_line_ACH-000927-cancer_type_BRCA-pubchem_176870': 0,
 'cell_line_ACH-000432-cancer_type_LCML-pubchem_176870': 0,
 'cell_line_ACH-000009-cancer_type_COAD-READ-pubchem_176870': 0,
 'cell_line_ACH-000440-cancer_type_DLBC-pubchem_176870': 0,
 'cell_line_ACH-000264-cancer_type_LUAD-pubchem_176870': 0,
 'cell_line_ACH-000464-cancer_type_GBM-pubchem_176870': 0,
 'cell_line_ACH-000136-cancer_type_NB-pubc

In [26]:
continuous_target_dict

{'cell_line_ACH-000070-cancer_type_ALL-pubchem_176870': 0.6933050155639648,
 'cell_line_ACH-000137-cancer_type_GBM-pubchem_176870': 2.580267906188965,
 'cell_line_ACH-000008-cancer_type_SKCM-pubchem_176870': 2.5578370094299316,
 'cell_line_ACH-000740-cancer_type_HNSC-pubchem_176870': 0.29001298546791077,
 'cell_line_ACH-000697-cancer_type_DLBC-pubchem_176870': 1.1102499961853027,
 'cell_line_ACH-000157-cancer_type_DLBC-pubchem_176870': 2.772212028503418,
 'cell_line_ACH-000105-cancer_type_ALL-pubchem_176870': 1.7383259534835815,
 'cell_line_ACH-000269-cancer_type_GBM-pubchem_176870': 3.5106239318847656,
 'cell_line_ACH-000838-cancer_type_MM-pubchem_176870': 2.784917116165161,
 'cell_line_ACH-000245-cancer_type_DLBC-pubchem_176870': 1.1635769605636597,
 'cell_line_ACH-000927-cancer_type_BRCA-pubchem_176870': 3.10622501373291,
 'cell_line_ACH-000432-cancer_type_LCML-pubchem_176870': 1.4355130195617676,
 'cell_line_ACH-000009-cancer_type_COAD-READ-pubchem_176870': 1.7292859554290771,
 'ce

In [27]:
dest_dir = os.path.join("..", "data", "cleaned")

label_dir = os.path.join(dest_dir, "targets")
os.makedirs(label_dir, exist_ok=True)

with open(os.path.join(label_dir, "continuous-labels.json"), "w") as f:
    json.dump(continuous_target_dict, f, indent=4)

In [28]:
dest_dir = os.path.join("..", "data", "cleaned")

label_dir = os.path.join(dest_dir, "targets")
os.makedirs(label_dir, exist_ok=True)

with open(os.path.join(label_dir, "discrete-labels.json"), "w") as f:
    json.dump(discrete_target_dict, f, indent=4)

In [29]:
dest_dir = os.path.join("..", "data", "cleaned", "instances")
os.makedirs(dest_dir, exist_ok=True)

for idx in tqdm(range(len(instance_ids)), "Saving Features"):
    instance_id = instance_ids[idx]
    instance_dir = os.path.join(dest_dir, instance_id)
    os.makedirs(instance_dir, exist_ok=True)

    drug_feature = drug_data[idx]["feature"]
    drug_adj_mat = drug_data[idx]["adj_mat"]

    mutation_feature = mutation_data[idx]
    gene_expression_feature = gene_expression_data[idx]
    methylation_feature = methylation_data[idx]

    np.save(os.path.join(instance_dir, "drug-feature.npy"), drug_feature)
    np.save(os.path.join(instance_dir, "drug-adjacency-matrix.npy"), drug_adj_mat)
    np.save(os.path.join(instance_dir, "mutation-feature.npy"), mutation_feature)
    np.save(os.path.join(instance_dir, "gene-expression.npy"), gene_expression_feature)
    np.save(os.path.join(instance_dir, "methylation-feature.npy"), methylation_feature)

Saving Features: 100%|██████████| 107446/107446 [01:43<00:00, 1036.75it/s]


In [30]:
os.listdir(dest_dir)

['cell_line_ACH-000129-cancer_type_SCLC-pubchem_9938202',
 'cell_line_ACH-000956-cancer_type_PRAD-pubchem_10384072',
 'cell_line_ACH-000176-cancer_type_LUSC-pubchem_5330286',
 'cell_line_ACH-000416-cancer_type_LUAD-pubchem_216326',
 'cell_line_ACH-000295-cancer_type_LCML-pubchem_6753378',
 'cell_line_ACH-000838-cancer_type_MM-pubchem_53302361',
 'cell_line_ACH-000664-cancer_type_NA-pubchem_10390396',
 'cell_line_ACH-000364-cancer_type_SARC-pubchem_53302361',
 'cell_line_ACH-000617-cancer_type_OV-pubchem_49806720',
 'cell_line_ACH-000267-cancer_type_NA-pubchem_5327091',
 'cell_line_ACH-000273-cancer_type_LGG-pubchem_10341154',
 'cell_line_ACH-000242-cancer_type_BLCA-pubchem_3218',
 'cell_line_ACH-000191-cancer_type_THCA-pubchem_2733526',
 'cell_line_ACH-000766-cancer_type_LUAD-pubchem_10461815',
 'cell_line_ACH-000835-cancer_type_SARC-pubchem_16720766',
 'cell_line_ACH-000648-cancer_type_MESO-pubchem_10077147',
 'cell_line_ACH-000242-cancer_type_BLCA-pubchem_9910224',
 'cell_line_ACH-00

In [31]:
all([i in continuous_target_dict.keys() for i in os.listdir(dest_dir)])

True

In [32]:
[i for i in os.listdir(dest_dir) if i not in continuous_target_dict.keys()]

[]

In [33]:
"COAD/READ".replace("/", "-")

'COAD-READ'

In [34]:
drug_adj_mat

array([[0.5, 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0.5, 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0.5, ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 1. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 1. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 1. ]],
      shape=(100, 100), dtype=float32)

In [35]:
adj_list

[[np.int64(17)],
 [np.int64(19)],
 [np.int64(22)],
 [np.int64(23)],
 [np.int64(25)],
 [np.int64(15)],
 [np.int64(17), np.int64(7)],
 [np.int64(6), np.int64(18)],
 [np.int64(20), np.int64(21)],
 [np.int64(21), np.int64(10)],
 [np.int64(9), np.int64(22)],
 [np.int64(22), np.int64(23)],
 [np.int64(25), np.int64(13)],
 [np.int64(12), np.int64(14)],
 [np.int64(13), np.int64(15)],
 [np.int64(14), np.int64(5)],
 [np.int64(24), np.int64(26)],
 [np.int64(0), np.int64(6), np.int64(26)],
 [np.int64(7), np.int64(19), np.int64(26)],
 [np.int64(18), np.int64(1), np.int64(20)],
 [np.int64(19), np.int64(8), np.int64(24)],
 [np.int64(8), np.int64(9), np.int64(23)],
 [np.int64(10), np.int64(2), np.int64(11)],
 [np.int64(11), np.int64(3), np.int64(21)],
 [np.int64(20), np.int64(25), np.int64(16)],
 [np.int64(24), np.int64(4), np.int64(12)],
 [np.int64(16), np.int64(17), np.int64(18)]]

In [25]:
adj_mat = np.zeros((max_atoms, max_atoms), dtype="float32")

for idx in range(len(adj_list)):
    nodes = adj_list[idx]

    for neighbor in nodes:
        adj_mat[idx, int(neighbor)] = 1

In [36]:
dense_to_sparse(torch.tensor(adj_mat))[0]

tensor([[ 0,  1,  2,  3,  4,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11,
         12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19,
         19, 20, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25,
         25, 26, 26, 26],
        [17, 19, 22, 23, 25, 15,  7, 17,  6, 18, 20, 21, 10, 21,  9, 22, 22, 23,
         13, 25, 12, 14, 13, 15,  5, 14, 24, 26,  0,  6, 26,  7, 19, 26,  1, 18,
         20,  8, 19, 24,  8,  9, 23,  2, 10, 11,  3, 11, 21, 16, 20, 25,  4, 12,
         24, 16, 17, 18]])

In [24]:
adj_list

[[np.int64(17)],
 [np.int64(19)],
 [np.int64(22)],
 [np.int64(23)],
 [np.int64(25)],
 [np.int64(15)],
 [np.int64(17), np.int64(7)],
 [np.int64(6), np.int64(18)],
 [np.int64(20), np.int64(21)],
 [np.int64(21), np.int64(10)],
 [np.int64(9), np.int64(22)],
 [np.int64(22), np.int64(23)],
 [np.int64(25), np.int64(13)],
 [np.int64(12), np.int64(14)],
 [np.int64(13), np.int64(15)],
 [np.int64(14), np.int64(5)],
 [np.int64(24), np.int64(26)],
 [np.int64(0), np.int64(6), np.int64(26)],
 [np.int64(7), np.int64(19), np.int64(26)],
 [np.int64(18), np.int64(1), np.int64(20)],
 [np.int64(19), np.int64(8), np.int64(24)],
 [np.int64(8), np.int64(9), np.int64(23)],
 [np.int64(10), np.int64(2), np.int64(11)],
 [np.int64(11), np.int64(3), np.int64(21)],
 [np.int64(20), np.int64(25), np.int64(16)],
 [np.int64(24), np.int64(4), np.int64(12)],
 [np.int64(16), np.int64(17), np.int64(18)]]

In [38]:
def calculate_graph_feat(feat_mat, adj_list):
    assert feat_mat.shape[0] == len(adj_list)
    feat = np.zeros((max_atoms, feat_mat.shape[-1]), dtype="float32")
    adj_mat = np.zeros((max_atoms, max_atoms), dtype="float32")

    for idx in range(len(adj_list)):
        nodes = adj_list[idx]

        for neighbor in nodes:
            adj_mat[idx, int(neighbor)] = 1

    assert np.allclose(adj_mat, adj_mat.T)
    adj_main = adj_mat[:len(adj_list), :len(adj_list)]
    adj_complement = adj_mat[len(adj_list):, len(adj_list):]

    print(adj_main.shape)
    print(adj_complement.shape)

    norm_adj_main = normalize_adj(adj_main)
    norm_adj_complement = normalize_adj(adj_complement)

    adj_mat[:len(adj_list), :len(adj_list)] = norm_adj_main
    adj_mat[len(adj_list):, len(adj_list):] = norm_adj_complement
    graph_features = {"feature": feat, "adj_mat": adj_mat}
    
    return graph_features

In [39]:
calculate_graph_feat(feat_mat, adj_list)

(27, 27)
(73, 73)


{'feature': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], shape=(100, 75), dtype=float32),
 'adj_mat': array([[0.5, 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0.5, 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0.5, ..., 0. , 0. , 0. ],
        ...,
        [0. , 0. , 0. , ..., 1. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 1. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 1. ]],
       shape=(100, 100), dtype=float32)}

In [40]:
feat_mat

array([[1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], shape=(27, 75))