In [19]:
# List the contents of the current working directory (IPython automagic for the shell `ls`).
ls

README.MD
[1m[36m__pycache__[m[m/
create_data.ipynb
[1m[36mdata[m[m/
geometric library-20240618T035250Z-001.zip
[1m[36mgeometric_library[m[m/
install.py
[31minstall.sh[m[m*
mut_dict
requirements.txt
split_process_data.ipynb
utils.py


In [20]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole

In [21]:
import pickle
def read_pkl(path):
  """Load and return the object pickled in the file at ``path``.

  NOTE: unpickling can execute arbitrary code; only use on trusted files.
  """
  with open(path, "rb") as source:
    loaded = pickle.load(source)
  return loaded
def save_pkl(path, obj):
  """Pickle ``obj`` into the file at ``path`` (overwriting it).

  Args:
    path: destination file path, opened in binary write mode.
    obj: any picklable object.

  Returns:
    None (the return value of ``pickle.dump``).
  """
  with open(path, "wb") as f:
    # pickle.dump requires the target file object as its second argument;
    # without it the call raises TypeError and nothing is written.
    return pickle.dump(obj, f)

In [22]:
import pandas as pd

In [23]:
import os
import csv
from pubchempy import *
import numpy as np
import numbers
import h5py
import math
import pandas as pd
import json,pickle
from collections import OrderedDict
import rdkit
from rdkit import Chem
from rdkit.Chem import MolFromSmiles
import networkx as nx
from utils import *
import random
import pickle
import sys
import matplotlib.pyplot as plt
import argparse
import pickle
from tqdm.notebook import tqdm
import pandas as pd
from ipywidgets import IntProgress

In [24]:
def is_not_float(string_list):
    """Return True if any element of ``string_list`` cannot be converted to float.

    Args:
        string_list: iterable of values to pass to ``float()``.

    Returns:
        False when every element converts (including the empty iterable),
        True as soon as a conversion fails or ``string_list`` is not iterable.
    """
    try:
        for string in string_list:
            float(string)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a float"; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        return True
    return False

"""
The following functions preprocess the drug data. The drug list was downloaded manually, and the SMILES strings were fetched with pubchempy. Because that step is time-consuming, the CIDs and SMILES are cached in a CSV file.
"""
def load_drug_list(filename="data/gdsc/Druglist.csv"):
    """Return the unique values of the first column of the drug-list CSV.

    Args:
        filename: path of the CSV file; the first row is treated as a header
            and skipped.

    Returns:
        A list of the distinct first-column values, in arbitrary order
        (the values are de-duplicated through a set).
    """
    # csv.reader needs a text-mode file in Python 3 ("rb" yields bytes and
    # fails); newline="" is the mode recommended by the csv module.
    with open(filename, newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)
        # Blank lines come back as empty rows; skip them instead of failing on line[0].
        drugs = {line[0] for line in reader if line}
    return list(drugs)
def atom_features(atom):
    """Build the boolean feature vector for a single atom.

    The vector concatenates, in order: a one-hot of the element symbol
    (unknown symbols map to 'Unknown'), a one-hot of the degree (raises if
    outside 0..10), one-hots of the total hydrogen count and the implicit
    valence (out-of-range values map to 10), and the aromaticity flag.

    Args:
        atom: object exposing GetSymbol, GetDegree, GetTotalNumHs,
            GetImplicitValence and GetIsAromatic.

    Returns:
        A NumPy array built from the concatenated encodings.
    """
    element_symbols = [
        'C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na',
        'Ca', 'Fe', 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb',
        'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn', 'H',
        'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr',
        'Cr', 'Pt', 'Hg', 'Pb', 'Unknown',
    ]
    count_buckets = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

    encoded = one_of_k_encoding_unk(atom.GetSymbol(), element_symbols)
    encoded += one_of_k_encoding(atom.GetDegree(), count_buckets)
    encoded += one_of_k_encoding_unk(atom.GetTotalNumHs(), count_buckets)
    encoded += one_of_k_encoding_unk(atom.GetImplicitValence(), count_buckets)
    encoded.append(atom.GetIsAromatic())
    return np.array(encoded)

def one_of_k_encoding(x, allowable_set):
    """One-hot encode ``x`` against ``allowable_set``.

    Returns a list of booleans, one per entry of ``allowable_set``, that is
    True where the entry equals ``x``. Raises ``Exception`` when ``x`` is not
    a member of ``allowable_set``.
    """
    if x in allowable_set:
        return [x == option for option in allowable_set]
    raise Exception("input {0} not in allowable set{1}:".format(x, allowable_set))

def one_of_k_encoding_unk(x, allowable_set):
    """One-hot encode ``x``; values outside ``allowable_set`` map to its last entry."""
    target = x if x in allowable_set else allowable_set[-1]
    return [target == option for option in allowable_set]

def smile_to_graph(smile):
    """Convert a SMILES string into (atom count, node features, edge list).

    Args:
        smile: SMILES string of the molecule.

    Returns:
        A tuple ``(c_size, features, edge_index)`` where ``c_size`` is the
        number of atoms, ``features`` is a list with one normalised
        ``atom_features`` vector per atom (each divided by its own sum), and
        ``edge_index`` is a list of ``[source, target]`` atom-index pairs with
        every bond present in both directions.

    Raises:
        ValueError: if RDKit cannot parse ``smile``.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending string instead of an AttributeError further down.
        raise ValueError("could not parse SMILES string: {!r}".format(smile))

    c_size = mol.GetNumAtoms()

    features = []
    for atom in mol.GetAtoms():
        feature = atom_features(atom)
        features.append(feature / sum(feature))

    edges = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] for bond in mol.GetBonds()]
    # to_directed() duplicates every undirected bond as two directed edges.
    g = nx.Graph(edges).to_directed()
    edge_index = [[e1, e2] for e1, e2 in g.edges]

    return c_size, features, edge_index

def load_drug_smile(filename="data/smiles.csv"):
    """Load drug names and SMILES strings and build a graph per SMILES.

    The CSV's first row is skipped as a header; column 0 is the drug name and
    column 1 its SMILES string.

    Args:
        filename: path of the SMILES CSV file.

    Returns:
        A tuple ``(drug_dict, drug_smile, smile_graph)``:
        ``drug_dict`` maps drug name -> index into ``drug_smile``;
        ``drug_smile`` is the list of SMILES strings, one per distinct name;
        ``smile_graph`` maps each SMILES string to ``smile_to_graph(smile)``.
    """
    drug_dict = {}
    drug_smile = []

    with open(filename, newline="") as f:
        reader = csv.reader(f)
        next(reader, None)
        for item in reader:
            name = item[0]
            smile = item[1]
            # Append only on the first occurrence of a name so that
            # drug_smile[drug_dict[name]] stays aligned even if the CSV
            # repeats a drug (the first SMILES seen for a name wins).
            if name not in drug_dict:
                drug_dict[name] = len(drug_dict)
                drug_smile.append(smile)

    smile_graph = {}
    for smile in drug_smile:
        smile_graph[smile] = smile_to_graph(smile)

    return drug_dict, drug_smile, smile_graph
def save_cell_oge_matrix():
    """Map each row of the global ``ge`` table to a float feature vector.

    Reads the module-level DataFrame ``ge``: column 0 of each row is used as
    the dictionary key and the remaining columns are converted to a float
    NumPy array.

    Returns:
        dict mapping the first-column value of each row to its feature array.
    """
    row_count = ge.shape[0]
    features_by_cell = {}
    for row_idx in tqdm(range(row_count)):
        key = ge.iloc[row_idx, 0]
        features_by_cell[key] = np.asarray(ge.iloc[row_idx, 1:], dtype=float)
    return features_by_cell

def save_cell_meth_matrix(filename="data/gdsc/METH_CELLLINES_BEMs_PANCAN.csv"):
    """Read the methylation CSV into a ``{cell_id: feature array}`` dict.

    The first row is skipped as a header. In every other row, column 0 is
    parsed as the integer cell id and all remaining columns as integers.

    Args:
        filename: path of the methylation CSV file.

    Returns:
        dict mapping int cell id -> NumPy array of that row's int values.

    Raises:
        ValueError: if a cell id or feature value is not an integer literal.
    """
    cell_dict = {}
    # The context manager closes the file; the original left the handle open.
    with open(filename, newline="") as f:
        reader = csv.reader(f)
        next(reader, None)
        for item in reader:
            cell_id = int(item[0])
            cell_dict[cell_id] = np.asarray([int(value) for value in item[1:]])
    return cell_dict

"""
This part reads the PANCANCER genetic (mutation) cell-line features.
"""

def save_cell_mut_matrix(filename="data/gdsc/PANCANCER_Genetic_feature.csv",
                         mut_dict_path="mut_dict"):
    """Build the binary cell x genetic-feature matrix from the PANCANCER CSV.

    The first row is skipped as a header. For every other row, column 1 is
    the integer cell id, column 5 the feature name and column 6 a 0/1 flag.
    Cells and features are numbered in order of first appearance.

    Args:
        filename: path of the genetic-feature CSV file.
        mut_dict_path: where the ``{feature name: column}`` mapping is pickled.

    Returns:
        A tuple ``(cell_dict, cell_feature)``: ``cell_dict`` maps int cell id
        -> row index, and ``cell_feature`` is a float array of shape
        ``(n_cells, n_features)`` holding 1 where the flag was 1, else 0.

    Side effects:
        Writes the feature-name mapping to ``mut_dict_path`` with pickle.
    """
    cell_dict = {}
    mut_dict = {}
    matrix_list = []

    # The context manager closes the file; the original left the handle open.
    with open(filename, newline="") as f:
        reader = csv.reader(f)
        next(reader, None)
        for item in reader:
            cell_id = int(item[1])
            mut = item[5]
            is_mutated = int(item[6])

            # setdefault assigns the next free index on first sight and
            # returns the existing index afterwards.
            col = mut_dict.setdefault(mut, len(mut_dict))
            row = cell_dict.setdefault(cell_id, len(cell_dict))

            if is_mutated == 1:
                matrix_list.append((row, col))

    cell_feature = np.zeros((len(cell_dict), len(mut_dict)))
    for row, col in matrix_list:
        cell_feature[row, col] = 1

    with open(mut_dict_path, 'wb') as fp:
        pickle.dump(mut_dict, fp)

    return cell_dict, cell_feature

"""
This part builds the mutation and methylation cell-line features and the drug SMILES graphs.
"""


# Build the module-level lookup tables that create_data() reads as globals.
# These calls run when the cell executes and read CSV files under data/;
# save_cell_mut_matrix() also writes the 'mut_dict' pickle.
cell_dict_mut, cell_feature_mut = save_cell_mut_matrix()
cell_dict_meth = save_cell_meth_matrix()
drug_dict, drug_smile, smile_graph = load_drug_smile()

In [25]:
# Per-drug feature table, indexed by the "Drug" column so that
# create_data() can look rows up with drug_pt_df.loc[<drug name>].
drug_pt_df = pd.read_csv("data/drug_physic_toxic_df.csv")
drug_pt_df = drug_pt_df.set_index("Drug")

In [26]:
# Sanity check: show the feature row for one drug.
drug_pt_df.loc["5-FU"]

0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.000000
          ...   
3073    0.000000
3074    0.000000
3075    0.007499
3076    0.086820
3077    0.000000
Name: 5-FU, Length: 3078, dtype: float64

In [27]:
import os

In [28]:
# Inspect what is currently present under the split-data directory.
os.listdir("data/split_data/")

[]

In [29]:
# Base names (without the ".csv" extension) of the split files that
# create_data() converts; the extension is appended when the file is opened.
data_names = [
 'mix_test',
]

In [30]:
def subype_to_array(subtype):
    """Return the one-hot float vector for a tissue subtype name.

    Raises ``ValueError`` when ``subtype`` is not one of the known names.
    """
    known_subtypes = [
        'aero_digestive_tract', 'blood', 'bone', 'breast',
        'digestive_system', 'kidney', 'lung', 'nervous_system', 'pancreas',
        'skin', 'soft_tissue', 'thyroid', 'urogenital_system',
    ]
    position = known_subtypes.index(subtype)
    one_hot = np.zeros(len(known_subtypes))
    one_hot[position] = 1
    return one_hot

In [31]:
def create_data(data_name, root):
    """Assemble one split file into a ``TestbedDataset``.

    Reads ``data_split_path + data_name + ".csv"`` (header row skipped), where
    column 0 and column 1 are the two drug names, column 3 the integer cell id
    and column 4 the float Loewe score. Rows are kept only when both drugs
    are in ``drug_dict`` and the cell is in ``cell_dict_ge``.

    Relies on these module-level globals: ``data_split_path``, ``drug_dict``,
    ``drug_smile``, ``smile_graph``, ``drug_pt_df``, ``cell_dict_ge``,
    ``cell_dict_mut``, ``cell_feature_mut`` and ``cell_dict_meth``.

    Args:
        data_name: base name of the split CSV (without ".csv").
        root: passed through as ``root`` to ``TestbedDataset``.

    Returns:
        None.

    Side effects:
        Pickles ``drug_dict``, the kept drug pairs and the kept cell ids into
        ``data_split_path + 'temp/'`` (created if missing), prints the number
        of kept rows, and constructs a ``TestbedDataset``.
    """
    global_path = data_split_path

    temp_data = []
    # The context manager closes the split file; the original leaked the handle.
    with open(global_path + data_name + ".csv", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)
        for item in reader:
            drug_1 = str(item[0])
            drug_2 = str(item[1])
            cell = int(item[3])
            loewe = float(item[4])
            temp_data.append((drug_1, drug_2, cell, loewe))

    xd_1 = []
    xd_pt_1 = []
    xd_2 = []
    xd_pt_2 = []
    xc_mut = []
    xc_meth = []
    xc_ge = []
    y = []
    lst_drug = []
    lst_cell = []
    for drug_1, drug_2, cell, loewe in temp_data:
        # NOTE(review): only cell_dict_ge is checked here; a cell missing from
        # cell_dict_mut / cell_dict_meth, or a drug missing from drug_pt_df,
        # still raises KeyError below -- confirm that is intended.
        if drug_1 in drug_dict and drug_2 in drug_dict and cell in cell_dict_ge:
            xc_mut.append(cell_feature_mut[cell_dict_mut[cell]])
            xc_meth.append(cell_dict_meth[cell])
            xc_ge.append(cell_dict_ge[cell])
            xd_1.append(drug_smile[drug_dict[drug_1]])
            xd_pt_1.append(drug_pt_df.loc[drug_1])
            xd_2.append(drug_smile[drug_dict[drug_2]])
            xd_pt_2.append(drug_pt_df.loc[drug_2])
            y.append(loewe)
            lst_drug.append((drug_1, drug_2))
            lst_cell.append(cell)

    # The pickles below go into a 'temp/' sub-folder that nothing else
    # creates; make sure it exists before opening files inside it.
    temp_dir = data_split_path + 'temp/'
    os.makedirs(temp_dir, exist_ok=True)

    with open(temp_dir + 'drug_dict_' + data_name, 'wb') as fp:
        pickle.dump(drug_dict, fp)

    print(len(lst_drug))
    print(len(lst_cell))

    xd_1, xd_pt_1, xd_2, xd_pt_2 = (
        np.asarray(xd_1), np.asarray(xd_pt_1), np.asarray(xd_2), np.asarray(xd_pt_2)
    )
    xc_mut, xc_meth, xc_ge, y = (
        np.asarray(xc_mut), np.asarray(xc_meth), np.asarray(xc_ge), np.asarray(y)
    )

    with open(temp_dir + 'list_drug_' + data_name, 'wb') as fp:
        pickle.dump(lst_drug, fp)

    with open(temp_dir + 'list_cell_' + data_name, 'wb') as fp:
        pickle.dump(lst_cell, fp)

    dataset = 'GDSC'
    # Name the split actually being built; the message used to say "_train"
    # for every split.
    print('preparing ', dataset + '_' + data_name + '.pt in pytorch format!')
    TestbedDataset(root=root, dataset=dataset+"_"+data_name, xd_1=xd_1, xd_pt_1=xd_pt_1, xd_2=xd_2, xd_pt_2=xd_pt_2, xt_mut=xc_mut, xt_meth=xc_meth, xt_ge=xc_ge, y=y, smile_graph=smile_graph)

# Initial value of the progress counter; the driver loop resets it to 1 for every data set.
count = 1


In [1]:
# Driver: for every data-set folder, load its gene-expression table and
# convert each split listed in data_names.
# NOTE: data_split_path, ge and cell_dict_ge are module-level globals that
# create_data() and save_cell_oge_matrix() read implicitly, so they must be
# (re)assigned here before those functions are called.
data_split_path_temp = "data/split_data/"
root_temp = "data/split_data/"
data_sets = ["all_test/"]
for data_set in data_sets:
    print(f"_________________________{data_set}_____________________________")
    count = 1
    data_split_path = data_split_path_temp+data_set
    root = root_temp + data_set
    ge = pd.read_csv(data_split_path+"ge_process.csv")
    # The unnamed first CSV column is given an explicit name.
    ge = ge.rename(columns={"Unnamed: 0":"Cosmic sample Id"})
    cell_dict_ge = save_cell_oge_matrix()
    for data_name in tqdm(data_names):
      print(count, ":", data_name)
      create_data(data_name, root)
      count += 1