In [149]:
import copy
import tdc
from model_classes import *

import torch
import numpy as np
import pandas as pd
import time
import pickle
import argparse
import rdkit
from rdkit import Chem
from rdkit.Chem import MACCSkeys

import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
from torch.utils.data import DataLoader


In [150]:
# Load Data
# NOTE: constructing tdc.BenchmarkGroup prints a deprecation notice (visible in this
# cell's output) that points to the tdc.benchmark_group.* classes as the replacement.
group = tdc.BenchmarkGroup('drugcombo_group', path='data/',
                               file_format='pkl')
name = 'drugcomb_css'
# Fetch the benchmark a single time and take both splits from that one result,
# rather than calling group.get() once per split.
benchmark = group.get(name)
train_val = benchmark['train_val']
test = benchmark['test']
# Sorted array of every distinct SMILES string that occurs in either drug column
# of either split.
All_Drugs = np.unique(np.concatenate([
    split[column].unique()
    for split in (train_val, test)
    for column in ('Drug1', 'Drug2')
]))


tdc.BenchmarkGroup will be deprecated soon. Please use tdc.benchmark_group.XXX_group and check out the examples on website!
Found local copy...


In [151]:
# train_val.to_csv('/Users/derrick/Desktop/CSC413/train_val.csv')
# test.to_csv('/Users/derrick/Desktop/CSC413/test.csv')

In [152]:
def one_hot_encoding(x, l):
    """Return a list of booleans, one per entry of ``l``, marking where ``x`` occurs.

    A value that does not appear in ``l`` is replaced by the last entry of ``l``
    before comparison, so the final slot doubles as the catch-all category.
    """
    target = x if x in l else l[-1]
    return [target == candidate for candidate in l]
# Vocabularies used for the per-atom one-hot features.
# one_hot_encoding maps any value missing from a vocabulary onto its LAST entry,
# so 'Other' (and 10 in the numeric lists below) acts as the overflow bucket.
element_list = [
    'C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na',
    'Ca', 'Fe', 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb',
    'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn', 'H',
    'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr',
    'Cr', 'Pt', 'Hg', 'Pb', 'Other',
]
# Degree, hydrogen-count and implicit-valence buckets each cover 0 through 10.
degree_list = list(range(11))
Number_of_H_list = list(range(11))
ImplicitValence_list = list(range(11))

def smile_to_graph(smile):
    """Convert a SMILES string into per-atom feature rows and a bond list.

    Parameters
    ----------
    smile : str
        SMILES representation of the molecule.

    Returns
    -------
    atom_features : numpy.ndarray
        Integer 0/1 array with one row per atom. Each row concatenates the
        one-hot encodings of element symbol, degree, total hydrogen count and
        implicit valence, in that order.
    edges : numpy.ndarray
        ``int64`` array of shape ``(num_bonds, 2)``; each row holds the begin
        and end atom index of one bond. Every bond is listed once (no reverse
        duplicate). A molecule without bonds yields shape ``(0, 2)``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile``.
    """
    molecule = Chem.MolFromSmiles(smile)
    # MolFromSmiles signals a parse failure by returning None; fail loudly here
    # instead of with an AttributeError on the first method call below.
    if molecule is None:
        raise ValueError("RDKit could not parse SMILES string: {!r}".format(smile))

    feature_rows = [
        one_hot_encoding(atom.GetSymbol(), element_list)
        + one_hot_encoding(atom.GetDegree(), degree_list)
        + one_hot_encoding(atom.GetTotalNumHs(), Number_of_H_list)
        + one_hot_encoding(atom.GetImplicitValence(), ImplicitValence_list)
        for atom in molecule.GetAtoms()
    ]
    feature_width = (len(element_list) + len(degree_list)
                     + len(Number_of_H_list) + len(ImplicitValence_list))
    # reshape is a no-op for non-empty input and keeps the array 2-D when the
    # molecule has no atoms.
    atom_features = np.array(feature_rows, dtype=int).reshape(-1, feature_width)

    bond_pairs = [
        [bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()]
        for bond in molecule.GetBonds()
    ]
    # Keep the (num_bonds, 2) layout even for bond-free molecules (single atoms,
    # lone ions), which would otherwise produce a flat array of shape (0,).
    edges = np.array(bond_pairs, dtype=np.int64).reshape(-1, 2)

    return atom_features, edges

def Add_Graph(test2):
    """Attach molecular-graph columns for both drugs to ``test2`` in place.

    For each of the ``Drug1`` and ``Drug2`` columns (SMILES strings), two new
    columns are written: ``<Drug>_Atom_Feature`` with the atom-feature array
    and ``<Drug>_Atom_Edges`` with the bond list, both as returned by
    ``smile_to_graph``.

    Parameters
    ----------
    test2 : pandas.DataFrame
        Frame with ``Drug1`` and ``Drug2`` columns. It is modified in place.

    Returns
    -------
    None
    """
    # Parse each distinct SMILES string found in this frame exactly once. The
    # lookup is built from the frame itself, so the function does not rely on a
    # notebook-level drug list and cannot silently store None for a drug that
    # such a list happens to miss.
    distinct_smiles = pd.concat(
        [test2['Drug1'], test2['Drug2']], ignore_index=True
    ).unique()
    graph_by_smiles = {smiles: smile_to_graph(smiles) for smiles in distinct_smiles}

    for drug_column in ('Drug1', 'Drug2'):
        graphs = [graph_by_smiles[smiles] for smiles in test2[drug_column]]
        # Feature column first, then edges, so the resulting column order is
        # Drug1 feature, Drug1 edges, Drug2 feature, Drug2 edges.
        test2[drug_column + '_Atom_Feature'] = [features for features, _ in graphs]
        test2[drug_column + '_Atom_Edges'] = [edges for _, edges in graphs]

# Add_Graph writes its new columns directly onto the frame it is given, so after
# these two calls both splits carry the Drug1/Drug2 *_Atom_Feature and
# *_Atom_Edges columns. Nothing is returned.
Add_Graph(train_val)
Add_Graph(test)


In [154]:
# Preview the first rows, including the graph columns added by Add_Graph.
train_val.head()

Unnamed: 0,Drug1_ID,Drug2_ID,Cell_Line_ID,Drug1,Drug2,CellLine,Y,target_class,Drug1_Atom_Feature,Drug1_Atom_Edges,Drug2_Atom_Feature,Drug2_Atom_Edges
0,1-(5-DEOXYPENTOFURANOSYL)-5-FLUORO-4-{[(PENTYL...,ALLOPURINOL,786-0,CCCCCOC(=O)NC1=NC(=O)N(C=C1F)C2C(C(C(O2)C)O)O,C1=NNC2=C1C(=O)NC=N2,"[[0.0, 0.057, 2.564, 0.0, 0.0, 0.015, 0.464, 0...",-4.4265,kidney,"[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6...","[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6..."
1,1-(5-DEOXYPENTOFURANOSYL)-5-FLUORO-4-{[(PENTYL...,AMIFOSTINE,786-0,CCCCCOC(=O)NC1=NC(=O)N(C=C1F)C2C(C(C(O2)C)O)O,C(CN)CNCCSP(=O)(O)O,"[[0.0, 0.057, 2.564, 0.0, 0.0, 0.015, 0.464, 0...",-0.1615,kidney,"[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6...","[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 1], [1, 2], [0, 3], [3, 4], [4, 5], [5, 6..."
2,1-(5-DEOXYPENTOFURANOSYL)-5-FLUORO-4-{[(PENTYL...,AMINOLEVULINIC ACID HYDROCHLORIDE,786-0,CCCCCOC(=O)NC1=NC(=O)N(C=C1F)C2C(C(C(O2)C)O)O,C(CC(=O)O)C(=O)CN.Cl,"[[0.0, 0.057, 2.564, 0.0, 0.0, 0.015, 0.464, 0...",13.5085,kidney,"[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6...","[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 1], [1, 2], [2, 3], [2, 4], [0, 5], [5, 6..."
3,1-(5-DEOXYPENTOFURANOSYL)-5-FLUORO-4-{[(PENTYL...,ANASTROZOLE,786-0,CCCCCOC(=O)NC1=NC(=O)N(C=C1F)C2C(C(C(O2)C)O)O,CC(C)(C#N)C1=CC(=CC(=C1)CN2C=NC=N2)C(C)(C)C#N,"[[0.0, 0.057, 2.564, 0.0, 0.0, 0.015, 0.464, 0...",-0.988,kidney,"[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6...","[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 1], [1, 2], [1, 3], [3, 4], [1, 5], [5, 6..."
4,1-(5-DEOXYPENTOFURANOSYL)-5-FLUORO-4-{[(PENTYL...,AZACYTIDINE,786-0,CCCCCOC(=O)NC1=NC(=O)N(C=C1F)C2C(C(C(O2)C)O)O,C1=NC(=NC(=O)N1C2C(C(C(O2)CO)O)O)N,"[[0.0, 0.057, 2.564, 0.0, 0.0, 0.015, 0.464, 0...",13.676,kidney,"[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6...","[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [4, 6..."


In [153]:
# train_val.to_csv('/Users/derrick/Desktop/CSC413/train_val_Graph.csv')
# test.to_csv('/Users/derrick/Desktop/CSC413/test_Graph.csv')

In [85]:
# # Taken from that "Preprocess" link
# import networkx as nx

# smiles = "CCCOC(=O)NC"
# def one_of_k_encoding(x, allowable_set):
#     if x not in allowable_set:
#         raise Exception("input {0} not in allowable set{1}:".format(x, allowable_set))
#     return list(map(lambda s: x == s, allowable_set))
# def one_of_k_encoding_unk(x, allowable_set):
#     """Maps inputs not in the allowable set to the last element."""
#     if x not in allowable_set:
#         x = allowable_set[-1]
#     return list(map(lambda s: x == s, allowable_set))

# def atom_features(atom):
#     return np.array(one_of_k_encoding_unk(atom.GetSymbol(),['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na','Ca', 'Fe', 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb','Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn', 'H','Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr','Cr', 'Pt', 'Hg', 'Pb', 'Unknown']) +
#                     one_of_k_encoding(atom.GetDegree(), [0, 1, 2, 3, 4, 5, 6,7,8,9,10]) +
#                     one_of_k_encoding_unk(atom.GetTotalNumHs(), [0, 1, 2, 3, 4, 5, 6,7,8,9,10]) +
#                     one_of_k_encoding_unk(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5, 6,7,8,9,10]) +
#                     [atom.GetIsAromatic()])
# def smile_to_graph(smile):
#     mol = Chem.MolFromSmiles(smile)
    
#     c_size = mol.GetNumAtoms()
    
#     features = []
#     for atom in mol.GetAtoms():
#         feature = atom_features(atom)
#         features.append( feature / sum(feature) )

#     edges = []
#     for bond in mol.GetBonds():
#         edges.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])
#     g = nx.Graph(edges).to_directed()
#     edge_index = []
#     for e1, e2 in g.edges:
#         edge_index.append([e1, e2])
        
#     return c_size, features, edge_index
# c_size, features, edge_index = smile_to_graph(smiles)