In [1]:
from rdkit import Chem
import numpy as np
import pandas as pd
import torch
import torch_geometric
from torch_geometric.data import Data
from dgllife.utils import *

In [2]:
def atom_to_feature_vector(atom):
    """
    Converts rdkit atom object to feature list of indices
    :param mol: rdkit atom object
    :return: list
    8 features are canonical, 2 features are from OGB
    """
    featurizer_funcs = ConcatFeaturizer([atom_type_one_hot,
                                         atom_degree_one_hot,
                                         atom_implicit_valence_one_hot,
                                         atom_formal_charge,
                                         atom_num_radical_electrons,
                                         atom_hybridization_one_hot,
                                         atom_is_aromatic,
                                         atom_total_num_H_one_hot,
                                         atom_is_in_ring,
                                         atom_chirality_type_one_hot,
                                         ])
    atom_feature = featurizer_funcs(atom)
    return atom_feature

In [3]:
def bond_to_feature_vector(bond):
    """
    Converts rdkit bond object to feature list of indices
    :param mol: rdkit bond object
    :return: list
    """
    featurizer_funcs = ConcatFeaturizer([bond_type_one_hot,
                                         # bond_is_conjugated,
                                         # bond_is_in_ring,
                                         # bond_stereo_one_hot,
                                         ])
    bond_feature = featurizer_funcs(bond)

    return bond_feature

In [4]:
def smiles2graph(mol):
    """
    Converts SMILES string or rdkit's mol object to graph Data object without remove salt
    :input: SMILES string (str)
    :return: graph object
    """

    if isinstance(mol, Chem.rdchem.Mol):
        pass
    else:
        mol = Chem.MolFromSmiles(mol)

    # atoms
    atom_features_list = []
    for atom in mol.GetAtoms():
        atom_features_list.append(atom_to_feature_vector(atom))
    x = np.array(atom_features_list, dtype=np.int64)

    # bonds
    num_bond_features = 4  # bond type, bond stereo, is_conjugated
    if len(mol.GetBonds()) > 0:  # mol has bonds
        edges_list = []
        edge_features_list = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()

            edge_feature = bond_to_feature_vector(bond)

            # add edges in both directions
            edges_list.append((i, j))
            edge_features_list.append(edge_feature)
            edges_list.append((j, i))
            edge_features_list.append(edge_feature)

        # data.edge_index: Graph connectivity in COO format with shape [2, num_edges]
        edge_index = np.array(edges_list, dtype=np.int64).T

        # data.edge_attr: Edge feature matrix with shape [num_edges, num_edge_features]
        edge_attr = np.array(edge_features_list, dtype=np.int64)

    else:  # mol has no bonds
        edge_index = np.empty((2, 0), dtype=np.int64)
        edge_attr = np.empty((0, num_bond_features), dtype=np.int64)

    graph = Data(x=torch.tensor(x, dtype=torch.float),
                 edge_index=torch.tensor(edge_index, dtype=torch.long),
                 edge_attr=torch.tensor(edge_attr), dtype=torch.float)

    return graph

In [5]:
smiles = pd.read_csv("./Drug_smiles.csv")

In [6]:
smiles

Unnamed: 0.1,Unnamed: 0,Drug,SMILES
0,0,(+)-Bicuculline,CN1CCC2=CC3=C(C=C2C1C4C5=C(C6=C(C=C5)OCO6)C(=O...
1,1,(+)-PD 128907 hydrochloride,CCCN1CCOC2C1COC3=C2C=C(C=C3)O.Cl
2,2,(+)-Usnic acid,CC1=C(C(=C2C(=C1O)C3(C(=CC(=C(C3=O)C(=O)C)O)O2...
3,3,(-)-Blebbistatin,CC1=CC2=C(C=C1)N=C3C(C2=O)(CCN3C4=CC=CC=C4)O
4,4,(-)-Epigallocatechin gallate,C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C(=C3)O)O)O...
...,...,...,...
3118,3118,IGF-OSI,C[C@@]1(O)C[C@@H](C1)c2nc(c3ccc4ccc(nc4c3)c5cc...
3119,3119,LSM-4270,CC1=C2C(C(=O)C3(C(CC4C(C3C(C(C2(C)C)(CC1OC(=O)...
3120,3120,SCHEMBL1557791,CC1=NN(C=C1NC2=NC=C(C(=C2)NC3=CC=CC=C3C(=O)NC)...
3121,3121,SCHEMBL1559365,CC1=NN(C=C1NC2=NC=C(C(=C2)NC3=C(C=C(C=C3)F)C(=...


In [7]:
# 需要替换的特定值
replace_dict = {
    '3094/9/5': '3094-09-5',
    '6429/4/5': '6429-04-5',
    '7218/4/4': '7218-04-4'
}

# 仅替换指定的 Drug 值
smiles['Drug'] = smiles['Drug'].apply(lambda x: replace_dict[x] if x in replace_dict else x)

In [9]:
smiles.to_csv("./Drug_smiles.csv",index=False)

In [10]:
drug_map = np.load('./Drug_map.npy', allow_pickle=True)
drug_map = drug_map.item()

In [11]:
drug_map 

{'(+)-Bicuculline': 0,
 '(+)-PD 128907 hydrochloride': 1,
 '(+)-Usnic acid': 2,
 '(-)-Blebbistatin': 3,
 '(-)-Epigallocatechin gallate': 4,
 '(-)-Ethyl apovincaminate': 5,
 '(-)-MK 801 Maleate': 6,
 '(-)-Rapamycin': 7,
 '(-)-chlorpheniramine': 8,
 '(-)-huperzine A': 9,
 '(3S,4R)-Tofacitinib': 10,
 '(E/Z)-ferulic acid': 11,
 '(Phenylindolyl)maleimide deriv. 79': 12,
 '(R)-(-)-Rolipram': 13,
 '(R)-Bicalutamide': 14,
 '(S)-(+)-rolipram': 15,
 '.beta.,.beta.-Carotene': 16,
 '001, RAD': 17,
 '1,25-dihydroxy vitamin D3': 18,
 '1-(1-Naphthylmethyl)piperazine': 19,
 '1-DOCOSANOL': 20,
 '1-Hexadecanol': 21,
 '1-Methyl-D-tryptophan': 22,
 '1-beta-D-Xylofuranosyluracil': 23,
 '10-Hydroxycamptothecin': 24,
 '10-epi-Aclacinomycin A': 25,
 '1001645-58-4': 26,
 '10030-85-0': 27,
 '10058-F4': 28,
 '10074-G5': 29,
 '1032350-13-2': 30,
 '10356-76-0': 31,
 '10402-53-6': 32,
 '1047953-91-2': 33,
 '1062368-24-4': 34,
 '1072959-67-1': 35,
 '1080622-86-1': 36,
 '1095382-05-0': 37,
 '1096708-71-2': 38,
 '11-c

In [12]:
drug_list_from_dict = set(drug_map.keys())

In [13]:
drug_list_from_csv = set(smiles['Drug'].str.strip())

In [14]:
missing_in_dict = drug_list_from_csv - drug_list_from_dict

In [15]:
missing_in_dict

set()

In [16]:
drug_dict = {}
for key in drug_map:
    drug_dict[key] = smiles2graph(smiles.loc[smiles['Drug']==key,'SMILES'].values[0])

In [17]:
drug_dict

{'(+)-Bicuculline': Data(x=[27, 77], edge_index=[2, 64], edge_attr=[64, 4], dtype=torch.float32),
 '(+)-PD 128907 hydrochloride': Data(x=[19, 77], edge_index=[2, 40], edge_attr=[40, 4], dtype=torch.float32),
 '(+)-Usnic acid': Data(x=[25, 77], edge_index=[2, 54], edge_attr=[54, 4], dtype=torch.float32),
 '(-)-Blebbistatin': Data(x=[22, 77], edge_index=[2, 50], edge_attr=[50, 4], dtype=torch.float32),
 '(-)-Epigallocatechin gallate': Data(x=[33, 77], edge_index=[2, 72], edge_attr=[72, 4], dtype=torch.float32),
 '(-)-Ethyl apovincaminate': Data(x=[26, 77], edge_index=[2, 60], edge_attr=[60, 4], dtype=torch.float32),
 '(-)-MK 801 Maleate': Data(x=[25, 77], edge_index=[2, 54], edge_attr=[54, 4], dtype=torch.float32),
 '(-)-Rapamycin': Data(x=[65, 77], edge_index=[2, 136], edge_attr=[136, 4], dtype=torch.float32),
 '(-)-chlorpheniramine': Data(x=[19, 77], edge_index=[2, 40], edge_attr=[40, 4], dtype=torch.float32),
 '(-)-huperzine A': Data(x=[18, 77], edge_index=[2, 40], edge_attr=[40, 4], 

In [18]:
np.save('./drug_feature_graph.npy', drug_dict)