In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import torch
from rdkit import Chem
from torch_geometric import data as DATA

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('preprocessed_data.csv')
df.head()


Unnamed: 0,Drug,Protien,label
0,IC1=CNN=C1,MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGI...,1
1,CC(=O)NC1=CNN=C1C(=O)NC1=CC=C(F)C=C1,MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVPS...,1
2,CN(C)CCCN1C2=CC=CC=C2CCC2=C1C=C(Cl)C=C2,MLLARMNPQVQPENNGADTGPEQPLRARKTAELLVVKERNGVQCLL...,1
3,ClC1=CC=CC(N2CCN(CCCCOC3=CC4=C(CCC(=O)N4)C=C3)...,MANFTPVNGSSGNQSVRLVTSSSHNRYETVEMVFIATVTGSLSLVT...,1
4,OC1N=C(C2=CC=CC=C2Cl)C2=C(NC1=O)C=CC(Cl)=C2,MVSAKKVPAIALSAGVSFALLRFLCLAVCLNESPGQNQKEEKLCTE...,1


In [3]:
data = df.values
dataset = []


def one_of_k_encoding(x, allowable_set):
    if x not in allowable_set:
        raise Exception(
            "input {0} not in allowable set{1}:".format(x, allowable_set))
    return list(map(lambda s: x == s, allowable_set))


def one_of_k_encoding_unk(x, allowable_set):
    if x not in allowable_set:
        x = allowable_set[-1]
    return list(map(lambda s: x == s, allowable_set))


def atom_features(atom):
    return np.array(one_of_k_encoding_unk(atom.GetSymbol(), ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe', 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']) +
                    one_of_k_encoding(atom.GetDegree(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +
                    one_of_k_encoding_unk(atom.GetTotalNumHs(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +
                    one_of_k_encoding_unk(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +
                    [atom.GetIsAromatic()])


seq_voc = "ABCDEFGHIKLMNOPQRSTUVWXYZ"
seq_dict = {v: (i+1) for i, v in enumerate(seq_voc)}
seq_dict_len = len(seq_dict)
max_seq_len = 1000


def seq_cat(prot):
    x = np.zeros(max_seq_len)
    for i, ch in enumerate(prot[:max_seq_len]):
        x[i] = seq_dict[ch]
    return x


def smile_to_graph(smile):
    mol = Chem.MolFromSmiles(smile)

    c_size = mol.GetNumAtoms()

    features = []
    for atom in mol.GetAtoms():
        feature = atom_features(atom)
        features.append(feature / sum(feature))

    edges = []
    for bond in mol.GetBonds():
        edges.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])
    g = nx.Graph(edges).to_directed()
    edge_index = []
    for e1, e2 in g.edges:
        edge_index.append([e1, e2])

    return c_size, features, edge_index


dataset_new = []
for i in range(data.shape[0]):
    try:
        drug_smile = data[i, 0]
        protien_seq = data[i, 1]
        label = data[i, 2]
        c_size, features, edge_index = smile_to_graph(drug_smile)
        GCN_DATA = DATA.Data(x=torch.tensor(features, dtype=torch.float), edge_index=torch.tensor(
            edge_index, dtype=torch.long).transpose(1, 0), y=torch.tensor([label], dtype=torch.long))
        target = seq_cat(protien_seq)
        GCN_DATA.target = torch.LongTensor([target])
        dataset_new.append(GCN_DATA)
    except:
        pass
torch.save(dataset_new, 'featureextracteddata.pt')


  GCN_DATA = DATA.Data(x=torch.tensor(features, dtype=torch.float), edge_index=torch.tensor(
  GCN_DATA.target = torch.LongTensor([target])
[14:51:04] Explicit valence for atom # 4 F, 2, is greater than permitted
[14:51:04] Explicit valence for atom # 0 N, 4, is greater than permitted
[14:51:05] Explicit valence for atom # 4 F, 2, is greater than permitted
[14:51:10] Explicit valence for atom # 0 N, 4, is greater than permitted
[14:51:10] Explicit valence for atom # 0 N, 4, is greater than permitted
[14:51:12] Explicit valence for atom # 0 N, 4, is greater than permitted
[14:51:18] Explicit valence for atom # 0 N, 4, is greater than permitted
[14:51:19] Explicit valence for atom # 0 N, 4, is greater than permitted
[14:51:19] Explicit valence for atom # 0 N, 4, is greater than permitted
[14:51:22] Explicit valence for atom # 0 N, 4, is greater than permitted
[14:51:22] Explicit valence for atom # 0 N, 4, is greater than permitted
[14:51:22] Explicit valence for atom # 0 N, 4, is greater