In [1]:
import os
import re
import math
import random
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
import seaborn as sns
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import cloudpickle as pickle
### Pytorch
import torch
import torch.nn as nn
from fastai.tabular import *
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, roc_auc_score, average_precision_score, f1_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' #last_expr

In [2]:
GLOBAL_SEED = 20210117


def set_seed(seed):
    """Seed every random number generator this notebook touches.

    Applies `seed` to Python's `random`, NumPy's legacy global RNG and
    PyTorch (CPU generator, current CUDA device, and all CUDA devices).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed once at notebook start.
set_seed(GLOBAL_SEED)

# 1. Import dataset (protein descriptors)

In [3]:
# Load the pickled bundle; it must unpack into exactly six items, of which only the
# first (the feature table) is used in this notebook.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle (the bare open() call used before leaked it).
with open('NR.pkl', 'rb') as fh:
    [data_raw, _, _, _, _, _] = pickle.load(fh)

# Every column of data_raw is treated as a continuous model input; there are no categorical inputs.
# NOTE(review): if data_raw also contains the dep_var column ('Activity'), it ends up in
# cont_names as well - confirm that this matches the columns the classifier was trained on.
cont_names = data_raw.columns
cat_names = []
dep_var = 'Activity'

In [4]:
# Load the per-protein descriptor dictionary: seq[protein][column] -> value.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle (the bare open() call used before leaked it).
with open('DBs/seq.pkl', 'rb') as fh:
    seq = pickle.load(fh, encoding='bytes')
proteins = list(seq.keys())

# Protein descriptor columns are the training-table columns whose name contains 'NR'.
# Computed once instead of being re-filtered for every protein.
nr_columns = [col for col in data_raw.columns if 'NR' in col]

# One row per protein, one column per NR descriptor, built in a single constructor call.
seq_des = pd.DataFrame(
    [[seq[protein][col] for col in nr_columns] for protein in proteins],
    index=proteins,
    columns=nr_columns,
)
# Keep the protein id as a regular column too, so it survives later index resets.
seq_des['PID'] = list(seq_des.index)
seq_des.head()

Unnamed: 0,NR0B2,NR1A1,NR1B1,NR1C1,NR1C2,NR1C3,NR1D1,NR1D2,NR1F1,NR1F3,...,NR2F2,NR3B1,NR3B2,NR3B3,NR3C4,NR4A1,NR4A2,NR5A1,NR5A2,PID
NR3B1,2.02,39.11,2.42,34.68,40.73,35.89,4.03,35.89,38.71,23.79,...,3.63,100.0,3.23,75.0,33.87,41.53,46.37,49.6,52.42,NR3B1
NR3B2,2.04,10.2,50.0,5.1,18.37,8.16,59.18,21.43,13.27,19.39,...,50.0,8.16,100.0,10.2,15.31,17.35,11.22,4.08,12.24,NR3B2
NR3B3,2.05,40.57,2.05,34.02,34.02,35.66,2.05,33.2,28.28,34.43,...,2.05,76.23,4.1,100.0,31.56,42.21,51.64,51.64,53.69,NR3B3
NR2B1,2.08,38.33,1.67,40.83,40.0,37.92,2.08,35.42,32.92,33.33,...,55.83,52.08,2.92,53.33,42.08,38.33,38.33,54.58,56.67,NR2B1
NR1F3,2.02,49.6,2.02,40.32,43.95,45.97,4.84,46.37,72.18,100.0,...,26.61,23.79,7.66,33.87,39.92,53.23,50.4,28.23,43.95,NR1F3


# 2. Import model

In [5]:
# Load the pickled (unused item, classifier) pair; only the classifier is kept.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle (the bare open() call used before leaked it).
with open('NR_RF_N5000_n501_f0.1_dNone.pkl', 'rb') as fh:
    _, clf = pickle.load(fh)

# Protein ids: Pro_in_DB is the list every screening cell below iterates over;
# Pro_in_model is its first five entries and is not referenced by the cells shown here.
Pro_in_model = ['NR1C1', 'NR1C2', 'NR1C3', 'NR1H2', 'NR2B1']
Pro_in_DB = ['NR1C1', 'NR1C2', 'NR1C3', 'NR1H2', 'NR2B1', 'NR1H3', 'NR1H4', 'NR2B3', 'NR2B2', 'NR1D1', 'NR1I2']

# 3. Import ligands from databases and test

## 3.1. long-chain_descriptors

In [8]:
# Load the long-chain ligand descriptors (tab-separated; the first column is the row index).
ligands = pd.read_csv('DBs/long-chain_descriptors.tsv', index_col=0, sep='\t')
ligands

# Build the ligand x protein cross product as two row-aligned frames.
# DataFrame.append in a loop copied the whole frame on every iteration and no longer
# exists in pandas >= 2.0, so the rows are selected in one shot instead:
#   data_l - each ligand row repeated once per protein (ligand-major order)
#   data_p - the Pro_in_DB protein rows tiled once per ligand
data_l = ligands.iloc[np.repeat(np.arange(len(ligands)), len(Pro_in_DB))]
data_p = seq_des.loc[list(Pro_in_DB) * len(ligands)]

Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,PLM,PLM,,10.300651,-0.654562,10.300651,0.345,2.922966,177.935774,13.476845,...,0,0,0,0,0,0,0,0,11,0
1,HXA,HXA,,10.31572,-0.741177,10.31572,0.210483,3.51142,462.361587,17.719486,...,0,0,0,0,0,0,0,0,0,0
2,9CR,9CR,,10.534005,-0.912324,10.534005,0.259688,2.791585,566.514664,16.750712,...,0,0,0,0,0,0,0,0,0,0
3,REA,REA,,10.534005,-0.912324,10.534005,0.259688,2.791585,566.514664,16.750712,...,0,0,0,0,0,0,0,0,0,0
4,J57,J57,,10.577897,-0.994046,10.577897,0.183686,2.451787,684.612836,15.405413,...,0,0,0,0,0,0,0,0,0,0
5,7O0,7O0,,10.676526,-1.000478,10.676526,0.170682,1.922366,911.565275,17.648054,...,0,0,0,0,0,0,0,0,0,0
6,4XW,4XW,,10.670428,-0.664703,10.670428,0.312164,2.264586,369.903534,16.750712,...,0,0,0,0,0,0,0,0,0,0
7,754,754,,10.780865,-0.928114,10.780865,0.138713,2.530959,850.94957,21.819626,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Give both halves the same 0..n-1 row labels so they line up row-by-row,
# then glue protein and ligand descriptors together side by side.
data_p = data_p.reset_index(drop=True)
data_l = data_l.reset_index(drop=True)
data = pd.concat((data_p, data_l), axis=1, sort=False)
data.head(2)

Unnamed: 0,NR0B2,NR1A1,NR1B1,NR1C1,NR1C2,NR1C3,NR1D1,NR1D2,NR1F1,NR1F3,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,2.56,34.8,4.76,100.0,85.71,79.49,4.4,35.9,39.19,36.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0
1,2.55,37.82,4.73,85.09,100.0,79.27,4.36,38.91,47.27,39.64,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0


In [15]:
# Score every (protein, long-chain ligand) pair; column 1 of predict_proba is the
# probability of the class stored at clf.classes_[1].
probs = clf.predict_proba(data.loc[:, cont_names])[:, 1]

# NOTE(review): this is the only cell that stores 1 - probs (for a binary model, the
# probability of clf.classes_[0]); every other database cell stores probs unchanged under
# the same 'Prob' column name. Behaviour is kept as-is - confirm against clf.classes_
# which of the two is intended.
# The identifier columns and the score are assembled in one expression rather than by
# adding a column to a selection afterwards.
result = data.loc[:, ['PID', 'ID', 'NAME']].assign(Prob=1 - probs)
result.head(2)

# Create the output directory on first use so a fresh checkout does not fail on to_csv.
os.makedirs('DB_result', exist_ok=True)
result.to_csv('DB_result/long-chain_prob.csv')

Unnamed: 0,PID,ID,NAME,Prob
0,NR1C1,PLM,PLM,0.343812
1,NR1C2,PLM,PLM,0.197771


In [14]:
# Show only the predictions for protein NR2B1.
result[result['PID'].eq('NR2B1')]

Unnamed: 0,PID,ID,NAME,Prob
4,NR2B1,PLM,PLM,0.392382
15,NR2B1,HXA,HXA,0.844311
26,NR2B1,9CR,9CR,0.473054
37,NR2B1,REA,REA,0.473054
48,NR2B1,J57,J57,0.152695
59,NR2B1,7O0,7O0,0.355598
70,NR2B1,4XW,4XW,0.710579
81,NR2B1,754,754,0.964072


## 3.2. approved_drug_descriptors

In [46]:
# Load the approved-drug ligand descriptors (tab-separated; the first column is the row index).
ligands = pd.read_csv('DBs/approved_drug_descriptors.tsv', index_col=0, sep='\t')
ligands.head(2)

# Cross-join ligands x proteins without a per-row Python loop: each ligand row is repeated
# once per protein (ligand-major order) and the Pro_in_DB protein rows are tiled to match.
# Column order stays ligand columns first, then protein columns; rows are numbered 0..n-1.
data = pd.concat(
    [
        ligands.iloc[np.repeat(np.arange(len(ligands)), len(Pro_in_DB))].reset_index(drop=True),
        seq_des.loc[list(Pro_in_DB) * len(ligands)].reset_index(drop=True),
    ],
    axis=1,
)

# Treat +/-inf as missing, then drop every pair that lacks any model input column.
data = data.replace([-np.inf, np.inf], np.nan)
data = data.dropna(subset=cont_names, how='any')
data.head(2)

Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,DB00006,Bivalirudin,128270-60-0,14.987859,-2.151153,14.987859,0.020043,1.052628,5529.131046,114.992591,...,0,0,0,0,0,0,0,0,1,0
1,DB00014,Goserelin,65807-02-5,14.781343,-1.801238,14.781343,0.004531,1.242726,3235.805624,66.527429,...,0,0,0,0,0,0,0,0,1,1


Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,NR2F2,NR3B1,NR3B2,NR3B3,NR3C4,NR4A1,NR4A2,NR5A1,NR5A2,PID
0,DB00006,Bivalirudin,128270-60-0,14.987859,-2.151153,14.987859,0.020043,1.052628,5529.131046,114.992591,...,36.26,31.5,2.56,30.4,13.55,35.9,30.04,25.64,37.73,NR1C1
1,DB00006,Bivalirudin,128270-60-0,14.987859,-2.151153,14.987859,0.020043,1.052628,5529.131046,114.992591,...,33.45,36.73,7.64,30.18,32.36,39.64,36.36,26.18,30.18,NR1C2


In [50]:
# Score every (ligand, protein) pair; column 1 of predict_proba is the probability of
# the class stored at clf.classes_[1].
# Inputs are clipped to +/-10e5 (= 1e6) first - presumably so extreme descriptor values
# stay representable when the model casts its input; TODO confirm the chosen bound.
probs = clf.predict_proba(
    data.loc[:, cont_names].values.clip(-10e5, 10e5)
)[:, 1]

# Identifier columns plus the score, assembled in one expression rather than by adding
# a column to a selection afterwards.
result = data.loc[:, ['PID', 'ID', 'NAME']].assign(Prob=probs)
result.head(2)

# Create the output directory on first use so a fresh checkout does not fail on to_csv.
os.makedirs('DB_result', exist_ok=True)
result.to_csv('DB_result/approved_drug_prob.csv')

Unnamed: 0,PID,ID,NAME,Prob
0,NR1C1,DB00006,Bivalirudin,0.497904
1,NR1C2,DB00006,Bivalirudin,0.55489


In [57]:
# Load the approved-drug scaffold descriptors (tab-separated; the first column is the row index).
ligands = pd.read_csv('DBs/approved_drug_scaffold_descriptors.tsv', index_col=0, sep='\t')
ligands.head(2)

# Cross-join ligands x proteins without a per-row Python loop: each ligand row is repeated
# once per protein (ligand-major order) and the Pro_in_DB protein rows are tiled to match.
# Column order stays ligand columns first, then protein columns; rows are numbered 0..n-1.
data = pd.concat(
    [
        ligands.iloc[np.repeat(np.arange(len(ligands)), len(Pro_in_DB))].reset_index(drop=True),
        seq_des.loc[list(Pro_in_DB) * len(ligands)].reset_index(drop=True),
    ],
    axis=1,
)

# Treat +/-inf as missing, then drop every pair that lacks any model input column.
data = data.replace([-np.inf, np.inf], np.nan)
data = data.dropna(subset=cont_names, how='any')
data.head(2)

# Score every remaining pair; column 1 of predict_proba is the probability of the class
# stored at clf.classes_[1]. Inputs are clipped to +/-10e5 (= 1e6) first - presumably so
# extreme descriptor values stay representable when the model casts its input; TODO confirm.
probs = clf.predict_proba(
    data.loc[:, cont_names].values.clip(-10e5, 10e5)
)[:, 1]

# Identifier columns plus the score, assembled in one expression rather than by adding
# a column to a selection afterwards.
result = data.loc[:, ['PID', 'ID', 'NAME']].assign(Prob=probs)
result.head(2)

# Create the output directory on first use so a fresh checkout does not fail on to_csv.
os.makedirs('DB_result', exist_ok=True)
result.to_csv('DB_result/approved_drug_scaffold_prob.csv')

Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,DB00006,Bivalirudin,128270-60-0,13.320973,-1.279664,13.320973,0.059982,0.771589,3532.207189,74.470894,...,0,0,0,0,0,0,0,0,0,0
1,DB00014,Goserelin,65807-02-5,13.984756,-1.262565,13.984756,0.021724,1.071721,2310.43652,45.425499,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,NR2F2,NR3B1,NR3B2,NR3B3,NR3C4,NR4A1,NR4A2,NR5A1,NR5A2,PID
0,DB00006,Bivalirudin,128270-60-0,13.320973,-1.279664,13.320973,0.059982,0.771589,3532.207189,74.470894,...,36.26,31.5,2.56,30.4,13.55,35.9,30.04,25.64,37.73,NR1C1
1,DB00006,Bivalirudin,128270-60-0,13.320973,-1.279664,13.320973,0.059982,0.771589,3532.207189,74.470894,...,33.45,36.73,7.64,30.18,32.36,39.64,36.36,26.18,30.18,NR1C2


Unnamed: 0,PID,ID,NAME,Prob
0,NR1C1,DB00006,Bivalirudin,0.535662
1,NR1C2,DB00006,Bivalirudin,0.605123


## 3.3. experimental_drug_descriptors

In [51]:
# Load the experimental-drug ligand descriptors (tab-separated; the first column is the row index).
ligands = pd.read_csv('DBs/experimental_drug_descriptors.tsv', index_col=0, sep='\t')
ligands.head(2)

# Cross-join ligands x proteins without a per-row Python loop: each ligand row is repeated
# once per protein (ligand-major order) and the Pro_in_DB protein rows are tiled to match.
# Column order stays ligand columns first, then protein columns; rows are numbered 0..n-1.
data = pd.concat(
    [
        ligands.iloc[np.repeat(np.arange(len(ligands)), len(Pro_in_DB))].reset_index(drop=True),
        seq_des.loc[list(Pro_in_DB) * len(ligands)].reset_index(drop=True),
    ],
    axis=1,
)

# Treat +/-inf as missing, then drop every pair that lacks any model input column.
data = data.replace([-np.inf, np.inf], np.nan)
data = data.dropna(subset=cont_names, how='any')
data.head(2)

Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,DB00466,Picrotoxin,124-87-8,12.405295,-1.470741,12.405295,0.188657,1.210974e-07,1500.406312,30.817108,...,0,0,0,0,0,0,0,0,0,0
1,DB00616,Candoxatril,123122-55-4,13.631175,-0.754409,13.631175,0.017595,1.426891,947.573584,26.252502,...,0,0,0,0,0,0,0,0,2,0


Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,NR2F2,NR3B1,NR3B2,NR3B3,NR3C4,NR4A1,NR4A2,NR5A1,NR5A2,PID
0,DB00466,Picrotoxin,124-87-8,12.405295,-1.470741,12.405295,0.188657,1.210974e-07,1500.406312,30.817108,...,36.26,31.5,2.56,30.4,13.55,35.9,30.04,25.64,37.73,NR1C1
1,DB00466,Picrotoxin,124-87-8,12.405295,-1.470741,12.405295,0.188657,1.210974e-07,1500.406312,30.817108,...,33.45,36.73,7.64,30.18,32.36,39.64,36.36,26.18,30.18,NR1C2


In [52]:
# Score every (ligand, protein) pair; column 1 of predict_proba is the probability of
# the class stored at clf.classes_[1].
# Inputs are clipped to +/-10e5 (= 1e6) first - presumably so extreme descriptor values
# stay representable when the model casts its input; TODO confirm the chosen bound.
probs = clf.predict_proba(
    data.loc[:, cont_names].values.clip(-10e5, 10e5)
)[:, 1]

# Identifier columns plus the score, assembled in one expression rather than by adding
# a column to a selection afterwards.
result = data.loc[:, ['PID', 'ID', 'NAME']].assign(Prob=probs)
result.head(2)

# Create the output directory on first use so a fresh checkout does not fail on to_csv.
os.makedirs('DB_result', exist_ok=True)
result.to_csv('DB_result/experimental_drug_prob.csv')

Unnamed: 0,PID,ID,NAME,Prob
0,NR1C1,DB00466,Picrotoxin,0.533932
1,NR1C2,DB00466,Picrotoxin,0.547904


In [58]:
# Load the experimental-drug scaffold descriptors (tab-separated; the first column is the row index).
ligands = pd.read_csv('DBs/experimental_drug_scaffold_descriptors.tsv', index_col=0, sep='\t')
ligands.head(2)

# Cross-join ligands x proteins without a per-row Python loop: each ligand row is repeated
# once per protein (ligand-major order) and the Pro_in_DB protein rows are tiled to match.
# Column order stays ligand columns first, then protein columns; rows are numbered 0..n-1.
data = pd.concat(
    [
        ligands.iloc[np.repeat(np.arange(len(ligands)), len(Pro_in_DB))].reset_index(drop=True),
        seq_des.loc[list(Pro_in_DB) * len(ligands)].reset_index(drop=True),
    ],
    axis=1,
)

# Treat +/-inf as missing, then drop every pair that lacks any model input column.
data = data.replace([-np.inf, np.inf], np.nan)
data = data.dropna(subset=cont_names, how='any')
data.head(2)

# Score every remaining pair; column 1 of predict_proba is the probability of the class
# stored at clf.classes_[1]. Inputs are clipped to +/-10e5 (= 1e6) first - presumably so
# extreme descriptor values stay representable when the model casts its input; TODO confirm.
probs = clf.predict_proba(
    data.loc[:, cont_names].values.clip(-10e5, 10e5)
)[:, 1]

# Identifier columns plus the score, assembled in one expression rather than by adding
# a column to a selection afterwards.
result = data.loc[:, ['PID', 'ID', 'NAME']].assign(Prob=probs)
result.head(2)

# Create the output directory on first use so a fresh checkout does not fail on to_csv.
os.makedirs('DB_result', exist_ok=True)
result.to_csv('DB_result/experimental_drug_scaffold_prob.csv')

Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,DB00466,Picrotoxin,124-87-8,11.831945,-0.680324,11.831945,0.00662,1e-07,969.311752,21.308672,...,0,0,0,0,0,0,0,0,0,0
1,DB00616,Candoxatril,123122-55-4,13.07106,-0.366665,13.07106,0.182621,1.324071,714.560488,19.39913,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,NR2F2,NR3B1,NR3B2,NR3B3,NR3C4,NR4A1,NR4A2,NR5A1,NR5A2,PID
0,DB00466,Picrotoxin,124-87-8,11.831945,-0.680324,11.831945,0.00662,1e-07,969.311752,21.308672,...,36.26,31.5,2.56,30.4,13.55,35.9,30.04,25.64,37.73,NR1C1
1,DB00466,Picrotoxin,124-87-8,11.831945,-0.680324,11.831945,0.00662,1e-07,969.311752,21.308672,...,33.45,36.73,7.64,30.18,32.36,39.64,36.36,26.18,30.18,NR1C2


Unnamed: 0,PID,ID,NAME,Prob
0,NR1C1,DB00466,Picrotoxin,0.581836
1,NR1C2,DB00466,Picrotoxin,0.601464


## 3.4. chembl_random_descriptors

In [53]:
# Load the random ChEMBL ligand descriptors (tab-separated; the first column is the row index).
ligands = pd.read_csv('DBs/chembl_random_descriptors.tsv', index_col=0, sep='\t')
ligands.head(2)

# Cross-join ligands x proteins without a per-row Python loop: each ligand row is repeated
# once per protein (ligand-major order) and the Pro_in_DB protein rows are tiled to match.
# Column order stays ligand columns first, then protein columns; rows are numbered 0..n-1.
data = pd.concat(
    [
        ligands.iloc[np.repeat(np.arange(len(ligands)), len(Pro_in_DB))].reset_index(drop=True),
        seq_des.loc[list(Pro_in_DB) * len(ligands)].reset_index(drop=True),
    ],
    axis=1,
)

# Treat +/-inf as missing, then drop every pair that lacks any model input column.
data = data.replace([-np.inf, np.inf], np.nan)
data = data.dropna(subset=cont_names, how='any')
data.head(2)

Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CHEMBL1783156,CHEMBL1783156,,9.761811,0.118821,9.761811,0.118821,1.873009,834.567508,15.104084,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL1093006,CHEMBL1093006,,14.430057,-0.697576,14.430057,0.014032,1.998322,857.91273,22.802754,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,NR2F2,NR3B1,NR3B2,NR3B3,NR3C4,NR4A1,NR4A2,NR5A1,NR5A2,PID
0,CHEMBL1783156,CHEMBL1783156,,9.761811,0.118821,9.761811,0.118821,1.873009,834.567508,15.104084,...,36.26,31.5,2.56,30.4,13.55,35.9,30.04,25.64,37.73,NR1C1
1,CHEMBL1783156,CHEMBL1783156,,9.761811,0.118821,9.761811,0.118821,1.873009,834.567508,15.104084,...,33.45,36.73,7.64,30.18,32.36,39.64,36.36,26.18,30.18,NR1C2


In [54]:
# Score every (ligand, protein) pair; column 1 of predict_proba is the probability of
# the class stored at clf.classes_[1].
# Inputs are clipped to +/-10e5 (= 1e6) first - presumably so extreme descriptor values
# stay representable when the model casts its input; TODO confirm the chosen bound.
probs = clf.predict_proba(
    data.loc[:, cont_names].values.clip(-10e5, 10e5)
)[:, 1]

# Identifier columns plus the score, assembled in one expression rather than by adding
# a column to a selection afterwards.
result = data.loc[:, ['PID', 'ID', 'NAME']].assign(Prob=probs)
result.head(2)

# Create the output directory on first use so a fresh checkout does not fail on to_csv.
os.makedirs('DB_result', exist_ok=True)
result.to_csv('DB_result/chembl_random_prob.csv')

Unnamed: 0,PID,ID,NAME,Prob
0,NR1C1,CHEMBL1783156,CHEMBL1783156,0.706587
1,NR1C2,CHEMBL1783156,CHEMBL1783156,0.714571


In [59]:
# Load the random ChEMBL scaffold descriptors (tab-separated; the first column is the row index).
ligands = pd.read_csv('DBs/chembl_random_scaffold_descriptors.tsv', index_col=0, sep='\t')
ligands.head(2)

# Cross-join ligands x proteins without a per-row Python loop: each ligand row is repeated
# once per protein (ligand-major order) and the Pro_in_DB protein rows are tiled to match.
# Column order stays ligand columns first, then protein columns; rows are numbered 0..n-1.
data = pd.concat(
    [
        ligands.iloc[np.repeat(np.arange(len(ligands)), len(Pro_in_DB))].reset_index(drop=True),
        seq_des.loc[list(Pro_in_DB) * len(ligands)].reset_index(drop=True),
    ],
    axis=1,
)

# Treat +/-inf as missing, then drop every pair that lacks any model input column.
data = data.replace([-np.inf, np.inf], np.nan)
data = data.dropna(subset=cont_names, how='any')
data.head(2)

# Score every remaining pair; column 1 of predict_proba is the probability of the class
# stored at clf.classes_[1]. Inputs are clipped to +/-10e5 (= 1e6) first - presumably so
# extreme descriptor values stay representable when the model casts its input; TODO confirm.
probs = clf.predict_proba(
    data.loc[:, cont_names].values.clip(-10e5, 10e5)
)[:, 1]

# Identifier columns plus the score, assembled in one expression rather than by adding
# a column to a selection afterwards.
result = data.loc[:, ['PID', 'ID', 'NAME']].assign(Prob=probs)
result.head(2)

# Create the output directory on first use so a fresh checkout does not fail on to_csv.
os.makedirs('DB_result', exist_ok=True)
result.to_csv('DB_result/chembl_random_scaffold_prob.csv')

Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CHEMBL1783156,CHEMBL1783156,,3.621288,0.289213,3.621288,0.289213,1.911658,712.465086,12.65649,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL1093006,CHEMBL1093006,,11.57047,-0.097422,11.57047,0.054368,1.650956,496.672087,13.338653,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,NR2F2,NR3B1,NR3B2,NR3B3,NR3C4,NR4A1,NR4A2,NR5A1,NR5A2,PID
0,CHEMBL1783156,CHEMBL1783156,,3.621288,0.289213,3.621288,0.289213,1.911658,712.465086,12.65649,...,36.26,31.5,2.56,30.4,13.55,35.9,30.04,25.64,37.73,NR1C1
1,CHEMBL1783156,CHEMBL1783156,,3.621288,0.289213,3.621288,0.289213,1.911658,712.465086,12.65649,...,33.45,36.73,7.64,30.18,32.36,39.64,36.36,26.18,30.18,NR1C2


Unnamed: 0,PID,ID,NAME,Prob
0,NR1C1,CHEMBL1783156,CHEMBL1783156,0.655546
1,NR1C2,CHEMBL1783156,CHEMBL1783156,0.691617


## 3.5. TCM_Taiwan_descriptors

In [55]:
# Load the TCM Taiwan ligand descriptors (tab-separated; the first column is the row index).
ligands = pd.read_csv('DBs/TCM_Taiwan_descriptors.tsv', index_col=0, sep='\t')
ligands.head(2)

# Cross-join ligands x proteins without a per-row Python loop: each ligand row is repeated
# once per protein (ligand-major order) and the Pro_in_DB protein rows are tiled to match.
# Column order stays ligand columns first, then protein columns; rows are numbered 0..n-1.
data = pd.concat(
    [
        ligands.iloc[np.repeat(np.arange(len(ligands)), len(Pro_in_DB))].reset_index(drop=True),
        seq_des.loc[list(Pro_in_DB) * len(ligands)].reset_index(drop=True),
    ],
    axis=1,
)

# Treat +/-inf as missing, then drop every pair that lacks any model input column.
data = data.replace([-np.inf, np.inf], np.nan)
data = data.dropna(subset=cont_names, how='any')
data.head(2)

Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,ZINC85532298,ZINC85532298,,12.261224,-0.746971,12.261224,0.056266,1.464288,966.576586,24.689505,...,0,0,0,0,0,0,0,0,0,0
1,ZINC86050507,ZINC86050507,,13.479324,-1.506164,13.479324,0.059785,1.951358,1746.049997,30.463373,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,NR2F2,NR3B1,NR3B2,NR3B3,NR3C4,NR4A1,NR4A2,NR5A1,NR5A2,PID
0,ZINC85532298,ZINC85532298,,12.261224,-0.746971,12.261224,0.056266,1.464288,966.576586,24.689505,...,36.26,31.5,2.56,30.4,13.55,35.9,30.04,25.64,37.73,NR1C1
1,ZINC85532298,ZINC85532298,,12.261224,-0.746971,12.261224,0.056266,1.464288,966.576586,24.689505,...,33.45,36.73,7.64,30.18,32.36,39.64,36.36,26.18,30.18,NR1C2


In [56]:
# Score every (ligand, protein) pair; column 1 of predict_proba is the probability of
# the class stored at clf.classes_[1].
# Inputs are clipped to +/-10e5 (= 1e6) first - presumably so extreme descriptor values
# stay representable when the model casts its input; TODO confirm the chosen bound.
probs = clf.predict_proba(
    data.loc[:, cont_names].values.clip(-10e5, 10e5)
)[:, 1]

# Identifier columns plus the score, assembled in one expression rather than by adding
# a column to a selection afterwards.
result = data.loc[:, ['PID', 'ID', 'NAME']].assign(Prob=probs)
result.head(2)

# Create the output directory on first use so a fresh checkout does not fail on to_csv.
os.makedirs('DB_result', exist_ok=True)
result.to_csv('DB_result/TCM_Taiwan_prob.csv')

Unnamed: 0,PID,ID,NAME,Prob
0,NR1C1,ZINC85532298,ZINC85532298,0.596008
1,NR1C2,ZINC85532298,ZINC85532298,0.666001


In [60]:
# Load the TCM Taiwan scaffold descriptors (tab-separated; the first column is the row index).
ligands = pd.read_csv('DBs/TCM_Taiwan_scaffold_descriptors.tsv', index_col=0, sep='\t')
ligands.head(2)

# Cross-join ligands x proteins without a per-row Python loop: each ligand row is repeated
# once per protein (ligand-major order) and the Pro_in_DB protein rows are tiled to match.
# Column order stays ligand columns first, then protein columns; rows are numbered 0..n-1.
data = pd.concat(
    [
        ligands.iloc[np.repeat(np.arange(len(ligands)), len(Pro_in_DB))].reset_index(drop=True),
        seq_des.loc[list(Pro_in_DB) * len(ligands)].reset_index(drop=True),
    ],
    axis=1,
)

# Treat +/-inf as missing, then drop every pair that lacks any model input column.
data = data.replace([-np.inf, np.inf], np.nan)
data = data.dropna(subset=cont_names, how='any')
data.head(2)

# Score every remaining pair; column 1 of predict_proba is the probability of the class
# stored at clf.classes_[1]. Inputs are clipped to +/-10e5 (= 1e6) first - presumably so
# extreme descriptor values stay representable when the model casts its input; TODO confirm.
probs = clf.predict_proba(
    data.loc[:, cont_names].values.clip(-10e5, 10e5)
)[:, 1]

# Identifier columns plus the score, assembled in one expression rather than by adding
# a column to a selection afterwards.
result = data.loc[:, ['PID', 'ID', 'NAME']].assign(Prob=probs)
result.head(2)

# Create the output directory on first use so a fresh checkout does not fail on to_csv.
os.makedirs('DB_result', exist_ok=True)
result.to_csv('DB_result/TCM_Taiwan_scaffold_prob.csv')

Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,ZINC85532298,ZINC85532298,,11.416821,-0.284544,11.416821,0.284544,1.344025,731.905493,18.216968,...,0,0,0,0,0,0,0,0,0,0
1,ZINC86050507,ZINC86050507,,12.863025,0.24392,12.863025,0.24392,1.839978,1269.557097,20.241912,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,ID,NAME,CAS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,BalabanJ,BertzCT,Chi0,...,NR2F2,NR3B1,NR3B2,NR3B3,NR3C4,NR4A1,NR4A2,NR5A1,NR5A2,PID
0,ZINC85532298,ZINC85532298,,11.416821,-0.284544,11.416821,0.284544,1.344025,731.905493,18.216968,...,36.26,31.5,2.56,30.4,13.55,35.9,30.04,25.64,37.73,NR1C1
1,ZINC85532298,ZINC85532298,,11.416821,-0.284544,11.416821,0.284544,1.344025,731.905493,18.216968,...,33.45,36.73,7.64,30.18,32.36,39.64,36.36,26.18,30.18,NR1C2


Unnamed: 0,PID,ID,NAME,Prob
0,NR1C1,ZINC85532298,ZINC85532298,0.591617
1,NR1C2,ZINC85532298,ZINC85532298,0.685543
