# Preliminary BERT-based model for drug repurposing

## Import required packages

In [None]:
# Mount Google Drive so the CSV inputs and the export directory under
# /content/drive/MyDrive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Request synchronous CUDA kernel launches so device-side errors are reported
# at the call that caused them. The variable is set through os.environ only:
# a `! VAR=value` shell line runs in a throwaway subshell and never changes the
# environment of this kernel process, so it was removed.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
import torch
import numpy as np
import pandas as pd
import pickle
import json
import re
from collections import OrderedDict

In [None]:
# Display the installed PyTorch version so the run can be reproduced.
torch.__version__

'1.10.0+cu111'

In [None]:
# Show the GPU attached to this runtime together with driver and CUDA versions.
!nvidia-smi

Sun Nov 28 04:49:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### **Testing**

In [None]:
import pandas as pd

In [None]:
# Target protein: FASTA sequence taken from the PDB entry linked below.
# https://www.rcsb.org/structure/3VB7
# The same sequence is repeated 10 times. `predict` passes this whole tensor
# together with each group of 10 drugs, so presumably every drug in a group is
# paired with one copy of the protein -- TODO confirm against the model.
test_proteins = ['SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDTVYCPRHVICTAEDMLNPNYEDLLIRKSNHSFLVQAGNVQLRVIGHSMQNCLLRLKVDTSNPKTPKYKFVRIQPGQTFSVLACYNGSPSGVYQCAMRPNHTIKGSFLNGSCGSVGFNIDYDCVSFCYMHHMELPTGVHAGTDLEGKFYGPFVDRQTAQAAGTDTTITLNVLAWLYAAVINGDRWFLNRFTTTLNDFNLVAMKYNYEPLTQDHVDILGPLSAQTGIAVLDMCAALKELLQNGMNGRTILGSTILEDEFTPFDVVRQCSGVTFQ'] * 10

# NOTE(review): `protein_to_vector` is not defined in the visible part of this
# notebook; it must already exist in the kernel before this cell runs.
test_protein_tensor = torch.tensor(protein_to_vector(test_proteins))

In [None]:
def drug_to_tensor(smiles):
  """Encode an iterable of SMILES strings as one torch tensor.

  The strings are first materialised into a plain list, then handed to
  `drug_to_vector` (defined elsewhere in the notebook), and the returned
  vector is wrapped with `torch.tensor`.
  """
  smiles_list = [entry for entry in smiles]
  return torch.tensor(drug_to_vector(smiles_list))

In [None]:
def predict(drug, protein, batch_size=10):
  """Run the global `model` over `drug` in mini-batches and collect the outputs.

  Unlike the previous version, a trailing group smaller than `batch_size` is
  no longer dropped silently, so the result always has one entry per drug.

  Args:
    drug: tensor of encoded drugs, one sample per index along dim 0.
    protein: tensor of encoded proteins that is passed with every full batch.
      Presumably it holds one row per drug in a batch -- TODO confirm.
    batch_size: number of drugs per forward pass. Defaults to 10, the value
      that used to be hard-coded.

  Returns:
    A list holding the model output for each drug, in input order.

  Raises:
    ValueError: if `batch_size` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError(f"batch_size must be >= 1, got {batch_size}")

  predicts = []
  # `model` and `device` are notebook-level names defined outside this function.
  model.to(device)
  model.eval()
  # Moving the protein tensor once is enough; it is identical for every batch.
  protein = protein.to(device)
  with torch.no_grad():
    for start in range(0, len(drug), batch_size):
      drugs = drug[start:start + batch_size].to(device)
      if len(drugs) == batch_size:
        # Full batch: pass the protein tensor unchanged, as before.
        proteins = protein
      else:
        # Short final batch: trim the protein rows to the number of drugs left.
        proteins = protein[:len(drugs)]
      batch_output = model(drugs, proteins)
      # Iterating the output splits it along dim 0, one entry per drug.
      predicts.extend(batch_output)
  return predicts

In [None]:
def concat_binding(predict_value,df):
  """Attach predicted binding affinities to `df` as a 'binding_affinity' column.

  Args:
    predict_value: iterable of single-value objects exposing `.item()`
      (for example the entries returned by `predict`), one per row of `df`.
    df: DataFrame to annotate. It is modified in place.

  Returns:
    The same `df` object. A new 'binding_affinity' column is appended as the
    last column; an existing one is overwritten where it already sits.

  Raises:
    ValueError: if the number of predictions differs from the number of rows.
  """
  binding_affinity = [value.item() for value in predict_value]
  # Plain assignment appends the column when it is new and replaces it when it
  # already exists, so re-running a cell on an already annotated frame no
  # longer fails the way `df.insert` did.
  df['binding_affinity'] = binding_affinity
  return df

### **Testing on the ChEMBL dataset**

In [None]:
# Load the ChEMBL output table (SMILES plus logP and QED columns, as shown in
# the preview below). The first CSV column is used as the row index.
chembl_path = '/content/drive/MyDrive/bio_project/chembl/chembl_output.csv'
df_chembl = pd.read_csv(chembl_path,index_col=0)
df_chembl.head()

Unnamed: 0,smiles,logP,qed
0,C=C(C)C(C)C,2.2185,0.451964
1,CCCC(C)C,2.4425,0.524779
2,CC(C)C(C)C,2.2984,0.49783
3,CCC(S)CC,2.1048,0.542195
4,CC1=CC=C1C,1.8926,0.45225


In [None]:
# Encode the SMILES column, score every molecule against the test protein and
# attach the scores as a new column.
# concat_binding() adds the column to the frame it receives, so it is given a
# copy: df_chembl stays exactly as loaded and this cell can be re-run safely.
chembl_drug = drug_to_tensor(df_chembl['smiles'])
chembl_predict = predict(chembl_drug,test_protein_tensor)
df_chembl_concat = concat_binding(chembl_predict,df_chembl.copy())

In [None]:
# Preview the ChEMBL table with the predicted binding_affinity column.
df_chembl_concat.head()

Unnamed: 0,smiles,logP,qed,binding_affinity
0,C=C(C)C(C)C,2.2185,0.451964,5.72857
1,CCCC(C)C,2.4425,0.524779,5.993808
2,CC(C)C(C)C,2.2984,0.49783,5.780473
3,CCC(S)CC,2.1048,0.542195,6.210887
4,CC1=CC=C1C,1.8926,0.45225,5.866986


In [None]:
# Summary statistics of the numeric columns, including the predictions.
df_chembl_concat.describe()

Unnamed: 0,logP,qed,binding_affinity
count,100.0,100.0,100.0
mean,4.178036,0.559319,5.849489
std,1.710954,0.102879,0.230022
min,1.7519,0.190986,5.249481
25%,2.856375,0.506744,5.693238
50%,3.7448,0.566942,5.88753
75%,5.384425,0.625855,6.01154
max,9.2713,0.794572,6.289449


### **Testing on the ZINC dataset**

In [None]:
# Load the ZINC output table; the first CSV column is used as the row index.
# NOTE(review): the file name 'zinc250k.csvzinc_output.csv' looks like two
# names joined together -- presumably it matches the file on Drive; verify
# before renaming anything.
zinc_path = '/content/drive/MyDrive/bio_project/zinc/zinc250k.csvzinc_output.csv'
df_zinc = pd.read_csv(zinc_path,index_col=0)
df_zinc.head()

Unnamed: 0,smiles,logP,qed
0,CC=CC(C)C,2.2185,0.451964
1,CCC=C(C)C,2.3626,0.452347
2,C#CC=C=C=C,1.1158,0.291686
3,CC=C(C)CC,2.3626,0.452347
4,C#CC(C)=CC,1.5858,0.410933


In [None]:
# Number of real ZINC molecules; used later to cut the padding rows off again.
zinc_row = df_zinc.shape[0]

In [None]:
# Padded copy of the ZINC table.
# The table is extended with placeholder '0' rows up to the next multiple of
# 10 so the molecules split into whole batches of 10 for `predict`. The pad
# length is derived from the row count instead of being hard-coded (it was 4
# for the 66-row file), so the cell stays correct if the input file changes.
add = ['0'] * ((-len(df_zinc)) % 10)
zinc_smile = list(df_zinc['smiles']) + add
zinc_logp = list(df_zinc['logP']) + add
zinc_qed = list(df_zinc['qed']) + add
# NOTE(review): the QED column is keyed 'zinc_qed' here but 'qed' in the
# ground-truth cell; kept unchanged because the key is part of the output.
data = {'smiles':zinc_smile, 'logP':zinc_logp, 'zinc_qed':zinc_qed}
df_cleaned_zinc = pd.DataFrame(data=data)
df_cleaned_zinc.shape

(70, 3)

In [None]:
# Encode the padded SMILES list, score it against the test protein and attach
# the scores as a new column.
# concat_binding() adds the column to the frame it receives, so it is given a
# copy: df_cleaned_zinc stays unchanged and this cell can be re-run safely.
zinc_drug = drug_to_tensor(df_cleaned_zinc['smiles'])
zinc_predict = predict(zinc_drug,test_protein_tensor)
df_zinc_concat = concat_binding(zinc_predict,df_cleaned_zinc.copy())

In [None]:
# Strip the padding again: keep the first `zinc_row` entries of every column
# and pair them with the matching predicted binding affinities.
selected_zinc_result = pd.DataFrame({
  'smiles': zinc_smile[:zinc_row],
  'logP': zinc_logp[:zinc_row],
  'qed': zinc_qed[:zinc_row],
  'binding affinity': df_zinc_concat['binding_affinity'].tolist()[:zinc_row],
})
selected_zinc_result.head()

Unnamed: 0,smiles,logP,qed,binding affinity
0,CC=CC(C)C,2.2185,0.451964,5.748038
1,CCC=C(C)C,2.3626,0.452347,5.770752
2,C#CC=C=C=C,1.1158,0.291686,6.007748
3,CC=C(C)CC,2.3626,0.452347,5.966787
4,C#CC(C)=CC,1.5858,0.410933,6.02607


In [None]:
# Summary statistics for the ZINC molecules (padding rows excluded).
selected_zinc_result.describe()

Unnamed: 0,logP,qed,binding affinity
count,66.0,66.0,66.0
mean,1.656964,0.405687,5.968432
std,0.556307,0.060442,0.182586
min,0.3585,0.291605,5.400861
25%,1.2942,0.355993,5.860543
50%,1.66415,0.410285,5.994709
75%,2.13585,0.449146,6.077771
max,2.612,0.526301,6.299644


### **Testing on the ZINC and ChEMBL dataset**

In [None]:
# Load the combined ZINC + ChEMBL output table; the first CSV column is used
# as the row index.
zinc_chembl_path = '/content/drive/MyDrive/bio_project/zinc_chembl/zinc_chembl_output.csv'
df_zinc_chembl = pd.read_csv(zinc_chembl_path,index_col=0)
df_zinc_chembl.head()

Unnamed: 0,smiles,logP,qed
0,CC=C(C)CC,2.3626,0.452347
1,CC#CC=CC,1.5858,0.410933
2,C#CCC(C)C,1.6657,0.447454
3,C#CC(=C)CC,1.5858,0.445006
4,CC#CC#CP,0.8457,0.302385


In [None]:
# Number of real molecules in the combined table; used later to cut the
# padding rows off again.
zinc_chembl_row = df_zinc_chembl.shape[0]
zinc_chembl_row

64

In [None]:
# Padded copy of the combined ZINC + ChEMBL table.
# The table is extended with placeholder '0' rows up to the next multiple of
# 10 so the molecules split into whole batches of 10 for `predict`. The pad
# length is derived from the row count instead of being hard-coded (it was 6
# for the 64-row file), so the cell stays correct if the input file changes.
add = ['0'] * ((-len(df_zinc_chembl)) % 10)
zinc_chembl_smile = list(df_zinc_chembl['smiles']) + add
zinc_chembl_logp = list(df_zinc_chembl['logP']) + add
zinc_chembl_qed = list(df_zinc_chembl['qed']) + add
# NOTE(review): the QED column is keyed 'zinc_qed' here but 'qed' in the
# ground-truth cell; kept unchanged because the key is part of the output.
data = {'smiles':zinc_chembl_smile, 'logP':zinc_chembl_logp, 'zinc_qed':zinc_chembl_qed}
df_cleaned_zinc_chembl = pd.DataFrame(data=data)
df_cleaned_zinc_chembl.shape

(70, 3)

In [None]:
# Encode the padded SMILES list, score it against the test protein and attach
# the scores as a new column.
# concat_binding() adds the column to the frame it receives, so it is given a
# copy: df_cleaned_zinc_chembl stays unchanged and this cell can be re-run.
zinc_chembl_drug = drug_to_tensor(df_cleaned_zinc_chembl['smiles'])
zinc_chembl_predict = predict(zinc_chembl_drug,test_protein_tensor)
df_zinc_chembl_concat = concat_binding(zinc_chembl_predict,df_cleaned_zinc_chembl.copy())

In [None]:
# Strip the padding again: keep the first `zinc_chembl_row` entries of every
# column and pair them with the matching predicted binding affinities.
selected_zinc_chembl_result = pd.DataFrame({
  'smiles': zinc_chembl_smile[:zinc_chembl_row],
  'logP': zinc_chembl_logp[:zinc_chembl_row],
  'qed': zinc_chembl_qed[:zinc_chembl_row],
  'binding affinity': df_zinc_chembl_concat['binding_affinity'].tolist()[:zinc_chembl_row],
})
selected_zinc_chembl_result.head()

Unnamed: 0,smiles,logP,qed,binding affinity
0,CC=C(C)CC,2.3626,0.452347,5.966787
1,CC#CC=CC,1.5858,0.410933,6.036074
2,C#CCC(C)C,1.6657,0.447454,5.58976
3,C#CC(=C)CC,1.5858,0.445006,6.096067
4,CC#CC#CP,0.8457,0.302385,6.19372


In [None]:
# Summary statistics for the combined set (padding rows excluded).
selected_zinc_chembl_result.describe()

Unnamed: 0,logP,qed,binding affinity
count,64.0,64.0,64.0
mean,1.676505,0.412336,6.02496
std,0.477883,0.061376,0.176659
min,0.563,0.291605,5.58976
25%,1.3489,0.399525,5.943217
50%,1.7141,0.413953,6.046926
75%,2.03945,0.449748,6.158419
max,2.6086,0.55908,6.273506


### **Testing on the ground-truth set**

In [None]:
# Load the labelled ground-truth molecules. The first CSV column
# (molecule_chembl_id in the preview below) becomes the row index.
# NOTE(review): the file name is spelled 'groudtruth' -- presumably this
# matches the file on Drive; do not correct it without renaming the file too.
path = '/content/drive/MyDrive/bio_project/chembl/groudtruth.csv'
chembl = pd.read_csv(path,index_col=0)
chembl.head()

Unnamed: 0_level_0,smiles,logP,qed,bioactivity_class
molecule_chembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,1.89262,0.757559,intermediate
CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,3.8132,0.487042,intermediate
CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,2.6605,0.485762,inactive
CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,3.6308,0.683944,inactive
CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],3.539,0.348717,intermediate


In [None]:
# Number of real ground-truth molecules; used later to cut the padding rows
# off again.
chembl_row = chembl.shape[0]

In [None]:
# Padded copy of the ground-truth table.
# The table is extended with placeholder '0' rows up to the next multiple of
# 10 so the molecules split into whole batches of 10 for `predict`. The pad
# length is derived from the row count instead of being hard-coded (it was 7
# for the 133-row file), so the cell stays correct if the input file changes.
add = ['0'] * ((-len(chembl)) % 10)
chembl_smile = list(chembl['smiles']) + add
chembl_logp = list(chembl['logP']) + add
chembl_qed = list(chembl['qed']) + add
chembl_class = list(chembl['bioactivity_class']) + add
data = {'smiles':chembl_smile, 'logP':chembl_logp, 'qed':chembl_qed, 'bioactivity_class':chembl_class}
chembl_cleaned = pd.DataFrame(data=data)
chembl_cleaned.shape

(140, 4)

In [None]:
# Encode the padded SMILES list, score it against the test protein and attach
# the scores as a new column.
# concat_binding() adds the column to the frame it receives, so it is given a
# copy: chembl_cleaned stays unchanged and this cell can be re-run safely.
chembl_drug_groudtruth = drug_to_tensor(chembl_cleaned['smiles'])
chembl_predict_groudtruth = predict(chembl_drug_groudtruth,test_protein_tensor)
chembl_concat = concat_binding(chembl_predict_groudtruth,chembl_cleaned.copy())

In [None]:
# Strip the padding again: keep the first `chembl_row` entries of every column
# and pair them with the matching predicted binding affinities and labels.
selected_chembl_result = pd.DataFrame({
  'smiles': chembl_smile[:chembl_row],
  'logP': chembl_logp[:chembl_row],
  'qed': chembl_qed[:chembl_row],
  'binding affinity': chembl_concat['binding_affinity'].tolist()[:chembl_row],
  'bioactivity_class': chembl_class[:chembl_row],
})
selected_chembl_result.head()

Unnamed: 0,smiles,logP,qed,binding affinity,bioactivity_class
0,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,1.89262,0.757559,5.268956,intermediate
1,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,3.8132,0.487042,5.500473,intermediate
2,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,2.6605,0.485762,5.691903,inactive
3,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,3.6308,0.683944,5.535729,inactive
4,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],3.539,0.348717,5.703414,intermediate


In [None]:
# Split the ground-truth results into one frame per bioactivity label.
chembl_active = selected_chembl_result.loc[selected_chembl_result['bioactivity_class'].eq('active')]
chembl_intermediate = selected_chembl_result.loc[selected_chembl_result['bioactivity_class'].eq('intermediate')]
chembl_inactive = selected_chembl_result.loc[selected_chembl_result['bioactivity_class'].eq('inactive')]
# Sanity check: each subset must hold exactly one distinct label
# (`.item()` raises otherwise, including when a subset is empty).
print(
  f"active set: {chembl_active['bioactivity_class'].unique().item()}",
  f"intermediate set: {chembl_intermediate['bioactivity_class'].unique().item()}",
  f"inactive set: {chembl_inactive['bioactivity_class'].unique().item()}",
  sep="\n",
)

active set: active
intermediate set: intermediate
inactive set: inactive


In [None]:
# Summary statistics for the molecules labelled 'active'.
chembl_active.describe()

Unnamed: 0,logP,qed,binding affinity
count,15.0,15.0,15.0
mean,3.777084,0.627761,5.547517
std,1.056293,0.156704,0.194753
min,2.4107,0.206778,5.20768
25%,2.8155,0.613257,5.4184
50%,3.7004,0.674688,5.558991
75%,4.31435,0.731628,5.623617
max,6.1012,0.76618,6.031115


In [None]:
# Summary statistics for the molecules labelled 'intermediate'.
chembl_intermediate.describe()

Unnamed: 0,logP,qed,binding affinity
count,14.0,14.0,14.0
mean,3.594304,0.569472,5.550263
std,1.165892,0.157902,0.274628
min,1.1272,0.286913,5.070745
25%,3.31852,0.490015,5.326273
50%,3.68656,0.553261,5.602699
75%,4.02503,0.664966,5.704352
max,6.3047,0.862078,6.071366


In [None]:
# Summary statistics for the molecules labelled 'inactive'.
chembl_inactive.describe()

Unnamed: 0,logP,qed,binding affinity
count,104.0,104.0,104.0
mean,3.969391,0.465817,5.553468
std,1.455105,0.197367,0.254877
min,-0.0554,0.035506,5.054952
25%,3.22032,0.305394,5.408427
50%,3.99644,0.43833,5.531061
75%,4.808425,0.650963,5.71752
max,7.05154,0.924827,6.287441


In [None]:
# Summary statistics over all ground-truth molecules, regardless of label.
selected_chembl_result.describe()

Unnamed: 0,logP,qed,binding affinity
count,133.0,133.0,133.0
mean,3.908219,0.494992,5.55246
std,1.38554,0.196768,0.249289
min,-0.0554,0.035506,5.054952
25%,3.22032,0.336282,5.408427
50%,3.7699,0.485762,5.549126
75%,4.66712,0.664966,5.708304
max,7.05154,0.924827,6.287441


### **Export**

In [None]:
# Directory on Drive that receives the summary CSVs. The trailing slash is
# required because file names are appended by plain string concatenation.
export_path = '/content/drive/MyDrive/bio_project/summary/'

In [None]:
# Write every summary table to the export directory as CSV without the index.
# The directory is created first so a missing folder does not abort the export
# (`os` is imported in the setup cell near the top of the notebook).
os.makedirs(export_path, exist_ok=True)
for filename, frame in (
  ('chembl_dataset_summary.csv', df_chembl_concat),
  ('zinc_dataset_summary.csv', selected_zinc_result),
  ('zinc_chembl_dataset.csv', selected_zinc_chembl_result),
  ('groundtruth_summary.csv', selected_chembl_result),
  ('groundtruth_active_summary.csv', chembl_active),
  ('groundtruth_intermediate_summary.csv', chembl_intermediate),
  ('groundtruth_inactive_summary.csv', chembl_inactive),
):
  frame.to_csv(export_path+filename,index=False)