# QSAR 모델 개발 및 Ligand-based Virtual Screening

- Input : Smiles
- Feature : ECFP
- Target Values : pChEMBL Value (pIC50)
- Models : Regression model
     - RandomForestRegression
     - FNN(pytorch)
     - Loss : MSE
- Model selection : validation set

## 1. 데이터 전처리

In [2]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
import deepchem
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


In [5]:
raw_chembl_df = pd.read_csv("LRRK2_ChEMBL.csv", sep = ';', encoding='utf-8')

In [6]:
raw_chembl_df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value
0,CHEMBL2348984,,,355.32,0,4.55,51,FC(F)(F)c1cccc(Nc2ccc3nnc(-c4ccccc4)n3n2)c1,IC50,'>',...,SINGLE PROTEIN,CHEMBL2346543,1,Scientific Literature,Bioorg Med Chem Lett,2013,,,,
1,CHEMBL2348982,,,450.47,1,5.20,49,CCOC(=O)c1cc(Sc2cccc(C(F)(F)F)c2)nn2c(-c3cccs3...,IC50,'>',...,SINGLE PROTEIN,CHEMBL2346543,1,Scientific Literature,Bioorg Med Chem Lett,2013,,,,
2,CHEMBL2348967,,,395.39,0,4.30,33,FC(F)(F)Oc1cccc(Sc2ccc3nnc(-c4cncs4)n3n2)c1,IC50,'=',...,SINGLE PROTEIN,CHEMBL2346543,1,Scientific Literature,Bioorg Med Chem Lett,2013,,,,
3,CHEMBL2348962,,,390.29,0,4.16,28,Brc1cccc(Sc2ccc3nnc(-c4cncs4)n3n2)c1,IC50,'=',...,SINGLE PROTEIN,CHEMBL2346543,1,Scientific Literature,Bioorg Med Chem Lett,2013,,,,
4,CHEMBL2348956,,,336.34,0,4.17,21,FC(F)(F)c1cccc(Sc2ccc3nnc(C4CC4)n3n2)c1,IC50,'=',...,SINGLE PROTEIN,CHEMBL2346543,1,Scientific Literature,Bioorg Med Chem Lett,2013,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2633,CHEMBL5169734,,,325.38,0,4.55,19,Cc1cccc(Nc2ccc3[nH]nc(-c4ccncc4)c3c2)c1C#N,IC50,'=',...,SINGLE PROTEIN,CHEMBL5154686,1,Scientific Literature,Eur J Med Chem,2022,,,INHIBITOR,
2634,CHEMBL5179942,,,351.41,0,4.60,50,N#Cc1ccc2c(c1)CCC2Nc1ccc2[nH]nc(-c3ccncc3)c2c1,IC50,'=',...,SINGLE PROTEIN,CHEMBL5154686,1,Scientific Literature,Eur J Med Chem,2022,,,INHIBITOR,
2635,CHEMBL5185049,,,337.34,0,4.16,14,N#Cc1ccc(Oc2ccc3[nH]nc(-c4ccncc4)c3c2)c(C#N)c1,IC50,'=',...,SINGLE PROTEIN,CHEMBL5154686,1,Scientific Literature,Eur J Med Chem,2022,,,INHIBITOR,
2636,CHEMBL5189805,,,341.37,0,4.25,18,COc1ccc(C#N)c(Nc2ccc3[nH]nc(-c4ccncc4)c3c2)c1,IC50,'=',...,SINGLE PROTEIN,CHEMBL5154686,1,Scientific Literature,Eur J Med Chem,2022,,,INHIBITOR,


In [8]:

raw_chembl_df.describe()

Unnamed: 0,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Standard Value,pChEMBL Value,Ligand Efficiency BEI,Ligand Efficiency LE,Ligand Efficiency LLE,Ligand Efficiency SEI,Potential Duplicate,Assay Tissue ChEMBL ID,Assay Tissue Name,Assay Subcellular Fraction,Source ID,Document Year,Standard Text Value
count,3.0,2638.0,2638.0,2638.0,2571.0,2441.0,2427.0,2427.0,2425.0,2427.0,2638.0,0.0,0.0,0.0,2638.0,2638.0,0.0
mean,3.666667,367.809151,0.077331,3.130686,1327.403884,7.230258,20.240886,0.379131,4.113192,9.261001,0.027293,,,,14.073161,2016.894617,
std,0.57735,65.360836,0.292897,1.010777,7409.463689,0.966685,3.774034,0.067665,1.357443,2.634694,0.162968,,,,17.501686,2.849845,
min,3.0,135.13,0.0,-0.11,0.001,4.04,8.88,0.16,-0.85,3.86,0.0,,,,1.0,2011.0,
25%,3.5,325.38,0.0,2.51,12.59,6.62,17.72,0.33,3.24,7.395,0.0,,,,1.0,2015.0,
50%,4.0,368.44,0.0,3.09,52.0,7.29,20.04,0.38,4.2,8.89,0.0,,,,1.0,2016.0,
75%,4.0,406.49,0.0,3.74,264.5,7.89,22.53,0.42,5.1,10.73,0.0,,,,37.0,2019.0,
max,4.0,727.91,2.0,6.9,101080.0,10.19,37.48,0.7,8.21,34.0,1.0,,,,65.0,2023.0,


In [9]:
raw_chembl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2638 entries, 0 to 2637
Data columns (total 47 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Molecule ChEMBL ID          2638 non-null   object 
 1   Molecule Name               31 non-null     object 
 2   Molecule Max Phase          3 non-null      float64
 3   Molecular Weight            2638 non-null   float64
 4   #RO5 Violations             2638 non-null   int64  
 5   AlogP                       2638 non-null   float64
 6   Compound Key                2638 non-null   object 
 7   Smiles                      2638 non-null   object 
 8   Standard Type               2638 non-null   object 
 9   Standard Relation           2564 non-null   object 
 10  Standard Value              2571 non-null   float64
 11  Standard Units              2571 non-null   object 
 12  pChEMBL Value               2441 non-null   float64
 13  Data Validity Comment       3 non

In [10]:
df = raw_chembl_df[['Smiles', 'pChEMBL Value']].copy(deep=True)
df.head()

Unnamed: 0,Smiles,pChEMBL Value
0,FC(F)(F)c1cccc(Nc2ccc3nnc(-c4ccccc4)n3n2)c1,
1,CCOC(=O)c1cc(Sc2cccc(C(F)(F)F)c2)nn2c(-c3cccs3...,
2,FC(F)(F)Oc1cccc(Sc2ccc3nnc(-c4cncs4)n3n2)c1,7.17
3,Brc1cccc(Sc2ccc3nnc(-c4cncs4)n3n2)c1,8.22
4,FC(F)(F)c1cccc(Sc2ccc3nnc(C4CC4)n3n2)c1,6.51


### pChEMBL Value 결측치 제거

In [12]:
df.describe(include='all')

Unnamed: 0,Smiles,pChEMBL Value
count,2638,2441.0
unique,1524,
top,Cc1cccc(-c2cc(N)nc3[nH]cc(C#N)c23)c1,
freq,17,
mean,,7.230258
std,,0.966685
min,,4.04
25%,,6.62
50%,,7.29
75%,,7.89


* Smiles > pChEMBL Value : pChEMBL Value 결측치가 존재한다
* 따라서 결측치 제거 필요

In [13]:
df = df.dropna(subset = ['pChEMBL Value'])
df.describe(include = 'all')

Unnamed: 0,Smiles,pChEMBL Value
count,2441,2441.0
unique,1426,
top,Cc1cccc(-c2cc(N)nc3[nH]cc(C#N)c23)c1,
freq,16,
mean,,7.230258
std,,0.966685
min,,4.04
25%,,6.62
50%,,7.29
75%,,7.89


* Smiles와 pChEMBL Value의 count 값이 같이졌다

### Smiles - pChEMBL 중복 값 제거

In [15]:
# 중복 줄 확인
df[df.duplicated(keep=False)]

Unnamed: 0,Smiles,pChEMBL Value
35,C[C@@H]1CN(c2cc(-c3n[nH]c4ccc(OC5(C)CC5)cc34)n...,9.12
36,CC(C)Oc1ccc2[nH]nc(-c3cc(N4CCOCC4)ncn3)c2c1,7.02
37,COc1cc(C(=O)N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc2c...,8.22
58,COC[C@H](C)Oc1ccc2[nH]nc(-c3cc(N4CCC5(CC4)CNC(...,8.54
59,COC[C@H](C)Oc1ccc2[nH]nc(-c3cc(N4CCO[C@H](C)C4...,8.55
...,...,...
2386,C[C@@H]1CCCN1c1c(C#N)c2c(N)nc(Nc3cn[nH]c3)nc2n1C,8.22
2404,COC[C@H](C)Oc1ccc2[nH]nc(-c3cc(N4CCO[C@H](C)C4...,8.55
2427,C[C@@H]1CN(c2cc(-c3n[nH]c4ccc(OC5(C)CC5)cc34)n...,9.00
2453,C[C@H]1CC1(C#N)c1ccc2cnn(-c3cc(N4CC5(C(C)(C)O)...,8.70


In [16]:
# (Smiles, pChEMBL Value)가 중복인 줄은 첫 번째 값만 남기고 나머지는 제거
df = df.drop_duplicates()
df.head()

Unnamed: 0,Smiles,pChEMBL Value
2,FC(F)(F)Oc1cccc(Sc2ccc3nnc(-c4cncs4)n3n2)c1,7.17
3,Brc1cccc(Sc2ccc3nnc(-c4cncs4)n3n2)c1,8.22
4,FC(F)(F)c1cccc(Sc2ccc3nnc(C4CC4)n3n2)c1,6.51
5,Fc1ccc(-c2nnc3ccc(Sc4cccc(C(F)(F)F)c4)nn23)cc1,6.75
6,COC(=O)C(C)Sc1ccc2ncc(-c3cccs3)n2n1,8.0


### 중복된 Smiles 처리 

In [17]:
df.describe(include = 'all')

Unnamed: 0,Smiles,pChEMBL Value
count,2328,2328.0
unique,1426,
top,Cc1cccc(-c2cc(N)nc3[nH]cc(C#N)c23)c1,
freq,11,
mean,,7.216147
std,,0.97104
min,,4.04
25%,,6.61
50%,,7.28
75%,,7.89


* count > unique : Smiles가 똑같은 줄이 있다
* Smiles는 똑같은데 pChEMBL Value가 다른 데이터에 대해서는 pChEMBL Value를 평균 내어서 넣어줌

In [18]:
# Smiles가 중복되는 줄 모두 가져오기
df_dup = df[df.duplicated(subset = ['Smiles'], keep=False)].copy(deep=False)
df_dup

Unnamed: 0,Smiles,pChEMBL Value
2,FC(F)(F)Oc1cccc(Sc2ccc3nnc(-c4cncs4)n3n2)c1,7.17
3,Brc1cccc(Sc2ccc3nnc(-c4cncs4)n3n2)c1,8.22
4,FC(F)(F)c1cccc(Sc2ccc3nnc(C4CC4)n3n2)c1,6.51
5,Fc1ccc(-c2nnc3ccc(Sc4cccc(C(F)(F)F)c4)nn23)cc1,6.75
6,COC(=O)C(C)Sc1ccc2ncc(-c3cccs3)n2n1,8.00
...,...,...
2633,Cc1cccc(Nc2ccc3[nH]nc(-c4ccncc4)c3c2)c1C#N,7.42
2634,N#Cc1ccc2c(c1)CCC2Nc1ccc2[nH]nc(-c3ccncc3)c2c1,7.40
2635,N#Cc1ccc(Oc2ccc3[nH]nc(-c4ccncc4)c3c2)c(C#N)c1,7.32
2636,COc1ccc(C#N)c(Nc2ccc3[nH]nc(-c4ccncc4)c3c2)c1,6.43


In [19]:
# Smiles 기준으로 mean()을 사용해서 각 group 마다 pChEMBL Value의 평균값 구하기
df_mean = df_dup.groupby(by='Smiles').mean().reset_index()
df_mean

Unnamed: 0,Smiles,pChEMBL Value
0,Brc1cccc(Sc2ccc3nnc(-c4cncs4)n3n2)c1,7.296667
1,Brc1cncc(-c2c[nH]c3ncnc(N4CCOCC4)c23)c1,8.035000
2,C1=C(c2c[nH]c3ncnc(N4CCOCC4)c23)CCCO1,6.950000
3,C1=C(c2c[nH]c3ncnc(N4CCOCC4)c23)COCC1,6.735000
4,C=CS(=O)(=O)Nc1cccc(Nc2ncnc3cc(OC)c(OC)cc23)c1,7.005000
...,...,...
685,c1cnc2c(-c3c[nH]c4ncnc(N5CCOCC5)c34)cnn2c1,7.185000
686,c1cncc(-c2c[nH]c3ncnc(N4CCOCC4)c23)c1,7.130000
687,c1cnn2c(-c3c[nH]c4ncnc(N5CCOCC5)c34)cnc2c1,8.055000
688,c1nc(N2CCOCC2)c2c(-c3cn[nH]c3)c[nH]c2n1,7.540000


In [20]:
# Smiles 기준 중복인 줄 모두 제거
df_drop = df.drop_duplicates(subset = 'Smiles', keep = False).copy(deep = True)
df_drop.head()

Unnamed: 0,Smiles,pChEMBL Value
15,COc1ccc(OC)c(Sc2cc(C)c3nnc(-c4cnn(C)c4)n3n2)c1,4.5
16,COc1ccc(OC)c(C(O)c2ccc3nnc(-c4cnn(C)c4)n3n2)c1,5.04
17,COc1ccc(C(C)C)cc1Sc1ccc2nnc(-c3cnn(C)c3)n2n1,5.87
19,COc1ccc(OC)c(Sc2ccc3nnc(-c4cnccn4)n3n2)c1,6.06
20,COc1ccc(OC)c(Sc2ccc3nnc(-c4ccncc4)n3n2)c1,6.46


In [21]:
# 원래 데이터프레임에 (smiles, mean value) 데이터 추가
df = pd.concat([df_drop, df_mean])
df

Unnamed: 0,Smiles,pChEMBL Value
15,COc1ccc(OC)c(Sc2cc(C)c3nnc(-c4cnn(C)c4)n3n2)c1,4.500
16,COc1ccc(OC)c(C(O)c2ccc3nnc(-c4cnn(C)c4)n3n2)c1,5.040
17,COc1ccc(C(C)C)cc1Sc1ccc2nnc(-c3cnn(C)c3)n2n1,5.870
19,COc1ccc(OC)c(Sc2ccc3nnc(-c4cnccn4)n3n2)c1,6.060
20,COc1ccc(OC)c(Sc2ccc3nnc(-c4ccncc4)n3n2)c1,6.460
...,...,...
685,c1cnc2c(-c3c[nH]c4ncnc(N5CCOCC5)c34)cnn2c1,7.185
686,c1cncc(-c2c[nH]c3ncnc(N4CCOCC4)c23)c1,7.130
687,c1cnn2c(-c3c[nH]c4ncnc(N5CCOCC5)c34)cnc2c1,8.055
688,c1nc(N2CCOCC2)c2c(-c3cn[nH]c3)c[nH]c2n1,7.540


In [22]:
df.describe(include = 'all')

Unnamed: 0,Smiles,pChEMBL Value
count,1426,1426.0
unique,1426,
top,COc1ccc(OC)c(Sc2cc(C)c3nnc(-c4cnn(C)c4)n3n2)c1,
freq,1,
mean,,7.278293
std,,0.98057
min,,4.045
25%,,6.745
50%,,7.35
75%,,7.94125


* 이제 Smiles의 count와 unique의 값이 같아졌다.
* 데이터 전처리 완료! 

### Column 이름 바꾸기
pChEMBL -> pIC50으로 변경

In [23]:
df = df.rename(columns = {'pChEMBL Value' : 'pIC50'})
df

Unnamed: 0,Smiles,pIC50
15,COc1ccc(OC)c(Sc2cc(C)c3nnc(-c4cnn(C)c4)n3n2)c1,4.500
16,COc1ccc(OC)c(C(O)c2ccc3nnc(-c4cnn(C)c4)n3n2)c1,5.040
17,COc1ccc(C(C)C)cc1Sc1ccc2nnc(-c3cnn(C)c3)n2n1,5.870
19,COc1ccc(OC)c(Sc2ccc3nnc(-c4cnccn4)n3n2)c1,6.060
20,COc1ccc(OC)c(Sc2ccc3nnc(-c4ccncc4)n3n2)c1,6.460
...,...,...
685,c1cnc2c(-c3c[nH]c4ncnc(N5CCOCC5)c34)cnn2c1,7.185
686,c1cncc(-c2c[nH]c3ncnc(N4CCOCC4)c23)c1,7.130
687,c1cnn2c(-c3c[nH]c4ncnc(N5CCOCC5)c34)cnc2c1,8.055
688,c1nc(N2CCOCC2)c2c(-c3cn[nH]c3)c[nH]c2n1,7.540


In [25]:
# 데이터프레임을 csv 파일로 저장
df.to_csv("ChEMBL_LRRK2_IC50.csv", index = True)