In [1]:
import os

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw



In [2]:
# Load the source table; later cells read its "SMILES" column to build molecules.
df = pd.read_csv("data/fukunishi_data.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,No.,Compound ID,SMILES,Assay ID,LogP app
0,1,CHEMBL121893,[NH+]1(CCCCC1)CCC(=O)c1ccc(OCCCCCC)cc1\t\n,CHEMBL1034536,-2.67
1,2,CHEMBL538150,C(=O)(c1cc2c(cc(OCCCCCC)cc2)cc1)CC[NH+](C)C\t\n,CHEMBL1034536,-2.85
2,3,CHEMBL539139,C(=O)(c1ccc(OCCOCC)cc1)CC[NH+](C)C\t\n,CHEMBL1034536,-3.8
3,4,CHEMBL539393,C(=O)(c1cc(C)c(OCCCCCC)cc1)CC[NH+](C)C\t\n,CHEMBL1034536,-2.91
4,5,CHEMBL539718,C(=O)(c1c(Cl)cc(SCCCCCC)cc1)CC[NH+](C)C\t\n,CHEMBL1034536,-3.22


# 4-1. RDKitの準備 
PythonにRDKitをインストールし、SMILES式で表現された化合物を読み込んで構造式が出力できるようにせよ。

In [4]:
# The raw SMILES strings end with "\t\n" (visible in df.head()), so strip the
# surrounding whitespace before handing them to RDKit.
mols = [Chem.MolFromSmiles(smiles.strip()) for smiles in df["SMILES"]]

# MolFromSmiles returns None instead of raising when a string cannot be parsed.
# Fail here, with the offending row positions, rather than deep inside a later
# drawing or descriptor call.
failed_rows = [row for row, mol in enumerate(mols) if mol is None]
assert not failed_rows, f"SMILES parsing failed for rows: {failed_rows}"

In [9]:
# Draw the first 12 molecules as a grid of structure images.
img = Draw.MolsToGridImage(mols[:12])
# Create the output folder first; saving fails when "result/" does not exist.
os.makedirs("result", exist_ok=True)
img.save("result/4_1.png")

# 4-2. 2D記述子の作成 
所望の化合物に対し、RDKitによって2D記述子 (descriptor) をすべて計算し、それらを並べたベクトルを構成できるようにせよ。

In [13]:
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors

In [16]:
def df_2d_descriptor(mols, descriptors=None):
    """Compute 2D descriptors for each molecule and return them as a table.

    Parameters
    ----------
    mols : iterable
        Molecules to describe. ``None`` entries are not passed to the
        descriptor functions and show up as missing values in the result.
    descriptors : iterable of (name, callable), optional
        Descriptor names paired with the function that computes each one.
        Defaults to ``Descriptors.descList``.

    Returns
    -------
    pandas.DataFrame
        One row per molecule: a leading ``index`` column holding the row
        position, followed by one column per descriptor.
    """
    if descriptors is None:
        descriptors = Descriptors.descList

    seri_mols = pd.Series(mols, name='ROMol')
    # Collect every column first and build the frame once; inserting a couple
    # of hundred columns one by one fragments the DataFrame.
    columns = {
        name: seri_mols.map(func, na_action='ignore')
        for name, func in descriptors
    }
    return pd.DataFrame(columns).reset_index()


In [17]:
# Build the 2D-descriptor table for all parsed molecules.
df_2d = df_2d_descriptor(mols)

In [18]:
# Inspect the first rows of the 2D-descriptor table.
df_2d.head()

Unnamed: 0,index,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0,12.276753,0.261638,12.276753,0.261638,0.529981,318.481,286.225,318.242756,128,...,0,0,0,0,0,0,0,0,3,0
1,1,12.24937,0.216991,12.24937,0.216991,0.534284,328.476,298.236,328.227106,130,...,0,0,0,0,0,0,0,0,3,0
2,2,11.88549,0.182469,11.88549,0.182469,0.531211,266.361,242.169,266.17507,106,...,0,0,0,0,0,0,0,0,2,0
3,3,12.090771,0.216478,12.090771,0.216478,0.530771,292.443,262.203,292.227106,118,...,0,0,0,0,0,0,0,0,3,0
4,4,12.108951,0.139533,12.108951,0.139533,0.401309,328.929,301.713,328.14964,118,...,1,0,0,0,0,0,0,0,3,0


In [19]:
# Count the missing values in each descriptor column.
df_2d.isna().sum()

index                     0
MaxEStateIndex            0
MinEStateIndex            0
MaxAbsEStateIndex         0
MinAbsEStateIndex         0
qed                       0
MolWt                     0
HeavyAtomMolWt            0
ExactMolWt                0
NumValenceElectrons       0
NumRadicalElectrons       0
MaxPartialCharge          0
MinPartialCharge          0
MaxAbsPartialCharge       0
MinAbsPartialCharge       0
FpDensityMorgan1          0
FpDensityMorgan2          0
FpDensityMorgan3          0
BalabanJ                  0
BertzCT                   0
Chi0                      0
Chi0n                     0
Chi0v                     0
Chi1                      0
Chi1n                     0
Chi1v                     0
Chi2n                     0
Chi2v                     0
Chi3n                     0
Chi3v                     0
                         ..
fr_methoxy                0
fr_morpholine             0
fr_nitrile                0
fr_nitro                  0
fr_nitro_arom       

In [20]:
# Save the 2D descriptors to disk.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets an
# unnamed leading column in addition to the "index" column — confirm readers expect that.
df_2d.to_csv("data/2d_desc.csv")

# 4-3. 3D記述子の生成
所望の化合物に対し、RDKitによってETKDGv2法による配座生成（３次元構造生成）を行い、その構造を用いて3D記述子をすべて計算し、それらを並べたベクトルを構成できるようにせよ。

In [21]:
from rdkit import rdBase
# Record the RDKit version this notebook was run with.
print(rdBase.rdkitVersion)

2019.09.3


In [22]:
from rdkit.Chem import AllChem
from rdkit import Chem

def ETKDGv2(mols, random_seed=42):
    """Generate one 3D conformer per molecule with the ETKDGv2 method.

    Parameters
    ----------
    mols : iterable of rdkit.Chem.Mol
        Molecules to embed. They are not modified; hydrogens are added to a
        copy of each one.
    random_seed : int or None, default 42
        Seed written to the embedding parameters so repeated runs give the
        same coordinates. Pass ``None`` to leave RDKit's own seeding untouched.

    Returns
    -------
    list of rdkit.Chem.Mol
        Hydrogen-added molecules in input order. A molecule whose embedding
        failed is still included (without a conformer) so that positions stay
        aligned with the input; a warning reports those positions.
    """
    import warnings

    mols = list(mols)
    if not mols:
        return []

    # The parameter object does not depend on the molecule, so build it once.
    params = AllChem.ETKDGv2()
    if random_seed is not None:
        params.randomSeed = random_seed

    ETKDG_mols = []
    failed_positions = []
    for position, mol in enumerate(mols):
        mh = Chem.AddHs(mol)  # add explicit hydrogens before embedding
        # EmbedMolecule returns the new conformer id, or -1 when embedding fails.
        if AllChem.EmbedMolecule(mh, params) < 0:
            failed_positions.append(position)
        ETKDG_mols.append(mh)

    if failed_positions:
        warnings.warn(
            "ETKDGv2 embedding failed for molecules at positions: "
            + ", ".join(str(position) for position in failed_positions)
        )
    return ETKDG_mols

In [23]:
# Embed every molecule in 3D; ETKDGv2 only iterates over its argument,
# so the plain list can be passed directly.
TKDGv2_m = ETKDGv2(mols)

In [26]:
# Check what kind of object the embedding step returns for a molecule.
type(TKDGv2_m[1])

rdkit.Chem.rdchem.Mol

In [27]:
from rdkit.Chem import Descriptors3D
from inspect import getmembers, isfunction

def df_3d_descriptor(list_3d, descriptors=None):
    """Compute 3D descriptors for each embedded molecule and return a table.

    Parameters
    ----------
    list_3d : iterable
        Molecules that already carry a 3D conformer. Entries that are ``None``
        or have no conformer get NaN in every descriptor column instead of
        aborting the whole computation.
    descriptors : iterable of (name, callable), optional
        Descriptor names paired with the function that computes each one.
        Defaults to the public functions found in ``Descriptors3D``.

    Returns
    -------
    pandas.DataFrame
        One row per molecule: a leading ``index`` column holding the row
        position, followed by one column per descriptor.
    """
    if descriptors is None:
        # Discover the descriptor functions from the module, but leave out
        # private helpers and CalcMolDescriptors3D (present in newer RDKit
        # releases), which returns a dict of all descriptors rather than a
        # single value.
        descriptors = [
            (name, func)
            for name, func in getmembers(Descriptors3D, isfunction)
            if not name.startswith("_") and name != "CalcMolDescriptors3D"
        ]

    def _guarded(func):
        # Wrap a descriptor so molecules lacking 3D coordinates yield NaN.
        def compute(mol):
            if mol is None or mol.GetNumConformers() == 0:
                return float("nan")
            return func(mol)
        return compute

    seri_3d = pd.Series(list_3d, name="ChemMol")
    columns = {name: seri_3d.map(_guarded(func)) for name, func in descriptors}
    return pd.DataFrame(columns).reset_index()

In [28]:
# Build the 3D-descriptor table from the embedded molecules.
df_3d = df_3d_descriptor(TKDGv2_m)

In [29]:
# Inspect the first rows of the 3D-descriptor table.
df_3d.head()

Unnamed: 0,index,Asphericity,Eccentricity,InertialShapeFactor,NPR1,NPR2,PMI1,PMI2,PMI3,RadiusOfGyration,SpherocityIndex
0,0,0.758272,0.996046,0.001317,0.088837,0.96646,733.88518,7983.930845,8261.0056,5.162939,0.105468
1,1,0.71773,0.994468,0.001048,0.105044,0.925304,882.591099,7774.523465,8402.131539,5.095808,0.077055
2,2,0.72424,0.994755,0.00159,0.102284,0.915403,575.561485,5151.04265,5627.074672,4.616555,0.058911
3,3,0.499097,0.976872,0.001081,0.213823,0.968158,895.657871,4055.404977,4188.78318,3.953064,0.305399
4,4,0.535606,0.98179,0.00073,0.189969,0.894021,1224.895068,5764.540429,6447.883059,4.519498,0.135


In [30]:
# Save the 3D descriptors to disk.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets an
# unnamed leading column in addition to the "index" column — confirm readers expect that.
df_3d.to_csv("data/3d_desc.csv")

# 4-4. ECFP4 fingerprintの生成
所望の化合物に対し、RDKitのECFP4 fingerprint（GetMorganFingerprintAsBitVect(mol, 2, 2048)による）を計算できるようにせよ。

In [30]:
from rdkit.Chem import AllChem, Draw

def df_ecfp4_fingp(mols, radius=2, n_bits=2048):
    """Compute Morgan bit-vector fingerprints and return them as a 0/1 table.

    Parameters
    ----------
    mols : iterable of rdkit.Chem.Mol
        Molecules to fingerprint.
    radius : int, default 2
        Morgan radius; the default of 2 corresponds to ECFP4.
    n_bits : int, default 2048
        Length of the folded bit vector, i.e. the number of output columns.

    Returns
    -------
    pandas.DataFrame
        One row per molecule and ``n_bits`` integer columns labelled
        ``0 .. n_bits - 1``. Empty input yields a frame with zero rows but
        the same columns.
    """
    bit_rows = [
        [int(bit) for bit in AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)]
        for mol in mols
    ]
    # Fixing the column labels keeps the schema stable even when no molecule is given.
    return pd.DataFrame(bit_rows, columns=range(n_bits))

In [33]:
# Compute the fingerprint bit table for all molecules and preview it.
df_fing = df_ecfp4_fingp(mols)
df_fing.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [35]:
# Save the fingerprint bits to disk.
# NOTE(review): to_csv writes the DataFrame index by default, so the file starts with an
# unnamed row-number column ahead of the bit columns — confirm readers expect that.
df_fing.to_csv("data/fing_desc.csv")