In [6]:
import pandas as pd
from rdkit import Chem

# Location of the input table, relative to the directory the notebook runs from.
DATAPATH = "../data/osm_clf.csv"

# Load the whole CSV into a DataFrame; later cells read its "smiles" and
# "bin_activity" columns.
df = pd.read_csv(DATAPATH)

In [15]:
import numpy as np
from rdkit.Chem import AllChem

# Notebook-level fingerprint settings. Ecfp.__init__ reads these globals when
# an instance is created, and Ecfp.calc forwards them to
# AllChem.GetMorganFingerprint.
radius = 3  # passed as the positional radius argument
useCounts = True  # forwarded as the useCounts= keyword
useFeatures = True  # forwarded as the useFeatures= keyword


class Ecfp(object):
    """Morgan fingerprints folded into fixed-width integer count vectors.

    Attributes (all set in ``__init__``):
        name: Always ``"ecfp"``.
        radius: Forwarded to ``AllChem.GetMorganFingerprint`` as the radius.
        useCounts: Forwarded as the ``useCounts`` keyword.
        useFeatures: Forwarded as the ``useFeatures`` keyword.
        size: Number of columns in the matrix returned by ``calc``.
    """

    def __init__(self, radius=None, useCounts=None, useFeatures=None, size=2048):
        """Configure the fingerprinter.

        Args:
            radius: Fingerprint radius. ``None`` (the default) uses the
                module-level ``radius`` variable, so ``Ecfp()`` behaves as
                it always did.
            useCounts: ``None`` uses the module-level ``useCounts`` variable.
            useFeatures: ``None`` uses the module-level ``useFeatures``
                variable.
            size: Width of the folded vector; must be positive.

        Raises:
            ValueError: If ``size`` is not positive.
            NameError: If a setting is left as ``None`` and the matching
                module-level variable does not exist.
        """
        if size <= 0:
            raise ValueError("size must be a positive integer, got {!r}".format(size))
        self.name = "ecfp"
        self.radius = self._notebook_setting("radius") if radius is None else radius
        self.useCounts = (
            self._notebook_setting("useCounts") if useCounts is None else useCounts
        )
        self.useFeatures = (
            self._notebook_setting("useFeatures") if useFeatures is None else useFeatures
        )
        self.size = size

    @staticmethod
    def _notebook_setting(name):
        """Return the module-level variable called ``name``."""
        # The constructor's parameters shadow the globals of the same name,
        # so the fallback values have to be looked up through globals().
        try:
            return globals()[name]
        except KeyError:
            raise NameError("name {!r} is not defined".format(name))

    def calc(self, mols):
        """Compute one folded fingerprint row per molecule.

        Args:
            mols: Iterable of molecule objects accepted by
                ``AllChem.GetMorganFingerprint``.

        Returns:
            ``numpy.ndarray`` of dtype ``int32`` with shape
            ``(number of molecules, self.size)``. Row ``i`` belongs to the
            ``i``-th molecule.
        """
        mols = list(mols)  # the row count is needed before the matrix is filled
        nfp = np.zeros((len(mols), self.size), dtype=np.int32)
        for row, mol in enumerate(mols):
            fp = AllChem.GetMorganFingerprint(
                mol, self.radius, useCounts=self.useCounts, useFeatures=self.useFeatures
            )
            for idx, count in fp.GetNonzeroElements().items():
                # Fold the sparse index into the fixed width; indices that
                # collide after folding have their counts summed.
                nfp[row, idx % self.size] += int(count)
        return nfp

In [18]:
def get_fingerprints(smiles):
    """Convert SMILES strings into a fingerprint matrix.

    Each string is parsed with ``Chem.MolFromSmiles`` and the parsed
    molecules are handed to ``Ecfp().calc``, whose result is returned
    unchanged.

    Args:
        smiles: Iterable of SMILES strings.

    Returns:
        The value returned by ``Ecfp().calc`` for the parsed molecules, in
        the same order as the input.

    Raises:
        ValueError: If any string cannot be parsed. The message lists the
            position and text of the offending entries, so the row can be
            fixed in the source data rather than silently dropped, which
            would misalign the features with their labels.
    """
    smiles = list(smiles)
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]

    # MolFromSmiles reports a parse failure by returning None. Passing that
    # on to the fingerprinter fails deep inside RDKit with an error that does
    # not say which input was bad, so it is checked here instead.
    invalid = [(pos, smi) for pos, (smi, mol) in enumerate(zip(smiles, mols)) if mol is None]
    if invalid:
        preview = ", ".join("#{}: {!r}".format(pos, smi) for pos, smi in invalid[:5])
        raise ValueError(
            "{} SMILES string(s) could not be parsed (showing up to 5): {}".format(
                len(invalid), preview
            )
        )

    fingerprinter = Ecfp()
    return fingerprinter.calc(mols)

# Build the model inputs: X holds the fingerprints computed from the "smiles"
# column and y holds the values of the "bin_activity" column, taken from the
# same DataFrame so both follow the same row order.
smiles = df["smiles"].tolist()
X = get_fingerprints(smiles)

y = np.array(df["bin_activity"])

In [23]:
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and feature subsets at random, so
# the seed is pinned to make a fresh "Restart & Run All" rebuild the same
# model (and the same saved file and predictions) every time.
RANDOM_STATE = 42

mdl = RandomForestClassifier(random_state=RANDOM_STATE)
mdl.fit(X, y)

RandomForestClassifier()

In [30]:
import joblib

# Persist the fitted classifier with joblib. The path is relative to the
# directory the notebook runs from.
# NOTE(review): nothing in this notebook creates ../model; presumably it
# already exists. Confirm before running on a fresh checkout.
joblib.dump(mdl, "../model/my_first_model.pkl")

['../model/my_first_model.pkl']

In [28]:
# Class probabilities for the same X the model was fitted on. These are
# training-set predictions, not an estimate of performance on unseen data.
mdl.predict_proba(X)

array([[0.38      , 0.62      ],
       [0.82      , 0.18      ],
       [0.2       , 0.8       ],
       [0.0925    , 0.9075    ],
       [0.26      , 0.74      ],
       [0.17      , 0.83      ],
       [0.96      , 0.04      ],
       [0.18      , 0.82      ],
       [0.63      , 0.37      ],
       [0.03      , 0.97      ],
       [0.122     , 0.878     ],
       [0.03      , 0.97      ],
       [0.08      , 0.92      ],
       [0.2       , 0.8       ],
       [0.12166667, 0.87833333],
       [0.78      , 0.22      ],
       [0.23      , 0.77      ],
       [0.23      , 0.77      ],
       [0.21      , 0.79      ],
       [0.96      , 0.04      ],
       [0.79      , 0.21      ],
       [0.86      , 0.14      ],
       [0.25      , 0.75      ],
       [0.85      , 0.15      ],
       [0.18      , 0.82      ],
       [0.84      , 0.16      ],
       [0.87      , 0.13      ],
       [0.19      , 0.81      ],
       [0.75      , 0.25      ],
       [0.20969444, 0.79030556],
       [0.

In [31]:
# Echo the SMILES list that was passed to get_fingerprints.
smiles

['FC(F)Oc1ccc(-c2nnc3cncc(CN4Cc5ccccc5C4)n23)cc1',
 'Cc1c(Cl)cccc1NC(=O)c1cncc2nnc(-c3ccc(OC(F)F)cc3)n12',
 'O=C(Nc1cccc(Cl)c1)c1cncc2nnc(-c3ccc(OC(F)F)cc3)n12',
 'N#Cc1ccc(-c2nnc3cncc(OCCc4ccccc4Cl)n23)cc1',
 'Fc1ccc(CCOc2cncc3nnc(-c4ccc(C(F)(F)F)nc4)n23)cc1F',
 'O=C(Nc1cc(C(F)(F)F)cc(C(F)(F)F)c1)c1cncc2nnc(-c3ccc(OC(F)F)cc3)n12',
 'O=C(Nc1ccc(Cl)cc1)c1c[nH]c(=O)c2nnc(-c3ccc(OC(F)F)cc3)n12',
 'O=C(Nc1cc[n+]([O-])c(C(F)(F)F)c1)c1cncc2nnc(-c3ccc(OC(F)F)cc3)n12',
 'OCC(COc1cncc2nnc(-c3ccc(OC(F)F)cc3)n12)c1ccc(O)cc1',
 'IC(C=C1)=CC=C1C2=NN=C3C=NC=C(OCCC4=CC(F)=C(F)C=C4)N32',
 'O=C(COc1cncc2nnc(-c3ccc(OC(F)F)cc3)n12)c1ccc(F)c(F)c1',
 'COC(COc1cncc2nnc(-c3ccc(OC(F)F)cc3)n12)(OC)c1ccc(F)c(F)c1',
 'FS(F)(F)(F)(F)c1ccc(-c2nnc3cncc(OCCc4ccccc4)n23)cc1',
 'FC(F)Oc1ccc(-c2nnc3cncc(OCCC45CC6CC(CC(C6)C4)C5)n23)cc1',
 'CC(Cc1ccccc1)Oc1cncc2nnc(-c3ccc(OC(F)F)cc3)n12',
 'FC(F)Oc1ccc(-c2nnc3cncc(OCC45CC6CC(CC(C6)C4)C5)n23)cc1',
 'FC(F)Oc1ccc(-c2nnc3cncc(OCC4CC5C=CC4C5)n23)cc1',
 'CC1(C)C2CC=C(CCOc3cncc