In [10]:
import csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import numpy as np
import matplotlib.pyplot as plt

def read_data(filename, col_smiles=0, col_target=1, start_row=1, delimiter=' ', quotechar='\''):
    smiles = list()
    targets = list()
    with open(filename) as file:
        reader = csv.reader(file, delimiter=delimiter, quotechar=quotechar)
        for _ in range(start_row): #testiraj
            next(reader)
        for row in reader:
            if len(row) > 2:
                row = np.array(row)
                smiles.append(row[col_smiles])
                targets.append(row[col_target])
    return np.array(smiles), np.array(targets)

In [19]:
# read data
smiles,y = read_data('data/BBBP.csv', col_smiles=3,col_target=2, delimiter=',')
# transform SMILES data to Mol
moles = np.array([AllChem.MolFromSmiles(x) for x in smiles])
# remove invalid ones
valid = [i for i,x in enumerate(moles) if x is not None]
y = np.array(y[valid], dtype=np.float)
moles = moles[valid]

fingerprints = [AllChem.GetMorganFingerprintAsBitVect(x, 4) for x in moles]

X = list()
for x in fingerprints:
    arr = np.zeros((0,))
    DataStructs.ConvertToNumpyArray(x, arr)
    X.append(arr)
X = np.array(X)

In [20]:
clf = RandomForestClassifier(n_estimators=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
roc_auc_score(y_test, y_pred)

0.779031370390754