In [80]:
import pandas as pd
import numpy as np
from indigo import *
indigo = Indigo()

# Session-wide Indigo options, applied in the order listed. The two
# "ignore-*" switches come first; the "fp-*" entries follow.
for option_name, option_value in (
    ("ignore-stereochemistry-errors", True),
    ("ignore-bad-valence", True),
    ("fp-sim-qwords", 8),
    ("fp-ord-qwords", 25),
    ("fp-any-qwords", 15),
    ("fp-tau-qwords", 25),
    ("fp-ext-enabled", True),
):
    indigo.setOption(option_name, option_value)

def get_finger(struct):
    """Return the Indigo ``'full'`` fingerprint of ``struct`` as a buffer.

    Parameters
    ----------
    struct : str
        Textual molecule representation handed to ``indigo.loadMolecule``.

    Returns
    -------
    The result of ``fingerprint('full').toBuffer()`` for the loaded molecule,
    or ``None`` when ``struct`` is not a string or when Indigo raises
    ``IndigoException`` while loading or fingerprinting it.
    """
    # Missing CSV cells arrive from pandas as NaN (a float), not as text.
    if not isinstance(struct, str):
        return None
    try:
        # Parsing lives inside the ``try`` too: an unreadable structure makes
        # loadMolecule raise, and one bad row must not abort a whole ``.map``.
        mol = indigo.loadMolecule(struct)
        finger = mol.fingerprint('full')
    except IndigoException:
        return None
    return finger.toBuffer()




# Load the raw table and fingerprint every structure. Rows for which
# get_finger produced no fingerprint (None) are dropped.
df = pd.read_csv('Adrenergic_dataset.csv')
df['finger'] = df['Structure'].map(get_finger)
df = df.loc[df['finger'].notna()]

# One subset per regression target: keep only rows where that target is present.
df_ph = df.loc[df['logP'].notna()]
df_ad_val = df.loc[df['AdrA1A_PCHEMBL_VALUE'].notna()]

# logP task: features are the fingerprint buffers, labels the logP column.
ph_vals = np.array(list(df_ph['finger']))
ph_labels = np.array(df_ph['logP'])

# AdrA1A task: same construction with the AdrA1A_PCHEMBL_VALUE column.
ad_vals = np.array(list(df_ad_val['finger']))
ad_labels = np.array(df_ad_val['AdrA1A_PCHEMBL_VALUE'])

# fp-ord-qwords 25

# fp-sim-qwords 8

# fp-any-qwords 15

# fp-tau-qwords 25

(841, 1)

In [68]:
# Inspect the fingerprints as rows of unsigned bytes (one row per molecule).
np.array([
    np.frombuffer(buffer, dtype=np.uint8)
    for buffer in df_ad_val['finger']
])

array([[ 15,   0,   0, ..., 144,  70,  77],
       [ 31,   0,   0, ...,  88, 115, 247],
       [  7,   3,   0, ..., 144,  19,  31],
       ...,
       [ 31,  12,   0, ..., 188,  87, 103],
       [ 63,   0,   0, ..., 156,  67, 111],
       [ 63,   0,   0, ..., 156,  67, 111]], dtype=uint8)

In [74]:
# Shape of the feature matrix built from the fingerprint column.
np.array(list(df_ad_val['finger'])).shape

(841, 587)

# scripts

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

def get_r2_score(vals, labels, regressor, test_size=0.25, random_state=42):
    """Fit ``regressor`` on a train split and return its R^2 on the held-out split.

    Parameters
    ----------
    vals : array-like
        Feature rows; forwarded to ``train_test_split`` unchanged.
    labels : array-like
        Target values aligned with ``vals``.
    regressor : object
        Anything exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is called
        on the instance passed in; no copy is made.
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to 0.25, the value that
        was previously hard-coded.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 42, the value
        that was previously hard-coded.

    Returns
    -------
    The value of ``r2_score(test_labels, predictions)``.
    """
    train_features, test_features, train_labels, test_labels = train_test_split(
        vals, labels, test_size=test_size, random_state=random_state
    )
    regressor.fit(train_features, train_labels)
    preds = regressor.predict(test_features)
    return r2_score(test_labels, preds)

# random forest

In [13]:
# Random-forest regressor with 1000 trees and a fixed seed.
rf = RandomForestRegressor(
    n_estimators=1000,
    random_state=42,
)

#### logP

In [17]:
# Held-out R^2 of the random forest on the logP target.
get_r2_score(vals=ph_vals, labels=ph_labels, regressor=rf)

0.7384663419773593

#### AdrA1A_PCHEMBL_VALUE

In [18]:
# Held-out R^2 of the random forest on the AdrA1A_PCHEMBL_VALUE target.
get_r2_score(vals=ad_vals, labels=ad_labels, regressor=rf)

0.48062209347425056

# support vectors

In [47]:
from sklearn.svm import SVR


# Support-vector regressor constructed with no arguments, i.e. with the
# library's default hyperparameters.
svr = SVR()

#### logP

In [54]:
# Held-out R^2 of the SVR on the logP target.
get_r2_score(vals=ph_vals, labels=ph_labels, regressor=svr)

0.7222366027933788

#### AdrA1A_PCHEMBL_VALUE

In [20]:
# Held-out R^2 of the SVR on the AdrA1A_PCHEMBL_VALUE target.
get_r2_score(vals=ad_vals, labels=ad_labels, regressor=svr)

0.4464478981864477