# Imports

In [1]:
from sklearn.neural_network import MLPRegressor
import sklearn.metrics as metrics
import matplotlib.pyplot as plt  
import numpy as np
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
import pandas as pd
import sklearn
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
import Functions
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from hyperopt import hp, tpe, fmin
import itertools
import sys
np.set_printoptions(threshold=sys.maxsize)
from hyperopt.pyll import scope
from hyperopt import Trials

# Preprocessing 

In [2]:
x=pd.read_csv("updatedPK no features.csv")
del x ['Cmpd Name']
x = x.dropna(axis = 0, how='any')
x = x.sort_values(['AUC'], ascending = False)
x.reset_index(drop = True, inplace = True)
smiles = x['mol']
y = x['AUC']
del x['mol']

# Fingerprints

In [3]:
x1 = Functions.fp(smiles, 'mol', x, y, 1024)
x1 = x1.drop(x1.columns[0], axis=1)
del x1['mol']
x1 = pd.concat([x, x1], axis = 1)

# No Var - Corr Applied 

## Normalization

In [4]:
y = x1['AUC']
y = pd.DataFrame(y)
del x1['AUC']
scaler = preprocessing.StandardScaler().fit(y)
Y = scaler.transform(y) 
y2 = pd.DataFrame(Y) 
y2.columns = y.columns
x2 = pd.concat([y2, x1], axis=1)

## Train-test split

In [5]:
x1_train1, x1_test1 = train_test_split(x2, test_size = 0.40, random_state = 1)
x1_cv1, x1_et1 = train_test_split(x1_test1, test_size = 0.5, random_state = 1)

y_train1 = pd.DataFrame(x1_train1['AUC'])
del x1_train1['AUC']

y_cv1 = pd.DataFrame(x1_cv1['AUC'])
del x1_cv1['AUC']

y_et1 = pd.DataFrame(x1_et1['AUC'])
del x1_et1['AUC']

# 1 Hidden Layer

## Hyperopt 

In [6]:
space1 = {
    'activation': hp.choice('activation', ['relu']),
    'solver': hp.choice('solver', ['sgd', 'adam']),
    'alpha': hp.loguniform('alpha', np.log(1e-10), np.log(1e-1)),
    'max_iter': hp.quniform('max_iter', 1000, 100000, 50),
    'validation_fraction': hp.uniform('validation_fraction', 0.0, 0.2),
    'learning_rate': hp.choice('learning_rate', ['constant', 'adaptive']),
    'learning_rate_init': hp.loguniform('learning_rate_init', np.log(1e-10), np.log(1e-1)),
    'hn1': hp.quniform('hn1', 1, 1024, 1)}

In [7]:
def mlp1(params1):
    hn1 = int(params1['hn1'])
    clf = MLPRegressor(hidden_layer_sizes=(hn1), activation=params1['activation'], solver=params1['solver'], alpha=params1['alpha'], max_iter=int(params1['max_iter']), validation_fraction=params1['validation_fraction'], learning_rate=params1['learning_rate'], learning_rate_init=params1['learning_rate_init'], random_state=1)
    clf.fit(x1_train1, np.ravel(y_train1))
    clf.predict(x1_cv1)
    y_pred =clf.predict(x1_cv1)
    score = mean_squared_error(y_cv1,y_pred)
    return score

In [16]:
tpe_algorithm = tpe.suggest
num_eval = 2000
trials = Trials()
best_param = fmin(mlp1, space1, algo=tpe.suggest, trials=trials, max_evals=num_eval, rstate=np.random.RandomState(1))
print(best_param) 

 11%|█         | 218/2000 [37:06<13:12:05, 26.67s/trial, best loss: 0.5661131703858759]




100%|██████████| 2000/2000 [9:51:31<00:00, 17.75s/trial, best loss: 0.4968109234643623]  
{'activation': 0, 'alpha': 5.573517608334085e-05, 'hn1': 886.0, 'learning_rate': 0, 'learning_rate_init': 6.344967065318529e-07, 'max_iter': 73450.0, 'solver': 1, 'validation_fraction': 0.0509108555704189}


## Metrics 

In [8]:
clf = MLPRegressor(activation = 'relu', alpha = 5.573517608334085e-05, hidden_layer_sizes=(1), learning_rate = 'constant', learning_rate_init = 6.344967065318529e-07, max_iter = 73450, solver = 'adam', validation_fraction = 0.0509108555704189, random_state =1)
clf.fit(x1_train1, np.ravel(y_train1))
Functions.regression_metrics(clf, x1_cv1, y_cv1, x1_et1, y_et1)

Unnamed: 0,$r^{2}$,RMSE
Cross Validation,-1.102734,1.217387
Test,-1.309124,1.054667


# x Hidden Layers

## Hyperopt 

In [9]:
space2 = {
    'activation': hp.choice('activation', ['relu', 'tanh']),
    'solver': hp.choice('solver', ['sgd', 'adam']),
    'alpha': hp.loguniform('alpha', np.log(1e-10), np.log(1e-1)),
    'max_iter': hp.quniform('max_iter', 1000, 100000, 50),
    'validation_fraction': hp.uniform('validation_fraction', 0.0, 0.2),
    'learning_rate': hp.choice('learning_rate', ['constant', 'adaptive']),
    'learning_rate_init': hp.loguniform('learning_rate_init', np.log(1e-10), np.log(1e-1)),
    'hidden_layers': hp.quniform('hidden_layers', 1, 3, 1),
    'hn1': hp.quniform('hn1', 1, 1024, 1),
    'hn2': hp.quniform('hn2', 1, 1024, 1),
    'hn3': hp.quniform('hn3', 1, 1024, 1)}

In [10]:
def mlp2(params):
    hn1 = int(params['hn1'])
    hn2 = int(params['hn2'])
    hn3 = int(params['hn3'])
    hn_total = hn1, hn2, hn3
    clf = MLPRegressor(hidden_layer_sizes=(hn_total[0:int(params['hidden_layers'])]), activation=params['activation'], solver=params['solver'], alpha=params['alpha'], max_iter=int(params['max_iter']), validation_fraction=params['validation_fraction'], learning_rate=params['learning_rate'], learning_rate_init=params['learning_rate_init'], random_state=1)
    clf.fit(x1_train1, np.ravel(y_train1))
    clf.predict(x1_cv1)
    y_pred =clf.predict(x1_cv1)
    score = mean_squared_error(y_cv1,y_pred)
    return score

In [21]:
tpe_algorithm = tpe.suggest
num_eval = 2000
trials = Trials()
best_param = fmin(mlp2, space2, algo=tpe.suggest, trials=trials, max_evals=num_eval, rstate=np.random.RandomState(1))
print(best_param) 

  3%|▎         | 54/2000 [04:26<5:11:42,  9.61s/trial, best loss: 0.5937395961071829]




 11%|█         | 215/2000 [28:53<3:19:38,  6.71s/trial, best loss: 0.5638797530862132] 




 12%|█▏        | 235/2000 [32:17<5:56:58, 12.14s/trial, best loss: 0.5638797530862132]




 12%|█▏        | 246/2000 [34:00<6:13:52, 12.79s/trial, best loss: 0.5638797530862132]




 15%|█▌        | 300/2000 [48:16<6:39:53, 14.11s/trial, best loss: 0.5638797530862132] 




 16%|█▋        | 328/2000 [52:34<3:47:29,  8.16s/trial, best loss: 0.5638797530862132]




 37%|███▋      | 738/2000 [3:27:11<8:51:38, 25.28s/trial, best loss: 0.4354476932526391] 




100%|██████████| 2000/2000 [5:56:03<00:00, 10.68s/trial, best loss: 0.4016165953416459]    
{'activation': 1, 'alpha': 3.342153230033389e-05, 'hidden_layers': 3.0, 'hn1': 245.0, 'hn2': 726.0, 'hn3': 2.0, 'learning_rate': 0, 'learning_rate_init': 7.767246822433844e-05, 'max_iter': 70450.0, 'solver': 1, 'validation_fraction': 0.130360883269387}


## Metrics 

In [13]:
clf2 = MLPRegressor(activation = 'tanh', alpha = 3.342153230033389e-05, hidden_layer_sizes=(3), learning_rate='constant', learning_rate_init = 7.767246822433844e-05, max_iter = 70450, solver = 'adam', validation_fraction = 0.130360883269387, random_state = 1)
clf2.fit(x1_train1, np.ravel(y_train1))
Functions.regression_metrics(clf2, x1_cv1, y_cv1, x1_et1, y_et1)

Unnamed: 0,$r^{2}$,RMSE
Cross Validation,0.190823,0.755194
Test,0.490249,0.495531


# Var-Thresh + Corr Matrix 

In [14]:
x1_var = Functions.variance_threshold_selector(x1, threshold = 0.2)
cor_matrix = x1_var.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] >= 0.9)]
print(); print(to_drop)


[32, 171, 175, 192, 255, 656, 715, 753, 838, 893, 939]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(np.bool))


In [15]:
x1_var=x1_var.drop(columns=[32, 171, 175, 192, 255, 656, 715, 753, 838, 893, 939])
x2_var = pd.concat([y2, x1_var], axis=1)

## Train-Test Split

In [16]:
x1_train2, x1_test2 = train_test_split(x2_var, test_size = 0.40, random_state = 1)
x1_cv2, x1_et2 = train_test_split(x1_test2, test_size = 0.5, random_state = 1)
y_train2 = pd.DataFrame(x1_train2['AUC'])
del x1_train2['AUC']

y_cv2 = pd.DataFrame(x1_cv2['AUC'])
del x1_cv2['AUC']

y_et2 = pd.DataFrame(x1_et2['AUC'])
del x1_et2['AUC']

# 1 Hidden Layer

## Hyperopt 

In [17]:
space3 = {
    'activation': hp.choice('activation', ['relu']),
    'solver': hp.choice('solver', ['sgd', 'adam']),
    'alpha': hp.loguniform('alpha', np.log(1e-10), np.log(1e-1)),
    'max_iter': hp.quniform('max_iter', 1000, 100000, 50),
    'validation_fraction': hp.uniform('validation_fraction', 0.0, 0.2),
    'learning_rate': hp.choice('learning_rate', ['constant', 'adaptive']),
    'learning_rate_init': hp.loguniform('learning_rate_init', np.log(1e-10), np.log(1e-1)),
    'hn1': hp.quniform('hn1', 1, 1024, 1)}

In [28]:
def mlp3(params1):
    hn1 = int(params1['hn1'])
    clf = MLPRegressor(hidden_layer_sizes=(hn1), activation=params1['activation'], solver=params1['solver'], alpha=params1['alpha'], max_iter=int(params1['max_iter']), validation_fraction=params1['validation_fraction'], learning_rate=params1['learning_rate'], learning_rate_init=params1['learning_rate_init'], random_state=1)
    clf.fit(x1_train2, np.ravel(y_train2))
    clf.predict(x1_cv2)
    y_pred =clf.predict(x1_cv2)
    score = mean_squared_error(y_cv2,y_pred)
    return score

In [29]:
tpe_algorithm = tpe.suggest
num_eval = 2000
trials = Trials()
best_param = fmin(mlp3, space3, algo=tpe.suggest, trials=trials, max_evals=num_eval, rstate=np.random.RandomState(1))
print(best_param) 

100%|██████████| 2000/2000 [14:32<00:00,  2.29trial/s, best loss: 0.32652975504598186]
{'activation': 0, 'alpha': 1.4105362699669513e-05, 'hn1': 638.0, 'learning_rate': 1, 'learning_rate_init': 0.0948175168279281, 'max_iter': 88250.0, 'solver': 1, 'validation_fraction': 0.10503238822854696}


## Metrics 

In [18]:
clf3 = MLPRegressor(activation='relu', alpha = 1.4105362699669513e-05, hidden_layer_sizes=(1), learning_rate = 'adaptive', learning_rate_init = 0.0948175168279281, max_iter = 88250, solver = 'adam', validation_fraction = 0.10503238822854696, random_state = 1)
clf3.fit(x1_train2, np.ravel(y_train2))
Functions.regression_metrics(clf3, x1_cv2, y_cv2, x1_et2, y_et2)

Unnamed: 0,$r^{2}$,RMSE
Cross Validation,-0.810758,1.12971
Test,0.181409,0.62795


# x Hidden Layers

## Hyperopt

In [19]:
space4 = {
    'activation': hp.choice('activation', ['relu', 'tanh']),
    'solver': hp.choice('solver', ['sgd', 'adam']),
    'alpha': hp.loguniform('alpha', np.log(1e-10), np.log(1e-1)),
    'max_iter': hp.quniform('max_iter', 1000, 100000, 50),
    'validation_fraction': hp.uniform('validation_fraction', 0.0, 0.2),
    'learning_rate': hp.choice('learning_rate', ['constant', 'adaptive']),
    'learning_rate_init': hp.loguniform('learning_rate_init', np.log(1e-10), np.log(1e-1)),
    'hidden_layers': hp.quniform('hidden_layers', 1, 3, 1),
    'hn1': hp.quniform('hn1', 1, 24, 1),
    'hn2': hp.quniform('hn2', 1, 24, 1),
    'hn3': hp.quniform('hn3', 1, 24, 1)}
    

In [20]:
def mlp4(params):
    hn1 = int(params['hn1'])
    hn2 = int(params['hn2'])
    hn3 = int(params['hn3'])
    hn_total = hn1, hn2, hn3
    clf = MLPRegressor(hidden_layer_sizes=(hn_total[0:int(params['hidden_layers'])]), activation=params['activation'], solver=params['solver'], alpha=params['alpha'], max_iter=int(params['max_iter']), validation_fraction=params['validation_fraction'], learning_rate=params['learning_rate'], learning_rate_init=params['learning_rate_init'], random_state=1)
    clf.fit(x1_train2, np.ravel(y_train2))
    clf.predict(x1_cv2)
    y_pred =clf.predict(x1_cv2)
    score = mean_squared_error(y_cv2,y_pred)
    return score

In [32]:
tpe_algorithm = tpe.suggest
num_eval = 2000
trials = Trials()
best_param = fmin(mlp4, space4, algo=tpe.suggest, trials=trials, max_evals=num_eval, rstate=np.random.RandomState(1))
print(best_param) 

  4%|▍         | 79/2000 [00:15<08:25,  3.80trial/s, best loss: 0.6104215026892066]




100%|██████████| 2000/2000 [12:00<00:00,  2.78trial/s, best loss: 0.30432236108514477]
{'activation': 0, 'alpha': 0.0005348231787252624, 'hidden_layers': 2.0, 'hn1': 17.0, 'hn2': 13.0, 'hn3': 16.0, 'learning_rate': 0, 'learning_rate_init': 0.03334278685932487, 'max_iter': 33650.0, 'solver': 1, 'validation_fraction': 0.07249783942314608}


## Results 

In [21]:
clf4 = MLPRegressor(activation='relu', alpha = 0.0005348231787252624, hidden_layer_sizes=(2), learning_rate = 'constant', learning_rate_init = 0.03334278685932487, max_iter = 33650, solver = 'adam', validation_fraction = 0.07249783942314608, random_state = 1)
clf4.fit(x1_train2, np.ravel(y_train2))
Functions.regression_metrics(clf4, x1_cv2, y_cv2, x1_et2, y_et2)

Unnamed: 0,$r^{2}$,RMSE
Cross Validation,-0.871911,1.148628
Test,0.031205,0.683136
