In [None]:
import deepchem as dc
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
import os
from getpass import getpass

import pyprithvi

# Point the client at the Prithvi backend and call its health check before
# attempting to authenticate.
pyprithvi.set_backend_url('https://cb.deepforestsci.com/')
pyprithvi.healthcheck()

# Log in to Prithvi. Credentials come from the PRITHVI_USERNAME /
# PRITHVI_PASSWORD environment variables when they are set, otherwise they are
# prompted for, so they never have to be typed into (and saved with) the
# notebook source.
prithvi_username = os.environ.get('PRITHVI_USERNAME') or input('Prithvi username: ')
prithvi_password = os.environ.get('PRITHVI_PASSWORD') or getpass('Prithvi password: ')
pyprithvi.login(username=prithvi_username, password=prithvi_password)
del prithvi_password  # do not keep the secret around in the kernel namespace

# Session context. The download cell reads its files from
# chiron://LLNL/bench_10M_mol_token_e5/, which presumably corresponds to the
# profile and project chosen here -- keep the two in sync.
pyprithvi.set_session_profile("LLNL")
pyprithvi.set_session_project('bench_10M_mol_token_e5')
pyprithvi.set_billing_organization("LLNL")

In [13]:
# Benchmark to fine-tune and evaluate on. The later cells build their dataset
# addresses, model keys and file names from this value; change the assignment
# below to switch benchmarks.
SUPPORTED_DATASETS = ('delaney', 'regressBACE', 'clearance', 'lipo')
dataset = 'delaney'

# Fail fast on a typo instead of submitting remote jobs for a misspelled name.
if dataset not in SUPPORTED_DATASETS:
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of {SUPPORTED_DATASETS}")

In [4]:
# Submit one fine-tuning job per (learning_rate, batch_size, nb_epoch)
# combination. Each grid currently holds a single value, so exactly one job is
# submitted.
for learning_rate in [1e-05]:
    for batch_size in [15]:
        for nb_epoch in [50]:
            # Key the fine-tuned model is stored under; the inference cell
            # rebuilds the same string to address the model.
            model_key = f'finetuned-chemberta-zinc50m-{dataset}-batch{batch_size}-epoch{nb_epoch}-lr{learning_rate}'

            # Collect the request arguments first so the call itself stays short.
            train_request = dict(
                dataset_address=f'chiron://model_library/featurized_moleculenet_datasets/feat_{dataset}_finetune_train',
                model_type='chemberta',
                model_key=model_key,
                init_kwargs={'learning_rate': learning_rate, 'batch_size': batch_size},
                train_kwargs={'nb_epoch': nb_epoch},
                pretrained_model_address="chiron://model_library/chemberta_pretrained_models/chemberta_50m_mlm",
                task='regression',
                job_config_id='cjc-13',
            )
            response = pyprithvi.train(**train_request)

            # The response carries the job id and the operation cost.
            print(response)

{'job_id': '3d381306-1a4d-448b-b617-5a2fda7bd252', 'operation_cost': 0.0416724}


In [5]:
# Status of the most recently submitted job. `response` is whatever the last
# train/infer cell stored, so this cell stays valid after a fresh run instead
# of pointing at a job id pasted from an earlier session. Re-run it until the
# job reports 'SUCCEEDED' before relying on that job's outputs.
pyprithvi.get_job_status(response['job_id'])

'SUCCEEDED'

In [7]:
# Run inference with the fine-tuned model on every split.
#
# The BACE dataset keeps its SMILES strings in a 'mol' column; everything else
# keeps using 'smiles' as before. The column is derived from `dataset` so it
# no longer has to be edited by hand.
# NOTE(review): assumes BACE is selected as dataset = 'regressBACE' -- confirm.
smiles_column_by_dataset = {'regressBACE': 'mol'}
dataset_column = smiles_column_by_dataset.get(dataset, 'smiles')

# The loop variable is called `split` (not `data`) so that re-running this
# cell cannot overwrite the `data` DataFrame loaded further down.
for split in ["train", "test", "valid"]:
    for learning_rate in [1e-05]:
        for batch_size in [15]:
            for nb_epoch in [50]:
                # Suffix shared by the model key and the output key.
                run_tag = f'{dataset}-batch{batch_size}-epoch{nb_epoch}-lr{learning_rate}'
                response = pyprithvi.infer(
                    model_address=f'finetuned-chemberta-zinc50m-{run_tag}',
                    dataset_address=f"chiron://model_library/featurized_moleculenet_datasets/feat_{dataset}_finetune_{split}",
                    output_key=f"infer_{split}-chemberta-zinc50m-{run_tag}",
                    dataset_column=dataset_column,
                    job_config_id='cjc-13')
                print(response)

{'job_id': 'b27941d3-640f-4e0b-b2db-48281dacfa8d', 'operation_cost': 0.10418099999999998}
{'job_id': '629123f1-914a-4728-bf3c-0b2e5a0554c6', 'operation_cost': 0.0130515}
{'job_id': 'f59ac20e-fe15-43e8-9a81-b4867b35371a', 'operation_cost': 0.0130515}


In [12]:
# change the address to download the inference files
import os

# exist_ok=True creates the folder when missing and is a no-op when it is
# already there, replacing the separate exists() check.
os.makedirs('infer_data', exist_ok=True)

# Fetch the prediction CSV of every split into ./infer_data/.
# The loop variable is called `split` (not `data`) so that re-running this
# cell cannot overwrite the `data` DataFrame loaded further down.
for split in ["train", "test", "valid"]:
    for learning_rate in [1e-05]:
        for batch_size in [15]:
            for nb_epoch in [50]:
                # The remote file and the local copy share the same file name.
                file_name = f"infer_{split}-chemberta-zinc50m-{dataset}-batch{batch_size}-epoch{nb_epoch}-lr{learning_rate}.csv"
                pyprithvi.download_file(
                    address=f"chiron://LLNL/bench_10M_mol_token_e5/{file_name}",
                    filename=f"./infer_data/{file_name}")

INFO:pyprithvi.data:File download successful!
INFO:pyprithvi.data:File download successful!
INFO:pyprithvi.data:File download successful!


In [17]:
import pandas as pd
# Raw (un-featurized) CSV for the selected benchmark; the next cell takes the
# label mean and standard deviation from it.
# NOTE(review): other cells use `data` as the split-name loop variable, so
# re-running one of them rebinds `data` to a string -- reload here afterwards.
raw_csv_path = f"./moleculenet_feat_data/molnet_datasets/{dataset}.csv"
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles
0,Amigdalin,-0.974,1,457.432,7,3,7,202.32,-0.77,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...
1,Fenfuram,-2.885,1,201.225,1,2,2,42.24,-3.3,Cc1occc1C(=O)Nc2ccccc2
2,citral,-2.579,1,152.237,0,0,4,17.07,-2.06,CC(C)=CCCC(C)=CC(=O)
3,Picene,-6.618,2,278.354,0,5,0,0.0,-7.87,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
4,Thiophene,-2.232,2,84.143,0,1,0,0.0,-1.33,c1ccsc1


In [18]:
# Label column of each raw CSV, keyed by the `dataset` name, so the column no
# longer has to be edited by hand when switching benchmarks.
# NOTE(review): assumes BACE is selected as dataset = 'regressBACE' -- confirm.
target_column_by_dataset = {
    'regressBACE': 'pIC50',
    'clearance': 'target',
    'delaney': 'measured log solubility in mols per litre',
    'lipo': 'exp',
}
if dataset not in target_column_by_dataset:
    raise KeyError(
        f"No label column known for dataset {dataset!r}; "
        f"add it to target_column_by_dataset")
target_column = target_column_by_dataset[dataset]

# Mean and standard deviation of the raw labels; the evaluation cell uses them
# to rescale labels and predictions.
mean_value = data[target_column].mean()
print(mean_value)

# NOTE(review): pandas .std() is the sample standard deviation (ddof=1), taken
# here over the whole CSV -- confirm this matches the statistics that were
# used when the featurized datasets were normalized.
std_deviation = data[target_column].std()
print(std_deviation)

-3.05010195035461
2.096441210089345


In [21]:

# Open the featurized train / valid / test splits from disk, in that order.
feat_root = "./moleculenet_feat_data"
dataset_train, dataset_valid, dataset_test = (
    dc.data.DiskDataset(f"{feat_root}/feat_{dataset}_finetune_{split_name}")
    for split_name in ("train", "valid", "test")
)

INFO:deepchem.data.datasets:Loading dataset from disk.
INFO:deepchem.data.datasets:Loading dataset from disk.
INFO:deepchem.data.datasets:Loading dataset from disk.


In [22]:
# Number of samples in the train, valid and test splits, one per line.
for split_dataset in (dataset_train, dataset_valid, dataset_test):
    print(len(split_dataset))

902
113
113


In [23]:
# Ground-truth labels of each split, keyed by split name so the evaluation
# loop can look them up by the same name used in the prediction file names.
true_data_dict = {
    "train": dataset_train.y,
    "valid": dataset_valid.y,
    "test": dataset_test.y,
}

# Per-split aliases, kept under their own names as well.
y_true_train = true_data_dict["train"]
y_true_valid = true_data_dict["valid"]
y_true_test = true_data_dict["test"]

In [24]:
from sklearn.metrics import mean_squared_error
import numpy as np
# Score every split: read the downloaded predictions, rescale labels and
# predictions with std_deviation / mean_value, and print the RMSE.
# The loop variable is called `split` (not `data`) so this cell no longer
# overwrites the `data` DataFrame that the statistics cell reads.
for split in ["train", "test", "valid"]:
    for learning_rate in [1e-05]:
        for batch_size in [15]:
            for nb_epoch in [50]:
                infer_path = f"./infer_data/infer_{split}-chemberta-zinc50m-{dataset}-batch{batch_size}-epoch{nb_epoch}-lr{learning_rate}.csv"
                try:
                    infer_data = pd.read_csv(infer_path)
                except FileNotFoundError:
                    # Only a missing prediction file is skipped (and reported);
                    # any other error is a real problem and is allowed to surface.
                    print(f"Skipping {split}: {infer_path} not found")
                    continue
                y_preds = infer_data['y_preds']
                y_true = true_data_dict[split]
                denormalized_y_true = (y_true * std_deviation) + mean_value
                denormalized_y_preds = (y_preds * std_deviation) + mean_value
                rmse_score = np.sqrt(mean_squared_error(denormalized_y_true, denormalized_y_preds))
                print(f"{dataset}_lr{learning_rate}_{split}_batch{batch_size}_epoch{nb_epoch}: {rmse_score}")

delaney_lr1e-05_train_batch15_epoch50: 0.5694352963520167
delaney_lr1e-05_test_batch15_epoch50: 1.079873358040226
delaney_lr1e-05_valid_batch15_epoch50: 1.126100267267294
