In [1]:
%load_ext autoreload
%autoreload 2

import argparse
import pandas as pd
import numpy as np
import pathlib
import gzip
import time
import json
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem.FilterCatalog import *

import sys
sys.path.insert(0,'../src/')
from baseline_similarity import SimilarityBaseline

simbaseline_model_file='../predict_REAL_db/SimilarityBaseline_preds/baseline_similarity.npy'
# load similarity baseline model
with open('../predict_REAL_db/SimilarityBaseline_preds/baseline_similarity.json', 'r') as f:
    task_conf = json.load(f)
simbaseline_model = SimilarityBaseline(conf=task_conf)
simbaseline_model.load_model(simbaseline_model_file)

data_df = pd.read_csv('E:/Github/zinc/datasets/Zenodo/v1/master_df.csv.gz')



In [15]:
for _ in range(5):
    sample_df = data_df.sample(n=5000)
    X = np.vstack([np.fromstring(x, 'u1') - ord('0') for x in sample_df['1024 MorganFP Radius 2']]).astype(float)
    
    start_t = time.time()
    tan_slow = simbaseline_model._baseline_pred(X)
    end_t = time.time()
    total_t = (end_t - start_t) / 60.0
    print('slow: {} min'.format(total_t))

    start_t = time.time()
    tan_fast = simbaseline_model._fast_tanimoto_similarity(X, 1024)
    end_t = time.time()
    total_t = (end_t - start_t)
    print('fast: {} sec'.format(total_t))

    print('________________________________________________________________')
            
    res = np.array_equal(tan_slow.flatten(), tan_fast)
    assert res

  X = np.vstack([np.fromstring(x, 'u1') - ord('0') for x in sample_df['1024 MorganFP Radius 2']]).astype(float)


slow: 0.9474108020464579 min
fast: 0.14284849166870117 sec
________________________________________________________________
slow: 0.9330432573954265 min
fast: 0.1458442211151123 sec
________________________________________________________________
slow: 1.0592958529790242 min
fast: 0.15383672714233398 sec
________________________________________________________________
slow: 0.9414135853449503 min
fast: 0.14584565162658691 sec
________________________________________________________________
slow: 0.9411995013554891 min
fast: 0.13585567474365234 sec
________________________________________________________________


In [13]:
import time
start_t = time.time()
tan_slow = simbaseline_model._baseline_pred(X)
end_t = time.time()
total_t = (end_t - start_t) / 60.0
print('slow: {} min'.format(total_t))

start_t = time.time()
tan_fast = simbaseline_model._fast_tanimoto_similarity(X)
end_t = time.time()
total_t = (end_t - start_t)
print('fast: {} sec'.format(total_t))

print('________________________________________________________________')

slow: 1.916525093714396 min
fast: 0.2887003421783447 sec
________________________________________________________________
