# Assess Search
Get some statistics from brute-forcing the search, such as the expected performance of random sampling

In [1]:
from scipy.stats import sem
from tqdm import tqdm
import pandas as pd
import numpy as np
import json

Configuration

In [2]:
# Input: table of results to analyze
result_file = "E15.csv.gz"

# Percentiles of the IC50 distribution to report (on the 0-100 scale used by np.percentile)
prec = [1, 0.1, 0.01, 0.001, 0.0001]

# Random-search simulation settings
search_size = 1_000_000  # How many molecules to sample in each simulated random search
n_tests = 64  # How many independent random searches to simulate

## Load in Results
They are produced in an earlier notebook

In [3]:
%%time
# Read the results table from the configured path.
# NOTE(review): an earlier comment here said "Remember to uncomment end later";
# nothing is commented out in this call - confirm no row limit is still pending.
data = pd.read_csv(result_file)
print(f'Loaded {data.shape[0]} entries')

Loaded 15547091 entries
CPU times: user 27 s, sys: 1.47 s, total: 28.4 s
Wall time: 28.4 s


Drop rows with missing IC50's, which occur if the molecule is not fully connected

In [4]:
# Keep only the rows that actually have an IC50 value
has_ic50 = data['IC50_mpnn'].notnull()
data = data[has_ic50]
print(f'Removed nulls. New size {data.shape[0]}')

Removed nulls. New size 15538418


## Compute Top Percentiles
How good are the best molecules?

In [5]:
%%time
# np.percentile interprets q on a 0-100 scale, so each entry of `prec` is a
# percent (e.g. 0.1 -> the 0.1th percentile of the IC50 values)
ic50_values = data['IC50_mpnn'].values
values = np.percentile(ic50_values, q=prec)

CPU times: user 104 ms, sys: 15.5 ms, total: 120 ms
Wall time: 119 ms


In [6]:
# Gather the percentile thresholds and the single lowest IC50, then write them as JSON
summary = {
    'top_perc': prec,
    'ic50': values.tolist(),  # ndarray -> plain list so it can be serialized
    'best': data['IC50_mpnn'].min(),
}
with open('top_ic50s.json', 'w') as fp:
    json.dump(summary, fp, indent=2)

## Simulate a Random Search
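Simulate a random search by repeatedly drawing compounds at random (without replacement) and tracking, after each draw, the best IC50 found so far and the number of compounds found with an IC50 below zero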

In [7]:
# Seeded generator shared by every repetition: each draw advances its state, so
# the repetitions differ from one another but the whole experiment (and the
# statistics saved from it) is reproducible on a fresh kernel.
rng = np.random.RandomState(1)

# Only the IC50 column is used below; sampling the Series avoids copying every
# column of the full table on each repetition.
ic50 = data['IC50_mpnn']

cummins = []  # Per repetition: best (lowest) IC50 found after each draw
below_zero = []  # Per repetition: number of draws so far with IC50 < 0
for _ in tqdm(range(n_tests)):
    # Draw `search_size` molecules without replacement, in random order
    choices = ic50.sample(search_size, random_state=rng)

    # Store plain arrays: the shuffled index is not needed for the statistics
    cummins.append(choices.cummin().values)
    below_zero.append(np.cumsum(choices.values < 0))

100%|██████████| 64/64 [01:13<00:00,  1.14s/it]


In [8]:
# Combine the per-search traces into 2D arrays: one row per simulated search,
# one column per molecule drawn
cummins = np.stack(cummins, axis=0)
below_zero = np.stack(below_zero, axis=0)

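Compute the mean and standard error, across the simulated searches, of the cumulative minimum IC50 and of the cumulative number of molecules with an IC50 below zero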

In [9]:
%%time
# Statistics are taken over axis 0, i.e. across the simulated searches, giving
# one value per number of molecules drawn.

# Best IC50 found so far
min_mean = cummins.mean(axis=0)
min_sem_val = sem(cummins, axis=0)

# Count of molecules found so far with IC50 below zero
bz_mean = below_zero.mean(axis=0)
bz_sem_val = sem(below_zero, axis=0)

CPU times: user 844 ms, sys: 8.13 ms, total: 852 ms
Wall time: 854 ms


In [10]:
# One row per step of the search; the row index written to the CSV is the
# zero-based position of the draw
random_search_perf = pd.DataFrame(dict(
    min_mean=min_mean,
    min_sem=min_sem_val,
    n_below_zero_mean=bz_mean,
    n_below_zero_sem=bz_sem_val,
))
random_search_perf.to_csv('random_search_perf.csv')