# Assess Search
Get some statistics from brute-forcing the search, such as the expected performance of random sampling

In [1]:
from scipy.stats import sem
from tqdm import tqdm
import pandas as pd
import numpy as np
import json

Configuration

In [2]:
result_file = 'E15.csv.gz'  # Where to read from
prec = [1, 0.1, 0.01, 0.001, 0.0001]  # Which percentiles to compute
search_size = 1000000  # How many molecules to sample in each random search
n_tests = 64  # How many independent random searches to simulate

## Load in Results
They are produced in an earlier notebook

In [3]:
%%time
# NOTE(review): this line used to carry the reminder "Remember to uncomment end later";
#  presumably a temporary restriction on the read was removed — confirm nothing is still pending.
data = pd.read_csv(result_file)  # Reads the file named in the configuration cell
print(f'Loaded {len(data)} entries')

Loaded 15547091 entries
CPU times: user 26.8 s, sys: 1.48 s, total: 28.3 s
Wall time: 28.4 s


Drop rows with missing IC50 values, which occur if the molecule is not fully connected

In [4]:
# Keep only the rows whose IC50_mpnn entry is present
data = data.loc[data['IC50_mpnn'].notnull()]
print(f'Removed nulls. New size {len(data)}')

Removed nulls. New size 15538418


## Compute Top Percentiles
How good are the best molecules?

In [5]:
# For each property, store the values at the requested percentiles and the overall minimum
for column in ('IC50_mpnn', 'logP'):
    column_values = data[column]
    summary = {
        'top_perc': prec,
        column: np.percentile(column_values, prec).tolist(),
        'best': column_values.min(),
    }

    # One JSON file per property
    with open(f'top_{column}.json', 'w') as fp:
        json.dump(summary, fp, indent=2)

## Simulate a Random Search
Simulate a random search by drawing compounds at random

In [6]:
def simulate_random_search(reward: str, n_tests: int, threshold: float, random_state=None):
    """Simulate repeated random searches and save how their results evolve.

    Each test draws ``search_size`` entries of the ``reward`` column from the
    module-level ``data`` frame (pandas samples without replacement by default).
    For every test we track, as a function of the number of molecules drawn so
    far, the lowest value of ``reward`` seen and the number of molecules with
    ``reward`` below ``threshold``. The mean and standard error of both
    quantities across the tests are written to ``random_search_perf_{reward}.csv``.

    Args:
        reward: Name of the column in ``data`` to evaluate
        n_tests: Number of independent random searches to run
        threshold: Values of ``reward`` strictly below this are counted
        random_state: Integer seed or ``numpy.random.RandomState`` used to make
            the draws repeatable. With ``None`` (default) the sampling falls back
            to NumPy's global random state, which is the previous behavior.
    """
    # Use one generator for all tests so that a single seed fixes the whole
    #  sequence of draws while each test still gets a different sample
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Run the tests
    cummins = []
    below_thresh = []
    for _ in tqdm(range(n_tests)):
        # Only the reward column is used, so sample it directly instead of
        #  copying every column of each sampled row
        choices = data[reward].sample(search_size, random_state=rng)
        cummins.append(choices.cummin().values)
        below_thresh.append((choices < threshold).cumsum().values)
    cummins = np.vstack(cummins)  # Shape: (n_tests, search_size)
    below_thresh = np.vstack(below_thresh)

    # Compute the mean and standard error across tests at each search step
    min_sem_val = sem(cummins, axis=0)
    min_mean = np.mean(cummins, axis=0)
    bz_sem_val = sem(below_thresh, axis=0)
    bz_mean = np.mean(below_thresh, axis=0)

    # Write out the data file
    pd.DataFrame({
        'min_mean': min_mean,
        'min_sem': min_sem_val,
        'n_below_thresh_mean': bz_mean,
        'n_below_thresh_sem': bz_sem_val
    }).to_csv(f'random_search_perf_{reward}.csv')

In [7]:
# Run the simulation once per (column, threshold) pair
for reward_name, cutoff in (('IC50_mpnn', 0), ('logP', 5)):
    simulate_random_search(reward_name, n_tests=n_tests, threshold=cutoff)

100%|██████████| 64/64 [01:15<00:00,  1.18s/it]
100%|██████████| 64/64 [01:15<00:00,  1.18s/it]
