# Assess Search
Get some statistics from brute-forcing the search, such as the expected performance of random sampling

In [1]:
from scipy.stats import sem
from tqdm import tqdm
import pandas as pd
import numpy as np
import json

Configuration

In [2]:
result_file = 'E15.csv.gz'  # Where to read from
perc = np.array([99, 99.9, 99.99, 99.999, 99.9999])  # Which percentiles to compute
search_size = 1000000  # How many molecules to sample in each random search
n_tests = 64  # How many times to repeat the random search

## Load in Results
They are produced in an earlier notebook

In [3]:
%%time
# Read the complete results table
# NOTE(review): the comment here used to say "Remember to uncomment end later" --
#  presumably a leftover from a truncated read used during development; confirm nothing is still disabled
data = pd.read_csv(result_file)
print(f'Loaded {len(data)} entries')

Loaded 15547091 entries
CPU times: user 33.9 s, sys: 2.11 s, total: 36 s
Wall time: 36 s


Drop rows with missing pIC50 values, which occur if the molecule is not fully connected

In [4]:
# Keep only the rows that have a predicted pIC50
data = data[data['pIC50_mpnn'].notnull()]
print(f'Removed nulls. New size {len(data)}')

Removed nulls. New size 15538418


## Compute Top Percentiles
How good are the best molecules?

In [5]:
# Summarize the extreme tail of each property: pIC50 is maximized, logP is minimized
for r, is_max in [('pIC50_mpnn', True), ('logP', False)]:
    # When minimizing, the best molecules sit at the low percentiles
    if is_max:
        my_perc = perc
    else:
        my_perc = 100 - perc
    values = np.percentile(data[r], my_perc)

    # Collect everything to be saved for this property
    summary = {
        'percentiles': my_perc.tolist(),
        r: values.tolist(),
        'best': data[r].max() if is_max else data[r].min(),
        'maximize': is_max,
    }
    with open(f'top_{r}.json', 'w') as fp:
        json.dump(summary, fp, indent=2)

## Simulate a Random Search
Simulate a random search by drawing compounds at random

In [6]:
def simulate_random_search(reward: str, n_tests: int, threshold: float, maximize: bool,
                           seed: int = None):
    """Simulate repeated random searches and save their average performance.

    Each test draws ``search_size`` rows from the notebook-level ``data`` frame and
    tracks, as a function of the number of molecules drawn so far, the best value of
    ``reward`` and the number of molecules past ``threshold``. The mean and standard
    error over all tests are written to ``random_search_perf_{reward}.csv``.

    Args:
        reward: Name of the column of ``data`` to use as the objective
        n_tests: Number of independent random searches to run
        threshold: Value a molecule must beat to be counted: strictly greater
            when maximizing, strictly less when minimizing
        maximize: Whether larger values of ``reward`` are better
        seed: Optional seed that makes the draws reproducible. If ``None``,
            sampling falls back to pandas' default random state, as before
    """
    # One generator shared by all tests, so that each test draws a different sample
    rng = np.random.RandomState(seed) if seed is not None else None

    # Run the tests
    cum_best = []
    past_thresh = []
    for _ in tqdm(range(n_tests)):
        choices = data.sample(search_size, random_state=rng)[reward]
        cum_best.append(choices.cummax() if maximize else choices.cummin())
        hits = choices > threshold if maximize else choices < threshold
        past_thresh.append(np.cumsum(hits))

    # Stack into (test, draw) arrays so that statistics are taken across tests
    cum_best = np.vstack(cum_best)
    past_thresh = np.vstack(past_thresh)

    # Mean and standard error of the running best value and of the running hit count
    best_sem = sem(cum_best, axis=0)
    best_mean = np.mean(cum_best, axis=0)
    thresh_sem = sem(past_thresh, axis=0)
    thresh_mean = np.mean(past_thresh, axis=0)

    # Write out the data file, one row per number of molecules drawn
    pd.DataFrame({
        'best_mean': best_mean,
        'best_sem': best_sem,
        'past_thresh_mean': thresh_mean,
        'past_thresh_sem': thresh_sem
    }).to_csv(f'random_search_perf_{reward}.csv')

In [None]:
# Count molecules with pIC50 above 8 (maximized) and logP below 5 (minimized)
for r, t, m in [('pIC50_mpnn', 8, True), ('logP', 5, False)]:
    simulate_random_search(reward=r, n_tests=n_tests, threshold=t, maximize=m)

100%|██████████| 64/64 [01:15<00:00,  1.17s/it]
100%|██████████| 64/64 [01:14<00:00,  1.16s/it]
