In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to local source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os, sys
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import h5py

In [None]:
# Paths are derived from the current working directory, so the notebook has to
# be started from the directory it lives in for these to resolve.
DELQSAR_ROOT = os.getcwd() + '/../'

# One further level up is put on the import path so that the `del_qsar`
# package can be imported.
sys.path.append(DELQSAR_ROOT + '/../')

from del_qsar import models, featurizers, splitters
from del_qsar.enrichments import R_from_z, R_ranges

In [None]:
# HDF5 file holding the precomputed fingerprints (looked up under
# DELQSAR_ROOT/experiments when it is opened).
FINGERPRINTS_FILENAME = 'x_DD1S_CAIX_2048_bits_all_fps.h5'

# Saved weights of the FP-FFNN model trained on the random split with seed 0.
RANDOM_SPLIT_MODEL_PATH = os.path.join(
    DELQSAR_ROOT,
    'experiments',
    'models',
    'DD1S_CAIX',
    'FP-FFNN',
    'random_seed_0.torch',
)

In [None]:
# Load the DD1S CAIX dataset table; the count columns used below come from it.
DATASET_CSV_PATH = os.path.join(DELQSAR_ROOT, 'experiments', 'datasets', 'DD1S_CAIX_QSAR.csv')
df_data = pd.read_csv(DATASET_CSV_PATH)

In [None]:
# Double-bracket selection keeps each column 2-D, so both arrays have shape
# (n_rows, 1) and the axis-0 sums below have shape (1,).
exp_counts = np.array(df_data[['exp_tot']], dtype='int')
bead_counts = np.array(df_data[['beads_tot']], dtype='int')

# Column totals over all rows.
exp_tot = exp_counts.sum(axis=0)
bead_tot = bead_counts.sum(axis=0)

In [None]:
# Per-compound enrichment from the two count columns and their totals.
# The counts are squeezed to 1-D and the totals unwrapped to scalars.
# NOTE(review): R_lb / R_ub are presumably lower / upper bounds on R -- confirm
# against del_qsar.enrichments.R_ranges.
R, R_lb, R_ub = R_ranges(
    np.squeeze(bead_counts),
    bead_tot[0],
    np.squeeze(exp_counts),
    exp_tot[0],
)

In [None]:
# Disable HDF5 file locking before opening the fingerprint file.
os.environ["HDF5_USE_FILE_LOCKING"] = 'FALSE'

# Read the whole 'all_fps' dataset into memory. The context manager guarantees
# the file handle is closed even if the read raises.
with h5py.File(os.path.join(DELQSAR_ROOT, 'experiments', FINGERPRINTS_FILENAME), 'r') as hf:
    x = np.array(hf['all_fps'])

# Second dimension of the fingerprint matrix; used as the model's input width.
INPUT_SIZE = x.shape[1]

In [None]:
# Single seed shared by torch's global RNG (set here), the model constructor
# (torch_seed=SEED) and the random splitter (seed=SEED).
SEED = 0
torch.manual_seed(SEED)

In [None]:
# random split model
# Hyperparameters used to rebuild the network before loading its weights.
# NOTE(review): presumably these must match the settings the checkpoint was
# trained with, otherwise load_state_dict will reject it -- confirm.
BATCH_SIZE = 1024
LAYER_SIZES = [64, 64, 64]
DROPOUT = 0.1
model = models.MLP(INPUT_SIZE, [int(size) for size in LAYER_SIZES],
                        dropout=DROPOUT, torch_seed=SEED)

# map_location='cpu' lets a checkpoint that was saved from a CUDA device be
# loaded on a machine without a GPU; the model is moved to the GPU afterwards
# when one is available.
model.load_state_dict(torch.load(RANDOM_SPLIT_MODEL_PATH, map_location='cpu'))
print(str(model))

In [None]:
# Use the first CUDA device when one is present; otherwise DEVICE stays None
# and the model is left where it was created.
DEVICE = 'cuda:0' if torch.cuda.is_available() else None
if DEVICE is not None:
    model = model.to(DEVICE)

In [None]:
# Random train / validation / test split of the dataset rows, seeded with SEED.
# NOTE(review): the first argument is passed as None -- presumably the random
# splitter does not need the feature matrix; confirm in del_qsar.splitters.
splitter = splitters.RandomSplitter()
split_slices = splitter(None, df_data, seed=SEED)
train_slice, valid_slice, test_slice = split_slices

In [None]:
# Five largest calculated enrichment values among the test-set compounds,
# in descending order.
_R = sorted(R[test_slice], reverse=True)
top_five_calculated_enrichments = _R[:5]
top_five_calculated_enrichments

In [None]:
# Collect the test-set compounds whose calculated enrichment equals one of the
# top-five values, then compare calculated vs. model-predicted enrichment.
cpds_with_highest_calculated_enrichments = []
calculated_enrichments = []
predicted_enrichments = []

# Walk the test split in its own order. Matching is by value, so compounds
# tied with a top-five value are all kept.
for i, val in enumerate(R[test_slice]):
    if val not in top_five_calculated_enrichments:
        continue
    cpds_with_highest_calculated_enrichments.append(test_slice[i])
    calculated_enrichments.append(val)

# One batched prediction over the fingerprints of the selected compounds.
predicted_enrichments = model.predict_on_x(
    x[cpds_with_highest_calculated_enrichments],
    device=DEVICE,
)

# Report each selected compound, with a blank line between entries.
for i, cpd_idx in enumerate(cpds_with_highest_calculated_enrichments):
    print(f'cpd_idx:               {cpd_idx}')
    print(f'calculated enrichment: {calculated_enrichments[i]}')
    print(f'predicted enrichment:  {predicted_enrichments[i]}')
    print()

In [None]:
# Chose the compounds with indices 3857, 87394, 81920, 104264, 66578
# (cpd_ids 3858, 87395, 81921, 104265, 66579, i.e. index + 1) as representative outliers.