## Imports

In [3]:
import numpy as np
import pandas as pd
import random

from superpac. base import create_dataset, get_split_mmp_indices

## Load data

Gets pKi values for each MMP to obtain images, classes, and regression values along with <code>mmp_ixs</code> which gives the relationship between MMP number and indexing of the previous objects.

In [5]:
folder = "ims_frag_200_a"
suffix = "frag_200_a.npy"

im_height = 200
im_width = 600
num_augs = 4

! rm "../ims_frag_200_a/.DS_Store"
! rm "../ims_frag_200_a/0/.DS_Store"
! rm "../ims_frag_200_a/1/.DS_Store"


val_df = pd.read_csv("../smiles_and_labels/Y_mmps_vals.csv", header=None)
mmp_ixs, im_array, class_array, reg_values = create_dataset("../"+folder, im_height, im_width, val_df)

rm: ./+folder+/.DS_Store: No such file or directory


## Regular dataset

In [5]:
num_augs=4
ds_size = 15787 * num_augs
zero_out, one_out, two_out = get_split_mmp_indices("../ixs_for_splits", ds_size)

zero_ixs = []
one_ixs = []
two_ixs = []

for mmp_id in zero_out:
    ix_list = mmp_ixs[mmp_id]
    zero_ixs = zero_ixs + ix_list

for mmp_id in one_out:
    ix_list = mmp_ixs[mmp_id]
    one_ixs = one_ixs + ix_list
    
for mmp_id in two_out:
    ix_list = mmp_ixs[mmp_id]
    two_ixs = two_ixs + ix_list

In [8]:
ixs = {
    "train":zero_ixs,
    "one_out":one_ixs,
    "test":two_ixs
}

for x in ["train", "one_out", "test"]:
    x_ix = ixs[x]
    
    X = im_array[x_ix,:,:,:]
    y = class_array[x_ix]
    yr = reg_values[x_ix]
    
    np.save("../split_datasets/"+x+"_X_"+suffix, X)
    np.save("../split_datasets/"+x+"_y_"+suffix, y)
    np.save("../split_datasets/"+x+"_yr_"+suffix, yr)
    del X, y, yr

## Undersampled dataset

In [None]:
train_X = np.load("../split_datasets/train_X_frag_200.npy")
train_y = np.load("../split_datasets/train_y_frag_200.npy")
test_X = np.load("../split_datasets/test_X_frag_200.npy")
test_y = np.load("../split_datasets/test_y_frag_200.npy")


### Relationship between ejection probability and class imbalance

In [None]:
# Count ACs in unchanged set
s=0
for c,v in enumerate(test_y):
   if v==1: s+=1
s

In [None]:
# AC proportion : Eject probability [given 741 out of 7089 ACs in train; ]
# 0.104 : 0
# 0.2 : 0.533
# 0.3 : 0.728
# 0.4 : 0.825
# 0.5 : 0.883

# Want to impose this ejection probability and maintain 80/20 train/test split.
# 
eject_p_dict = {
    0.2:0.533,
    0.3:0.728,
    0.4:0.825,
    0.5:0.883,
}
for x in eject_p_dict:
    print(x)

### Subsampling

In [None]:
for t in eject_p_dict:
    suffix = str(int(10*t))
    
    print(suffix)
    # Keep positives, keep negatives with probability (1 - eject_p)
    eject_p = eject_p_dict[t]
    keep_ix_train = []
    keep_ix_test = []
    for c,v in enumerate(train_y):
        if v==0:
            r = random.random()
            if r > eject_p : keep_ix_train.append(c)
        else: keep_ix_train.append(c)
    for c,v in enumerate(test_y):
        if v==0:
            r = random.random()
            if r > eject_p : keep_ix_test.append(c)
        else: keep_ix_test.append(c)

    # Save as _sub_(pos frequency)
    train_X_sample, train_y_sample = train_X[keep_ix_train], train_y[keep_ix_train]
    np.save("../data_split/train_X_sub_"+suffix+".npy", train_X_sample)
    np.save("../data_split/train_y_sub_"+suffix+".npy", train_y_sample)
    del train_X_sample, train_y_sample
    test_X_sample, test_y_sample  = test_X[keep_ix_test], test_y[keep_ix_test]  
    np.save("../data_split/test_X_sub_"+suffix+".npy", test_X_sample)
    np.save("../data_split/test_y_sub_"+suffix+".npy", test_y_sample)
    del test_X_sample, test_y_sample