# Experiment 2 Part 1: Is there any label leakage? 

- dataset label was changed to be from a uniform binary distribution. 
- strategy was fixed to: ClusterBasedWCSelector_201
- once the strategy selects the next iteration, it is added to the training data with the TRUE labels. 
- this was done to check whether the strategy selects a different set of cpds than it does when run with the true labels. It should not, since the strategy should be agnostic to the labels of the cpds in the unlabeled pool.

In [70]:
import pandas as pd
import numpy as np
import glob

col_name = 'Index ID'
task_name = 'pcba-aid624173'


def _pool_label_sum(pattern):
    """Return the sum of the task column over every CSV matching ``pattern``."""
    pool = pd.concat([pd.read_csv(path) for path in glob.glob(pattern)])
    return pool[task_name].sum()


# Pool-wide label totals for the original and the scrambled datasets.
orig_total_actives = _pool_label_sum('../datasets/aid624173_cv_96/unlabeled_*.csv')
scrambled_total_actives = _pool_label_sum('../datasets/aid624173_cv_96_scrambled/unlabeled_*.csv')

orig_dir = '../params_results_exp_2/original/1179/batch_size_96/training_data/iter_{}.csv'
scrambled_dir = '../params_results_exp_2/scrambled/top_1536/ClusterBasedWCSelector_201/1179/batch_size_96/training_data_scrambled/iter_{}.csv'

# Running label totals over the per-iteration training files.
orig_sum = 0
scrambled_sum = 0
for i in range(7):
    orig_df = pd.read_csv(orig_dir.format(i))
    scrambled_df = pd.read_csv(scrambled_dir.format(i))

    orig_sum += orig_df[task_name].sum()
    scrambled_sum += scrambled_df[task_name].sum()

    # Compare which compounds were selected and which labels they carry.
    same_index = scrambled_df[col_name].equals(orig_df[col_name])
    same_labels = scrambled_df[task_name].equals(orig_df[task_name])
    print('iter_{}. Index equality: {}. Label equality: {}'.format(i, same_index, same_labels))

    # Iteration 0 is excluded from the check (presumably it is the shared
    # starting set -- confirm). Every later iteration must contain the same
    # compounds with different labels.
    if i > 0:
        assert same_index and not same_labels

    print('Orig sum: {}. Scrambled sum: {}.'.format(orig_sum, scrambled_sum))

print('Orig Total Actives: {}. Scrambled Total Actives:{}'.format(orig_total_actives, scrambled_total_actives))

iter_0. Index equality: True. Label equality: True
Orig sum: 1.0. Scrambled sum: 1.0.
iter_1. Index equality: True. Label equality: False
Orig sum: 35.0. Scrambled sum: 47.0.
iter_2. Index equality: True. Label equality: False
Orig sum: 77.0. Scrambled sum: 100.0.
iter_3. Index equality: True. Label equality: False
Orig sum: 90.0. Scrambled sum: 153.0.
iter_4. Index equality: True. Label equality: False
Orig sum: 110.0. Scrambled sum: 203.0.
iter_5. Index equality: True. Label equality: False
Orig sum: 126.0. Scrambled sum: 248.0.
iter_6. Index equality: True. Label equality: False
Orig sum: 137.0. Scrambled sum: 298.0.
Orig Total Actives: 487.0. Scrambled Total Actives:200353


---

# Experiment 2 Part 2: How well do explorative strategies perform on a random uniform label set?

- dataset label was changed to be from a uniform binary distribution. 
- strategy was fixed to: ClusterBasedWCSelector_201
- once the strategy selects the next iteration, it is added to the training data with the SCRAMBLED labels. 
- this was done to check if the strategy performs better than random. It should not as there is no structure in the feature-to-labelling scheme. 

In [74]:
import pandas as pd
import numpy as np
import glob

col_name = 'Index ID'
task_name = 'pcba-aid624173'

# Compute the pool-wide totals inside this cell. The final print needs them,
# and relying on values left in the kernel by another cell raises NameError
# in a fresh session (or silently reports stale numbers).
df1 = pd.concat([pd.read_csv(x) for x in glob.glob('../datasets/aid624173_cv_96/unlabeled_*.csv')])
df2 = pd.concat([pd.read_csv(x) for x in glob.glob('../datasets/aid624173_cv_96_scrambled/unlabeled_*.csv')])

orig_total_actives = df1[task_name].sum()
scrambled_total_actives = df2[task_name].sum()

# The concatenated pools are only needed for the totals; free them early.
del df1, df2

orig_dir = '../params_results_exp_2/original/1179/batch_size_96/training_data/iter_{}.csv'
scrambled_dir = '../params_results_exp_2/scrambled/top_1536/ClusterBasedWCSelector_201/1179/batch_size_96/training_data_uniform/iter_{}.csv'

# Running label totals over the per-iteration training files.
orig_sum = 0
scrambled_sum = 0
for i in range(7):
    orig_df = pd.read_csv(orig_dir.format(i))
    scrambled_df = pd.read_csv(scrambled_dir.format(i))

    orig_sum += orig_df[task_name].sum()
    scrambled_sum += scrambled_df[task_name].sum()

    # Report whether the same compounds were selected and whether their
    # labels match. Nothing is asserted in this cell.
    print('iter_{}. Index equality: {}. Label equality: {}'.format(i,
                                                                   scrambled_df[col_name].equals(orig_df[col_name]),
                                                                   scrambled_df[task_name].equals(orig_df[task_name])))

    print('Orig sum: {}. Uniform sum: {}.'.format(orig_sum, scrambled_sum))

print('Orig Total Actives: {}. Uniform Total Actives:{}'.format(orig_total_actives, scrambled_total_actives))

iter_0. Index equality: True. Label equality: True
Orig sum: 1.0. Uniform sum: 1.0.
iter_1. Index equality: True. Label equality: False
Orig sum: 35.0. Uniform sum: 47.0.
iter_2. Index equality: False. Label equality: False
Orig sum: 77.0. Uniform sum: 89.0.
iter_3. Index equality: False. Label equality: False
Orig sum: 90.0. Uniform sum: 140.0.
iter_4. Index equality: False. Label equality: False
Orig sum: 110.0. Uniform sum: 196.0.
iter_5. Index equality: False. Label equality: False
Orig sum: 126.0. Uniform sum: 244.0.
iter_6. Index equality: False. Label equality: False
Orig sum: 137.0. Uniform sum: 296.0.
Orig Total Actives: 487.0. Uniform Total Actives:200353


---

# Experiment 2 Part 3: How well do explorative strategies perform on a dataset with the same number of actives as the true dataset (i.e. 487), but with the actives randomly reassigned to different cpds?

- the location of actives in the true dataset was randomly moved to other cpds. 
- strategy was fixed to: ClusterBasedWCSelector_201
- once the strategy selects the next iteration, it is added to the training data with the SCRAMBLED labels. 
- this was done to check if the strategy performs better than random. It should not as there is no structure in the feature-to-labelling scheme. 

In [17]:
import pandas as pd
import numpy as np
import glob

col_name = 'Index ID'
task_name = 'pcba-aid624173'


def _pool_label_sum(pattern):
    """Return the sum of the task column over every CSV matching ``pattern``."""
    pool = pd.concat([pd.read_csv(path) for path in glob.glob(pattern)])
    return pool[task_name].sum()


# Pool-wide label totals for the original and the randomized-actives datasets.
orig_total_actives = _pool_label_sum('../datasets/aid624173_cv_96/unlabeled_*.csv')
rand_total_actives = _pool_label_sum('../datasets/aid624173_cv_96_randomize_actives/unlabeled_*.csv')

orig_dir = '../params_results_exp_2/original/1179/batch_size_96/training_data/iter_{}.csv'
scrambled_dir = '../params_results_exp_2/scrambled/top_1536/ClusterBasedWCSelector_201/1179/batch_size_96/training_data_randomized_actives/iter_{}.csv'

# Running label totals over the per-iteration training files.
orig_sum = 0
rand_sum = 0
for i in range(7):
    orig_df = pd.read_csv(orig_dir.format(i))
    rand_df = pd.read_csv(scrambled_dir.format(i))

    orig_sum += orig_df[task_name].sum()
    rand_sum += rand_df[task_name].sum()

    # Report whether the same compounds were selected and whether their
    # labels match. Nothing is asserted in this cell.
    same_index = rand_df[col_name].equals(orig_df[col_name])
    same_labels = rand_df[task_name].equals(orig_df[task_name])
    print('iter_{}. Index equality: {}. Label equality: {}'.format(i, same_index, same_labels))

    print('Orig sum: {}. Randomized sum: {}.'.format(orig_sum, rand_sum))

print('Orig Total Actives: {}. Randomized Total Actives:{}'.format(orig_total_actives, rand_total_actives))

iter_0. Index equality: True. Label equality: True
Orig sum: 1.0. Randomized sum: 1.0.
iter_1. Index equality: True. Label equality: False
Orig sum: 35.0. Randomized sum: 1.0.
iter_2. Index equality: False. Label equality: False
Orig sum: 77.0. Randomized sum: 1.0.
iter_3. Index equality: False. Label equality: False
Orig sum: 90.0. Randomized sum: 1.0.
iter_4. Index equality: False. Label equality: False
Orig sum: 110.0. Randomized sum: 1.0.
iter_5. Index equality: False. Label equality: False
Orig sum: 126.0. Randomized sum: 1.0.
iter_6. Index equality: False. Label equality: False
Orig sum: 137.0. Randomized sum: 1.0.
Orig Total Actives: 487.0. Randomized Total Actives:487


---
# Create new dataset

In [15]:
import pandas as pd
import numpy as np
import glob

col_name = 'Index ID'
task_name = 'pcba-aid624173'
odir = '../datasets/aid624173_cv_96/unlabeled_{}.csv'
new_dir = '../datasets/aid624173_cv_96_randomize_actives/unlabeled_{}.csv'
ofiles = glob.glob(odir.format('*'))


def relabel_with_single_active(df, label_col, make_active):
    """Return a copy of ``df`` with ``label_col`` zeroed and at most one active.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to relabel; it is not modified.
    label_col : str
        Name of the label column to overwrite.
    make_active : bool
        When true, one uniformly chosen row gets label 1.

    Returns
    -------
    pandas.DataFrame
        The relabelled copy.

    Raises
    ------
    ValueError
        If ``make_active`` is true but ``df`` has no rows.
    """
    relabeled = df.copy()
    relabeled[label_col] = 0
    if make_active:
        if relabeled.empty:
            raise ValueError('cannot place an active in a file with no rows')
        row = np.random.randint(len(relabeled))
        # Positional assignment, so the result does not depend on the index labels.
        relabeled.iloc[row, relabeled.columns.get_loc(label_col)] = 1
    return relabeled


total_actives = sum([pd.read_csv(x)[task_name].sum() for x in ofiles])
n_actives = int(total_actives)

# Each selected file receives exactly one active, so the scheme needs at least
# as many files as actives. Fail with a clear message instead of NumPy's
# generic sampling error.
if n_actives > len(ofiles):
    raise ValueError('{} actives cannot be spread one-per-file over {} files'.format(n_actives, len(ofiles)))

actives_locations = np.random.choice(np.arange(len(ofiles)), size=n_actives, replace=False)
# Set membership is O(1); `in` on the NumPy array scans it for every file.
selected_files = set(actives_locations.tolist())

# NOTE: files are addressed as unlabeled_0 .. unlabeled_{N-1}; this assumes the
# glob matches are numbered consecutively from 0.
for i in range(len(ofiles)):
    of = odir.format(i)
    nf = new_dir.format(i)

    odf = pd.read_csv(of)
    expected_actives = 1 if i in selected_files else 0
    relabel_with_single_active(odf, task_name, expected_actives == 1).to_csv(nf, index=False)

    # Verify the round trip once: same compounds, and exactly the intended
    # number of actives. Series.equals is dtype-sensitive, so an inequality
    # test against the original labels either passes trivially or wrongly
    # fails for files that hold no actives before and after.
    written = pd.read_csv(nf)
    assert written[col_name].equals(odf[col_name]), 'index column changed in {}'.format(nf)
    assert written[task_name].sum() == expected_actives, 'unexpected active count in {}'.format(nf)