So that every algorithm is evaluated on the same data, the queries are generated once, independently of any algorithm, and then reused for all of them.

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Folder that holds the dataset CSV files ('~' is expanded to the user's home).
DATA_PATH = os.path.expanduser('~/Documents/datasets/mulan')

# Dataset names, grouped by family. Each name is used as a file-name prefix
# under DATA_PATH (e.g. '<name>_Y.csv' is read, '<name>_query.csv' is written).
DATASETS = [
    'birds', 'birds-test', 'birds-train',
    'CAL500',
    'emotions', 'emotions-test', 'emotions-train',
    'mediamill', 'mediamill-test', 'mediamill-train',
    'yeast', 'yeast-test', 'yeast-train',
]

# Base dataset names, i.e. DATASETS without the '-test' / '-train' variants.
CLASSES = [
    'birds',
    'CAL500',
    'emotions',
    'mediamill',
    'yeast',
]

In [3]:
# Number of random queries drawn for every attribute.
SAMPLES_PER_ATTRIBUTE = 10

# Attributes with fewer positive examples than this are skipped.
MINIMUM_EXAMPLE_COUNT = 6

# Fraction of an attribute's examples that goes into the query;
# the remaining examples are the valid targets.
RATIO_OF_SAMPLE = 0.4

# Intended upper bound on the number of examples used in a single query.
MAXIMUM_SAMPLE_SIZE = 12

In [4]:
# Seed NumPy's global random generator so the shuffles performed later are
# reproducible. Re-run this statement before re-running the shuffling;
# otherwise the generator continues from its current state and the generated
# queries will differ.
np.random.seed(42)

In [5]:
def merge(targets):
    """Union several comma-separated target lists into a single one.

    Used as the aggregation function when several rows share the same
    query: their target strings are combined so the query keeps every
    valid target.

    Parameters
    ----------
    targets : iterable of str
        Each item is a comma-separated list of indices, e.g. ``'3,17,42'``
        (typically a pandas Series holding the ``target`` column of a group).

    Returns
    -------
    str
        The distinct indices of all items, joined by commas. Indices are
        sorted numerically; if any token is not an integer the tokens are
        sorted as plain strings instead.
    """
    merged = set()
    for target in targets:
        # Split into whole indices. Iterating the string directly would walk
        # over single characters and break multi-digit indices apart.
        merged.update(token for token in target.split(',') if token)

    try:
        ordered = sorted(merged, key=int)
    except ValueError:
        # Non-numeric tokens: fall back to a deterministic string ordering.
        ordered = sorted(merged)
    return ','.join(ordered)

In [6]:
# For every dataset, build random query/target splits per attribute and store
# them next to the dataset as '<dataset>_query.csv'.
for ds in DATASETS:
    # One column per attribute; a nonzero cell marks the row as an example
    # of that attribute.
    df_Y = pd.read_csv(os.path.join(DATA_PATH, ds + '_Y.csv'))
    queries = []
    targets = []

    for attr in df_Y.columns:
        # Row positions of the examples that carry this attribute. Working on
        # the underlying array keeps this independent of how np.nonzero
        # handles pandas objects.
        valid_indices = np.flatnonzero(df_Y[attr].values)
        size = len(valid_indices)
        if size < MINIMUM_EXAMPLE_COUNT:
            continue  # too few examples to split into query and targets

        # Honour the configured cap rather than a hard-coded literal, so that
        # changing MAXIMUM_SAMPLE_SIZE actually takes effect.
        sample_size = min(int(size * RATIO_OF_SAMPLE), MAXIMUM_SAMPLE_SIZE)

        for _ in range(SAMPLES_PER_ATTRIBUTE):
            # In-place shuffle: the first `sample_size` positions become the
            # query, everything else the valid targets.
            np.random.shuffle(valid_indices)
            queries.append(','.join(map(str, sorted(valid_indices[:sample_size]))))
            targets.append(','.join(map(str, sorted(valid_indices[sample_size:]))))

    # Identical (query, target) pairs carry no extra information.
    df_query = pd.DataFrame({'query': queries, 'target': targets}).drop_duplicates()

    # The same query can still appear with different targets; collapse those
    # rows into one whose target is the union of the individual targets.
    if df_query['query'].duplicated().any():
        df_query = df_query.groupby(by='query').aggregate(merge).reset_index()

    assert df_query['query'].is_unique, 'duplicate queries left for ' + ds

    df_query.to_csv(os.path.join(DATA_PATH, ds + '_query.csv'), index=False)