# Data Generator

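This notebook builds bag-level CSV datasets from MNIST, Fashion-MNIST and CIFAR-10: instance indices are grouped into randomly sized bags, each bag is labelled by a rule over the instance labels it contains, and the resulting train/test bags are written under `mil/datasets/csvs`.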

In [1]:
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from pathlib import Path

# Root directory (relative path) that prefixes every CSV path generated in this notebook.
BASE = 'mil/datasets/csvs'

def dataset_generator(dataset, get_label, prefix, min_size=3, max_size=7, name='Bags'):
  """Turn an instance-level dataset into train/test bag CSVs and print their class balance.

  Args:
    dataset: object whose `load_data()` returns `((x_train, y_train), (x_test, y_test))`.
    get_label: callable that maps the instance labels of one bag to that bag's label.
    prefix: output path prefix; `_train.csv` and `_test.csv` are appended to it.
    min_size: forwarded to `bag_generator` as the lower bound of the drawn bag sizes.
    max_size: forwarded to `bag_generator` as the upper bound of the drawn bag sizes.
    name: tag shown (left-aligned, 15 characters wide) at the start of the summary line.
  """
  def summarize(tag, bag_labels):
    # Positive count, negative count and percentage of positives for one split.
    positives = bag_labels.sum()
    negatives = (bag_labels == 0).sum()
    share = positives / bag_labels.shape[0] * 100
    return f'{tag}: pos={positives}, neg={negatives}, %={share:.2f}'

  (train_x, train_y), (test_x, test_y) = dataset.load_data()

  train_bags, train_labels = bag_generator(train_x, train_y, get_label, min_size, max_size)
  test_bags, test_labels = bag_generator(test_x, test_y, get_label, min_size, max_size)

  # The trailing space in 'test ' keeps the two summaries aligned.
  train_line = summarize('train', train_labels)
  test_line = summarize('test ', test_labels)
  print(f'{name:<15} - {train_line}, {test_line}')

  for suffix, bags, labels in (
    ('_train.csv', train_bags, train_labels),
    ('_test.csv', test_bags, test_labels),
  ):
    save(bags, labels, prefix + suffix)
  

def save(x, y, filename):
  """Write bags `x` and bag labels `y` to `filename` as a CSV without an index column.

  The first column is named `y`; the columns of `x` follow as `x0`, `x1`, ...
  Missing parent directories of `filename` are created first.
  """
  destination = Path(filename)
  destination.parent.mkdir(parents=True, exist_ok=True)

  # Labels become the leading column, followed by the columns of x.
  table = np.concatenate((y[:, np.newaxis], x), axis=1)

  header = ['y']
  header.extend(f'x{i}' for i in range(x.shape[1]))

  pd.DataFrame(table, columns=header).to_csv(filename, index=False)
  
def pad(arr, max_size):
  """Right-pad the 1-D array `arr` with -1 so that it holds `max_size` entries.

  The padding values are floating point, so an integer `arr` comes back as float.
  """
  missing = max_size - arr.shape[0]
  filler = np.full(missing, -1.0)
  return np.concatenate((arr, filler))

def bag_generator(x, y, get_label, min_size, max_size, seed=42):
  """Group instance indices into randomly sized bags and label each bag.

  The instance indices `0 .. x.shape[0] - 1` are shuffled together with `y`, then
  consumed front to back in consecutive bags whose sizes are drawn uniformly from
  `min_size .. max_size` (inclusive).

  Args:
    x: instances; only `x.shape[0]` is used, bags store instance indices.
    y: instance labels, aligned with `x` along the first axis.
    get_label: callable mapping the labels of one bag to the bag label.
    min_size: smallest bag size that can be drawn.
    max_size: largest bag size that can be drawn; also the padded width of each bag.
    seed: seed used for both the shuffle and the bag-size generator. The default
      of 42 reproduces the previously hard-coded behaviour.

  Returns:
    `(bags, labels)`: `bags` is an int array with one row of instance indices per
    bag, right-padded with -1 up to `max_size`; `labels` holds the bag labels cast
    to int.

  Note:
    Whatever remains once `max_size` or fewer instances are left forms the final
    bag, so that last bag can be shorter than `min_size`.
  """
  order, y_shuf = sklearn.utils.shuffle(np.arange(x.shape[0]), y, random_state=seed)
  rng = np.random.default_rng(seed)

  x_bags, y_bags = [], []
  start = 0

  # Keep drawing bags while more than one full-size bag worth of instances remains.
  while (order.shape[0] - start) > max_size:
    size = rng.integers(min_size, max_size + 1)  # upper bound is exclusive
    stop = start + size

    x_bags.append(pad(order[start:stop], max_size))
    y_bags.append(get_label(y_shuf[start:stop]))
    start = stop

  # The remainder becomes the last bag.
  x_bags.append(pad(order[start:], max_size))
  y_bags.append(get_label(y_shuf[start:]))

  return np.array(x_bags).astype(int), np.array(y_bags).astype(int)

def contains_class(class_id):
  """Build a bag labeler that is true when any instance label equals `class_id`."""
  def has_class(labels):
    matches = np.array(labels) == class_id
    return np.any(matches)
  return has_class

def contains_classes(class_ids):
  """Build a bag labeler that is true when at least one of `class_ids` occurs in the bag."""
  def has_any(labels):
    present = np.isin(class_ids, labels)
    return np.any(present)
  return has_any

def doesnt_contain_classes(class_ids):
  """Build a bag labeler that is true when none of `class_ids` occurs in the bag."""
  def lacks_all(labels):
    absent = np.isin(class_ids, labels, invert=True)
    return np.all(absent)
  return lacks_all

In [2]:
# "Standard" task: one dataset per class i, a bag is positive when it holds an instance of class i.
for i in range(10):
  print(f'---------- Class {i} ----------')
  labeler = contains_class(i)
  for source, folder, title in (
    (tf.keras.datasets.mnist, 'mnist', 'MNIST'),
    (tf.keras.datasets.fashion_mnist, 'fashion', 'Fashion'),
    (tf.keras.datasets.cifar10, 'cifar10', 'CIFAR10'),
  ):
    dataset_generator(source, labeler, f'{BASE}/standard/{folder}/{i}', name=title)

---------- Class 0 ----------
MNIST           - train: pos=4794, neg=7238, %=39.84, test : pos=803, neg=1199, %=40.11
Fashion         - train: pos=4824, neg=7208, %=40.09, test : pos=809, neg=1193, %=40.41
CIFAR10         - train: pos=4049, neg=5987, %=40.34, test : pos=805, neg=1197, %=40.21
---------- Class 1 ----------
MNIST           - train: pos=5308, neg=6724, %=44.12, test : pos=878, neg=1124, %=43.86
Fashion         - train: pos=4845, neg=7187, %=40.27, test : pos=806, neg=1196, %=40.26
CIFAR10         - train: pos=4062, neg=5974, %=40.47, test : pos=792, neg=1210, %=39.56
---------- Class 2 ----------
MNIST           - train: pos=4838, neg=7194, %=40.21, test : pos=823, neg=1179, %=41.11
Fashion         - train: pos=4812, neg=7220, %=39.99, test : pos=812, neg=1190, %=40.56
CIFAR10         - train: pos=4001, neg=6035, %=39.87, test : pos=782, neg=1220, %=39.06
---------- Class 3 ----------
MNIST           - train: pos=4885, neg=7147, %=40.60, test : pos=804, neg=1198, %=40.16


In [3]:
# "Presence" task: a bag is positive when it holds an instance of class i or class i + 1.
for i in range(9):
  print(f'---------- Classes {i}/{i + 1} ----------')
  labeler = contains_classes([ i, i + 1 ])
  for source, folder, title in (
    (tf.keras.datasets.mnist, 'mnist', 'MNIST'),
    (tf.keras.datasets.fashion_mnist, 'fashion', 'Fashion'),
    (tf.keras.datasets.cifar10, 'cifar10', 'CIFAR10'),
  ):
    dataset_generator(source, labeler, f'{BASE}/presence/{folder}/{i}{i + 1}', name=title)

---------- Classes 0/1 ----------
MNIST           - train: pos=8165, neg=3867, %=67.86, test : pos=1351, neg=651, %=67.48
Fashion         - train: pos=7889, neg=4143, %=65.57, test : pos=1310, neg=692, %=65.43
CIFAR10         - train: pos=6584, neg=3452, %=65.60, test : pos=1297, neg=705, %=64.79
---------- Classes 1/2 ----------
MNIST           - train: pos=8178, neg=3854, %=67.97, test : pos=1379, neg=623, %=68.88
Fashion         - train: pos=7866, neg=4166, %=65.38, test : pos=1325, neg=677, %=66.18
CIFAR10         - train: pos=6582, neg=3454, %=65.58, test : pos=1294, neg=708, %=64.64
---------- Classes 2/3 ----------
MNIST           - train: pos=7891, neg=4141, %=65.58, test : pos=1298, neg=704, %=64.84
Fashion         - train: pos=7891, neg=4141, %=65.58, test : pos=1326, neg=676, %=66.23
CIFAR10         - train: pos=6583, neg=3453, %=65.59, test : pos=1291, neg=711, %=64.49
---------- Classes 3/4 ----------
MNIST           - train: pos=7841, neg=4191, %=65.17, test : pos=1298, n

In [4]:
# "Absence" task: a bag is positive when it holds neither class i nor class i + 1.
for i in range(9):
  print(f'---------- Classes {i}/{i + 1} ----------')
  labeler = doesnt_contain_classes([ i, i + 1 ])
  for source, folder, title in (
    (tf.keras.datasets.mnist, 'mnist', 'MNIST'),
    (tf.keras.datasets.fashion_mnist, 'fashion', 'Fashion'),
    (tf.keras.datasets.cifar10, 'cifar10', 'CIFAR10'),
  ):
    dataset_generator(source, labeler, f'{BASE}/absence/{folder}/{i}{i + 1}', name=title)

---------- Classes 0/1 ----------
MNIST           - train: pos=3867, neg=8165, %=32.14, test : pos=651, neg=1351, %=32.52
Fashion         - train: pos=4143, neg=7889, %=34.43, test : pos=692, neg=1310, %=34.57
CIFAR10         - train: pos=3452, neg=6584, %=34.40, test : pos=705, neg=1297, %=35.21
---------- Classes 1/2 ----------
MNIST           - train: pos=3854, neg=8178, %=32.03, test : pos=623, neg=1379, %=31.12
Fashion         - train: pos=4166, neg=7866, %=34.62, test : pos=677, neg=1325, %=33.82
CIFAR10         - train: pos=3454, neg=6582, %=34.42, test : pos=708, neg=1294, %=35.36
---------- Classes 2/3 ----------
MNIST           - train: pos=4141, neg=7891, %=34.42, test : pos=704, neg=1298, %=35.16
Fashion         - train: pos=4141, neg=7891, %=34.42, test : pos=676, neg=1326, %=33.77
CIFAR10         - train: pos=3453, neg=6583, %=34.41, test : pos=711, neg=1291, %=35.51
---------- Classes 3/4 ----------
MNIST           - train: pos=4191, neg=7841, %=34.83, test : pos=704, ne

In [5]:
from enum import Enum

class Type(Enum):
  """Clothing categories and the integer class id each one stands for.

  The outfit labelers in this cell compare these values against instance labels
  of `tf.keras.datasets.fashion_mnist`, so the ids are presumably that dataset's
  label ids (TODO confirm against the dataset documentation).
  """
  TSHIRT = 0
  TROUSER = 1
  PULLOVER = 2
  DRESS = 3
  COAT = 4
  SANDAL = 5
  SHIRT = 6
  SNEAKER = 7
  BAG = 8
  ANKLE_BOOT = 9

def any_in(labels, class_ids):
  """Return whether any entry of `labels` equals the `.value` of a member of `class_ids`."""
  wanted = [ member.value for member in class_ids ]
  hits = np.isin(labels, wanted)
  return np.any(hits)

def simple_outfit(labels):
  """Bag labeler: true when the bag holds a top, trousers and footwear.

  Tops are T-shirt, pullover, dress, coat or shirt; footwear is sandal, sneaker
  or ankle boot.
  """
  tops = [ Type.TSHIRT, Type.PULLOVER, Type.DRESS, Type.COAT, Type.SHIRT ]
  bottoms = [ Type.TROUSER ]
  footwear = [ Type.SANDAL, Type.SNEAKER, Type.ANKLE_BOOT ]

  return any_in(labels, tops) and any_in(labels, bottoms) and any_in(labels, footwear)

def complex_outfit(labels):
  """Bag labeler: true when the bag matches either of two outfit patterns.

  Pattern one: (T-shirt or shirt) with trousers and (sneaker or ankle boot).
  Pattern two: dress with bag and (sandal or ankle boot).
  """
  trousers_outfit = (
    any_in(labels, [ Type.TSHIRT, Type.SHIRT ]) and
    any_in(labels, [ Type.TROUSER ]) and
    any_in(labels, [ Type.SNEAKER, Type.ANKLE_BOOT ])
  )

  # The dress pattern is only evaluated when the trousers pattern did not match.
  return trousers_outfit or (
    any_in(labels, [ Type.DRESS ]) and
    any_in(labels, [ Type.BAG ]) and
    any_in(labels, [ Type.SANDAL, Type.ANKLE_BOOT ])
  )

# "Complex" task: Fashion-MNIST bags labelled by the two outfit rules defined above.
for rule, stem, title in (
  (simple_outfit, 'basic-outfit', 'Basic'),
  (complex_outfit, 'multi-outfit', 'Multi'),
):
  dataset_generator(tf.keras.datasets.fashion_mnist, rule, f'{BASE}/complex/fashion/{stem}', name=title)

Basic           - train: pos=3413, neg=8619, %=28.37, test : pos=574, neg=1428, %=28.67
Multi           - train: pos=2725, neg=9307, %=22.65, test : pos=447, neg=1555, %=22.33
