In [1]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict
from ucimlrepo import fetch_ucirepo 
from pprint import pprint
from tqdm import tqdm
from mpmath import mp

# Set mpmath's working precision (mp.dps, in decimal places) to 50 for every
# mp.* computation that follows.
mp.dps = 50

def hexlen(y):
    """Return the number of hexadecimal digits in the magnitude of ``y``.

    Args:
        y: An integer (anything accepted by ``abs`` and ``hex``).

    Returns:
        The digit count of ``abs(y)`` written in base 16; ``hexlen(0)`` is 1.
    """
    # hex() renders negatives as '-0x...'; taking the magnitude first keeps
    # the minus sign from being counted as a digit.
    hexstr = hex(abs(y))
    # Drop the two-character '0x' prefix.
    return len(hexstr) - 2

def declen(y):
    """Estimate the decimal length of ``y`` from its hexadecimal length.

    The result is ``floor(hexlen(y) * log10(16))``; it is an estimate derived
    from the hex digit count, not an exact decimal digit count.
    """
    hex_digits = hexlen(y)
    # One hex digit corresponds to log10(16) decimal digits; round down.
    scaled = hex_digits * np.log10(16)
    return int(np.floor(scaled))

# Sanity checks for the helpers above. The declen checks only bound the result
# from above (<=) because declen is an estimate rather than an exact count.
assert hexlen(16) == 2
assert hexlen(0xABCDEF01) == 8
assert declen(9999) <= 4
assert declen(10000) <= 5


# Upper Bound (Impossibility)

For $\alpha$-error detection to be possible we must have
\begin{align}
  N &\ge \frac{\log 2\alpha}{2} + \sqrt{\frac{(\log 2\alpha)^2}{4} + (\beta |\mathcal X| - 1)\log\frac{1}{2\alpha} }
\end{align}
That is,
\begin{align}
  N &\ge \frac{\log 2\alpha}{2} + \sqrt{\frac{(\log 2\alpha)^2}{4} + \beta |\mathcal X|\log\frac{1}{2\alpha} - \log\frac{1}{2\alpha} } \\
    &= c_1 + \sqrt{c_1^2 + c_2 |\mathcal X| - c_3}
\end{align}
with $c_1 = \frac{\log 2\alpha}{2}$, $c_2 = \beta \log\frac{1}{2\alpha}$ and $c_3 = \log\frac{1}{2\alpha}$, where $\log$ denotes the natural logarithm.


In [2]:
# Parameters of the bound, created from decimal strings as mpmath floats.
alpha = mp.mpf("0.1")
beta = mp.mpf("0.001")
print(f"alpha={alpha}, beta={beta}")

# c3 = ln(1/(2*alpha)) is used on its own and as the factor of beta in c2,
# so evaluate the logarithm once.
c3 = mp.ln(1/(2*alpha))
c2 = beta * c3

# c1 = ln(2*alpha)/2 and its square, both of which enter the bound formula.
c1 = mp.ln(2*alpha)/2
c12 = c1**2

print(f'c1 = {c1}')
print(f'c1^2 = {c12}')
print(f'c2 = {c2}')
print(f'c3 = {c3}')


alpha=0.1, beta=0.001
c1 = -0.80471895621705018730037966661309381976280067713426
c1^2 = 0.64757259849505873629504275124816372384811354164986
c2 = 0.0016094379124341003746007593332261876395256013542685
c3 = 1.6094379124341003746007593332261876395256013542685


In [3]:
# Image datasets, one record each: H and W are the image height and width,
# C and P are the remaining factors of the sample-space size, which a later
# cell computes as P**(H*W*C).
# Key order inside each record is kept exactly as before because it determines
# the column order of the DataFrame built from this list.
all_datasets = [
    {'Dataset': 'B/W MNIST', 'H': 28, 'W': 28, 'C': 1, 'P': 2},
    # Defined in the notebook but deliberately left out of the table:
    # {'Dataset': 'B/W SMALL MNIST', 'H': 8, 'W': 8, 'C': 1, 'P': 2},
    {'Dataset': 'MNIST', 'H': 28, 'W': 28, 'C': 1, 'P': 256},
    {'Dataset': 'Lisa Traffic Sign', 'W': 640, 'H': 480, 'C': 1, 'P': 256},
    # also: MS-Celeb-1M, MS COCO, VGGFace
    {'Dataset': 'ImageNet', 'W': 224, 'H': 224, 'C': 3, 'P': 256},
    # also: GTSRB
    {'Dataset': 'CIFAR10', 'H': 32, 'W': 32, 'C': 3, 'P': 256},
]

# The scratch name `dt` ends up bound to the last appended record.
dt = all_datasets[-1]


In [4]:
# Names of the UCI repository datasets that are fetched and added to the
# table; the commented-out names are currently skipped.
UCI_DATASETS =  [
    'Iris',
    'Heart Disease',
    'Adult',
    #'Dry Bean Dataset',
    #'Wine',
    #'Breast Cancer Wisconsin (Diagnostic)',
    #'Car Evaluation',
    #'Rice (Cammeo and Osmancik)',
    #'Mushroom',
]

In [5]:
# Append one record per UCI dataset to `all_datasets`. The sample-space size K
# is the product, over every feature and target column, of the number of
# distinct values observed in that column (Series.unique() keeps NaN, so a
# missing value counts as one more distinct value).
for dataset_name in tqdm(UCI_DATASETS):

    # Download the dataset by name (requires network access).
    dataset = fetch_ucirepo(name=dataset_name)

    # Data as pandas dataframes.
    X = dataset.data.features
    y = dataset.data.targets

    name = dataset.metadata['name']

    # Skip a part that is absent instead of failing while iterating over it
    # (presumably `targets` is None for unlabeled datasets -- TODO confirm).
    parts = [part for part in (X, y) if part is not None]

    # len() yields Python ints, so the product below stays exact even when it
    # exceeds the 64-bit range.
    categories = [len(part[column].unique()) for part in parts for column in part]
    total_size = math.prod(categories)

    dt = {'Dataset': name,
        'K': total_size,
    }

    # Drop a record left behind by an earlier run of this cell so that
    # re-running it does not add duplicate rows. Only records carrying 'K'
    # (i.e. those created here) are candidates for replacement.
    all_datasets[:] = [d for d in all_datasets
                       if not ('K' in d and d.get('Dataset') == name)]
    all_datasets.append(dt)
    

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.87s/it]


In [6]:
# Column headers (LaTeX) for the image size and the sample-space size.
hxw = r"$H \times W$"
K_name = r"$|\mathcal{X}|$"

# For every record compute the bound c1 + sqrt(c1^2 + c2*|X| - c3) and store
# its base-10 exponent ('N_exp') plus a LaTeX label ('N'). Records are updated
# in place; re-running the cell simply overwrites the same keys.
for dataset in tqdm(all_datasets):
    K = dataset.get('K')
    if K is None:
        # Image dataset: |X| = P**(H*W*C).
        C = dataset['C']
        P = dataset['P']
        H = dataset['H']
        W = dataset['W']
        n_entries = H*W*C
        dataset[hxw] = f"${H!s} \\times {W!s}$"
        # Exponentiate in mpmath rather than building P**n_entries as a Python
        # integer first: that integer has hundreds of thousands of digits for
        # the larger images. For the power-of-two P values used here the
        # result is exact either way.
        K = mp.mpf(P)**n_entries
        dataset[K_name] = f"${P}^{{{n_entries}}}$"
    else:
        # Tabular dataset: K was counted explicitly; report it as a power of
        # ten with the exponent rounded down to two decimals.
        K = mp.mpf(K)
        dataset[K_name] = f"$\\ge 10^{{{mp.floor(mp.log10(K)*100)/100}}}$"

    radicand = c12 + K*c2 - c3
    if radicand < 0:
        # sqrt would turn complex and the failure would surface later as an
        # obscure conversion error; report the real cause instead.
        raise ValueError(
            f"bound undefined for {dataset['Dataset']!r}: "
            f"c1^2 + c2*|X| - c3 = {radicand} is negative")
    boundN = c1 + mp.sqrt(radicand)
    if boundN <= 0:
        # log10 of a non-positive bound is -inf or complex.
        raise ValueError(
            f"bound is vacuous for {dataset['Dataset']!r}: N >= {boundN}")

    boundNlen = mp.log10(boundN)
    dataset['N_exp'] = boundNlen
    # Rounding the exponent down keeps 'N >= 10^k' a valid statement.
    dataset['N'] = f"$ \\ge 10^{{{int(mp.floor(boundNlen))}}}$"

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:13<00:00,  1.67s/it]


In [7]:
# Collect the records into a table ordered by bound exponent, largest first.
df = (
    pd.DataFrame(all_datasets)
    .sort_values("N_exp", axis='index', ascending=False)
)

In [8]:
# Export alpha and beta as LaTeX macros (\myalpha, \mybeta).
# Every backslash is escaped explicitly: the previous "\m" was an invalid
# escape sequence that only produced a literal backslash by accident and
# triggers a warning on current Python versions.
with open('./datasets_df_values.tex', 'w') as fp:
    fp.write(f"\\def\\myalpha{{{alpha!s}}}\n")
    fp.write(f"\\def\\mybeta{{{beta!s}}}\n")

# Export the three presentation columns as a LaTeX table without the index.
with open('./datasets_df.tex', 'w') as fp:
    columns=['Dataset', K_name, 'N']
    df0 = df[columns]
    fp.write(df0.style.hide(axis="index").to_latex(hrules=True))

# Display the full table as the cell output.
df

Unnamed: 0,Dataset,H,W,C,P,$H \times W$,$|\mathcal{X}|$,N_exp,N,K
2,Lisa Traffic Sign,480.0,640.0,1.0,256.0,$480 \times 640$,$256^{307200}$,369904.262009013838238104437599165121685032728...,$ \ge 10^{369904}$,
3,ImageNet,224.0,224.0,3.0,256.0,$224 \times 224$,$256^{150528}$,181252.376086344790971996837142062035725729347...,$ \ge 10^{181252}$,
4,CIFAR10,32.0,32.0,3.0,256.0,$32 \times 32$,$256^{3072}$,3697.65992383274648624850730008266070520851898...,$ \ge 10^{3697}$,
1,MNIST,28.0,28.0,1.0,256.0,$28 \times 28$,$256^{784}$,942.633403515990587652368935564100524226045191...,$ \ge 10^{942}$,
0,B/W MNIST,28.0,28.0,1.0,2.0,$28 \times 28$,$2^{784}$,116.607095414026187985869408440091658774132156...,$ \ge 10^{116}$,
7,Adult,,,,,,$\ge 10^{21.86}$,9.53502518664775202578858358460327090834513010...,$ \ge 10^{9}$,7300895730574137753600
6,Heart Disease,,,,,,$\ge 10^{13.51}$,5.36038124972994045613337413012115050908720645...,$ \ge 10^{5}$,32665651200000
5,Iris,,,,,,$\ge 10^{6.35}$,1.77688157426405519778988823509706261968124971...,$ \ge 10^{1}$,2284590


In [9]:
# Display the sorted summary table (the same frame the previous cell showed).
df

Unnamed: 0,Dataset,H,W,C,P,$H \times W$,$|\mathcal{X}|$,N_exp,N,K
2,Lisa Traffic Sign,480.0,640.0,1.0,256.0,$480 \times 640$,$256^{307200}$,369904.262009013838238104437599165121685032728...,$ \ge 10^{369904}$,
3,ImageNet,224.0,224.0,3.0,256.0,$224 \times 224$,$256^{150528}$,181252.376086344790971996837142062035725729347...,$ \ge 10^{181252}$,
4,CIFAR10,32.0,32.0,3.0,256.0,$32 \times 32$,$256^{3072}$,3697.65992383274648624850730008266070520851898...,$ \ge 10^{3697}$,
1,MNIST,28.0,28.0,1.0,256.0,$28 \times 28$,$256^{784}$,942.633403515990587652368935564100524226045191...,$ \ge 10^{942}$,
0,B/W MNIST,28.0,28.0,1.0,2.0,$28 \times 28$,$2^{784}$,116.607095414026187985869408440091658774132156...,$ \ge 10^{116}$,
7,Adult,,,,,,$\ge 10^{21.86}$,9.53502518664775202578858358460327090834513010...,$ \ge 10^{9}$,7300895730574137753600
6,Heart Disease,,,,,,$\ge 10^{13.51}$,5.36038124972994045613337413012115050908720645...,$ \ge 10^{5}$,32665651200000
5,Iris,,,,,,$\ge 10^{6.35}$,1.77688157426405519778988823509706261968124971...,$ \ge 10^{1}$,2284590
