### Show that quantile estimation with SGD "works"

1. Test datasets:
    - Mixture of 5 Gaussians
    - Sample of 100k from no.1
    - Gaussian $\mu = 2, \sigma = 18$
    - Gaussian $\mu = 0, \sigma = 0.001$
    - Sample 100 from no.1
    - Sample 100 from no.3
    - Exponential $\lambda$
    - My weird distribution lol
    
4. Test times:
    - 10 $\times$ dataset generation
    - 10 $\times$ SGD ???
    - 10 $\times$ shuffle (does the order matter?)
    
2. Q value: 0.1, 0.3, 0.5, 0.9, 0.99 

3. Step size: 
    - $\alpha_k = 1$
    - $\alpha_k = \frac{2}{\sqrt{k}}$
    - $\alpha_k = \frac{0.002}{\sqrt{k}}$
    - ?
    - ?
    - Why's the Newton method helpful?
       
5. Calculate $E = |q_{batch} - q_{sgd}|$

6. What's a small value of $E$?

7. Investigate the effect of datasize N
    - Convergence rate with same $\alpha_k$?
    - Different Q values?
    - ?

In [1]:
%matplotlib inline

import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell


import numpy as np
import matplotlib.pyplot as plt
import import_ipynb
from plt_quantile_comparison import plot_quantile_shuffles, plot_quantile_generations

importing Jupyter notebook from plt_quantile_comparison.ipynb


In [2]:
distro_list = ['mix_gau', 'gau 1', 'gau 2', 'exp', 'my_distro']

q_vals = [0.1, 0.3, 0.5, 0.9, 0.99]
N_g = 12 # N_generation
N_s = 10 # N_shuffle

N_q = len(q_vals)

In [3]:
# all of the inputs are in the form of list
# e.g. distro = [mix_gaussian], datasize = [100, 100000], stepsize = [2]
def get_n_comp(distro, datasize, stepsize):
    print ('get_compared_setting()', distro, datasize, stepsize)
    if len(distro) > 1:
        if len(datasize)!= 1 or len(stepsize)!=1:
            raise Exception('The number of comp_setting is incorrect')
        return len(distro)
    elif len(datasize) > 1:
        if len(stepsize)!=1:
            raise Exception('The number of comp_setting is incorrect')        
        return len(datasize)
    elif len(stepsize) > 1:
        print ('step size')
        return len(stepsize)
    raise Exception('The number of comp_setting is incorrect')

def generate_setting_lst(distro, datasize, q_lst, g_test, s_test):
    setting_lst = []
    if len(distro)>1:
        for dis in distro:
            setting_lst.append([dis, datasize[0], q_lst, g_test, s_test])
    else:
        for size in datasize:
            setting_lst.append([distro[0], size, q_lst, g_test, s_test])
    print (setting_lst)
    return setting_lst

In [4]:
# Quick check of generate_setting_lst: four distro entries with a single
# datasize should expand to four settings, one per distro entry.
generate_setting_lst([1, 2, 3, 4], [1], q_vals, False, False)

[[1, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False], [2, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False], [3, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False], [4, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False]]


[[1, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False],
 [2, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False],
 [3, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False],
 [4, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False]]

### Dataset and quantile generation

In [5]:
# distro_list = ['mix_gau', 'gau 1', 'gau 2', 'exp', 'my_distro']

def generate_single_dataset(distro, datasize):
    """Draw one random sample from the named distribution.

    Supported names:
        'gau 1'           normal with mean 2 and standard deviation 18
        'gau 2'           normal with mean 0 and standard deviation 0.001
        'mix' / 'mix_gau' shuffled concatenation of five normal samples
                          with weights 0.3, 0.2, 0.1, 0.15, 0.25
        'exp'             exponential with scale 1

    Args:
        distro: name of the distribution (see above).
        datasize: number of samples to draw.

    Returns:
        A numpy array of samples. For the mixture the array always holds
        exactly ``datasize`` values.

    Raises:
        ValueError: if ``distro`` is not one of the supported names.
    """
    if distro == 'gau 1':
        return np.random.normal(2, 18, datasize)
    if distro == 'gau 2':
        return np.random.normal(0, 0.001, datasize)
    if distro in ('mix', 'mix_gau'):
        weights = np.array([0.3, 0.2, 0.1, 0.15, 0.25])
        # (mean, standard deviation) of each mixture component.
        components = [(2.0, 7.0), (0, 3.7), (-9, 7), (5, 77), (-7, 7)]

        # Truncating each weight * datasize independently can lose samples
        # (e.g. datasize=7 would yield only 5 values). Rounding the
        # cumulative boundaries instead guarantees the counts add up to
        # datasize, and gives the same counts as before for sizes such as
        # 100 or 10000 where every product is a whole number.
        bounds = np.round(np.cumsum(weights) * datasize).astype(int)
        bounds[-1] = datasize
        counts = np.diff(np.concatenate(([0], bounds)))

        mix_lst = np.concatenate([
            np.random.normal(mean, std, int(count))
            for (mean, std), count in zip(components, counts)
        ])
        np.random.shuffle(mix_lst)
        return mix_lst
    if distro == 'exp':
        return np.random.exponential(scale=1, size=datasize)
    # Previously an unknown name fell through and returned None, which only
    # failed later with an unrelated-looking error.
    raise ValueError('Unknown distribution: {!r}'.format(distro))



In [6]:
# data = generate_single_dataset('gau 1', 1000)
# data2 = generate_single_dataset('gau 2', 1000)
# data_mix = generate_single_dataset('mix', 100000)
# data_exp = generate_single_dataset('exp', 1000)

# num_bins = 100
# plt.hist([data, data2], num_bins, alpha = 0.5, label=['a', 'b'])
# plt.hist(data_mix, num_bins, alpha = 0.5, label='mix')
# plt.hist(data_exp, num_bins, alpha = 0.5, label='mix')
# plt.legend()
# plt.show()

In [7]:
def generate_dataset(distro, datasize, g_test, s_test):
    """Build the dataset for one experiment setting.

    Depending on the two flags the result is:
        neither flag        one sample from generate_single_dataset
        g_test only         array (N_g, datasize): N_g independent samples
        s_test only         array (N_s, datasize): N_s shuffles of one sample
        both flags          array (N_g, N_s, datasize): N_s shuffles of each
                            of N_g independent samples

    N_g and N_s are the module-level repetition counts.
    """
    repeat_generation = bool(g_test)
    repeat_shuffle = bool(s_test)

    if repeat_generation and repeat_shuffle:
        stacked = np.zeros((N_g, N_s, datasize))
        for g in range(N_g):
            sample = generate_single_dataset(distro, datasize)
            for s in range(N_s):
                # The shuffle is in place; the row assignment copies the
                # current order into the output array.
                np.random.shuffle(sample)
                stacked[g, s] = sample
        return stacked

    if repeat_generation:
        generations = np.zeros((N_g, datasize))
        for g in range(N_g):
            generations[g] = generate_single_dataset(distro, datasize)
        return generations

    if repeat_shuffle:
        shuffles = np.zeros((N_s, datasize))
        sample = generate_single_dataset(distro, datasize)
        for s in range(N_s):
            np.random.shuffle(sample)
            shuffles[s] = sample
        return shuffles

    return generate_single_dataset(distro, datasize)


In [8]:
def test_generate_dataset():
    """Visual check of generate_dataset for the 'mix' distribution.

    Draws two stacked histograms sharing the x axis: the first few rows of a
    shuffle-only dataset on top, and the first few rows of a
    generation-only dataset below.
    """
    distro = 'mix'
    datasize = 10000
    num_bins = 100
    n_shown = 3  # number of rows of each dataset to overlay

    data_shuffle = generate_dataset(distro, datasize, False, True)
    data_gen = generate_dataset(distro, datasize, True, False)

    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    fig.set_size_inches(17, 12)

    panels = (
        (ax1, data_shuffle, 'Shuffles of one generated dataset'),
        (ax2, data_gen, 'Independently generated datasets'),
    )
    for ax, rows, title in panels:
        # A list of 1-D arrays makes hist treat each row as its own series.
        series = list(rows[:n_shown])
        # One label per plotted series (previously 20 labels were passed
        # for 3 series).
        labels = [str(i) for i in range(len(series))]
        ax.hist(series, num_bins, alpha=0.5, label=labels)
        ax.set_title(title)
        ax.legend()
    plt.show()
# test_generate_dataset()

In [9]:
def generate_q_batches_single_dataset(dataset, q_lst):
    """Compute the batch quantiles of ``dataset`` with ``np.percentile``.

    Args:
        dataset: array of samples.
        q_lst: quantile levels as fractions (e.g. 0.5 for the median); each
            is multiplied by 100 before being passed to ``np.percentile``.

    Returns:
        A float array with one quantile value per entry of ``q_lst``.
    """
    return np.array(
        [np.percentile(dataset, level * 100) for level in q_lst],
        dtype=float,
    )


def generate_q_batches(dataset, q_lst, g_test, s_test):
    """Compute batch quantiles for a dataset built by generate_dataset.

    When the dataset contains shuffles (``s_test``), only the first shuffle
    of each generation is used: the shuffles are reorderings of the same
    values.

    Returns:
        Without ``g_test``: a 1-D array of ``len(q_lst)`` quantiles.
        With ``g_test``: an array of shape ``(dataset.shape[0], len(q_lst))``
        holding one row of quantiles per generation.
    """
    if not g_test:
        sample = dataset[0] if s_test else dataset
        return generate_q_batches_single_dataset(sample, q_lst)

    n_generations = dataset.shape[0]
    per_generation = np.zeros((n_generations, len(q_lst)))
    for g in range(n_generations):
        sample = dataset[g][0] if s_test else dataset[g]
        per_generation[g] = generate_q_batches_single_dataset(sample, q_lst)
    return per_generation

In [10]:
def generate_reusable_data(distro, datasize, q_lst, g_test, s_test):
    """Generate a dataset together with its batch quantiles.

    Prints the settings it was called with, then returns the tuple
    ``(dataset, q_batches)`` produced by generate_dataset and
    generate_q_batches.
    """
    settings = (distro, datasize, q_lst, g_test, s_test)
    print ('generate_reusable_data', settings)
    dataset = generate_dataset(distro, datasize, g_test, s_test)
    return dataset, generate_q_batches(dataset, q_lst, g_test, s_test)

def generate_q_sgds(dataset, stepsize, q_lst, g_test, s_test):
    """Placeholder for the SGD quantile estimation.

    Not implemented yet: all arguments are ignored and the constant pair
    ``(0, 1)`` is returned in the positions callers unpack as
    ``(q_sgd_res, q_sgd_proc)``.
    """
    # Planned implementation:
    #     q_sgd_proc = generate_q_sgd_proc(dataset, stepsize, q_lst, g_test, s_test)
    #     q_sgd_res  = generate_q_sgd_res(q_sgd_proc)
    q_sgd_res, q_sgd_proc = 0, 1
    return q_sgd_res, q_sgd_proc

In [11]:
# Exponential data with both generation and shuffle repetitions enabled.
# The resulting array has shape (N_g, N_s, 1000000), so this cell is heavy
# on memory and time.
dataset, q_batches = generate_reusable_data(
    distro='exp', datasize=1000000, q_lst=q_vals, g_test=True, s_test=True)

# Shape followed by overall mean, standard deviation and median.
print(dataset.shape, '\n', dataset.mean(), dataset.std(), np.median(dataset))
# One row of batch quantiles per generation.
print(q_batches)

generate_reusable_data ('exp', 1000000, [0.1, 0.3, 0.5, 0.9, 0.99], True, True)
(12, 10, 1000000) 
 0.9999034936378891 1.0002577204535932 0.6928631958092716
[[0.10547028 0.3572514  0.69495467 2.30310341 4.60234149]
 [0.10567014 0.35626851 0.69251095 2.30341015 4.60100511]
 [0.10542288 0.35586933 0.6921345  2.29817475 4.60765051]
 [0.10541072 0.35694838 0.69268425 2.30387933 4.62174621]
 [0.10504822 0.35628694 0.6919059  2.30136075 4.5826878 ]
 [0.10565782 0.35598138 0.69180675 2.30430292 4.60398364]
 [0.10539599 0.35656933 0.69282979 2.30464898 4.6027081 ]
 [0.10574974 0.35673494 0.69266046 2.30045767 4.61739305]
 [0.1052071  0.35710159 0.69368236 2.30254107 4.59188449]
 [0.10570816 0.35670704 0.69321639 2.30363613 4.61720431]
 [0.10546119 0.35637999 0.69282301 2.30617245 4.59837913]
 [0.10509111 0.35669309 0.69340076 2.30512209 4.6217217 ]]


### Plots generation

In [12]:
def draw_charts(q_batches, q_sgd_res, q_sgd_proc, g_test, s_test, charts):
    """Placeholder for chart drawing.

    Not implemented yet: every argument is ignored and the constant 4 is
    returned where the charts are meant to be returned later.
    """
    plts = 4  # stand-in for the future chart objects
    return plts

def draw_tables(q_batches, q_sgd_res, q_sgd_proc, g_test, s_test, charts):
    """Placeholder for table drawing.

    Not implemented yet: every argument is ignored and the constant 5 is
    returned where the tables are meant to be returned later.
    """
    tbls = 5  # stand-in for the future table objects
    return tbls

In [28]:
def quantile_sgd_compare(distro, datasize, stepsize, 
                         g_test=False, s_test=False, q_lst=q_vals, 
                         charts={'plt_res', 'plt_proc', 'plt_e'},
                         tables={'tbl_res', 'tbl_proc'}):
    """Run quantile-estimation-with-SGD experiments and show the results.

    Exactly one of ``distro``, ``datasize`` and ``stepsize`` is the setting
    being varied (a list with more than one element); the other two are
    single-element lists that stay fixed. get_n_comp validates this and
    raises Exception otherwise.

    Args:
        distro: list of distribution names.
        datasize: list of dataset sizes.
        stepsize: list of step sizes.
        g_test: passed on to the data generation and SGD helpers
            (generation-repetition flag).
        s_test: passed on to the data generation and SGD helpers
            (shuffle-repetition flag).
        q_lst: quantile levels to compare.
        charts: charts to be shown, drawn from
            {'plt_res', 'plt_proc', 'plt_e'}.
        tables: tables to be shown, drawn from {'tbl_res', 'tbl_proc'}.

    Returns:
        None. (Returning the charts and tables is still an open question.)
    """
    N_comp = get_n_comp(distro, datasize, stepsize)
    
    print (N_comp)

    vary_stepsize = len(stepsize) != 1
    if vary_stepsize:
        # Only the step size changes, so one dataset and one set of batch
        # quantiles are shared by every comparison.
        dataset, q_batches = generate_reusable_data(distro[0], datasize[0], q_lst, g_test, s_test)
        print (dataset.shape, q_batches)
    else:
        setting_lst = generate_setting_lst(distro, datasize, q_lst, g_test, s_test)

    # for each round, generate all data and comparison results
    for i in range(N_comp):
        if vary_stepsize:
            print ('different q_sgd', i)
            step = stepsize[i]
        else:
            print ("generate the {}th dataset and q_batches".format(i))
            # Each setting is [distro, datasize, q_lst, g_test, s_test];
            # unpack it straight into the call instead of overwriting this
            # function's own parameters.
            dataset, q_batches = generate_reusable_data(*setting_lst[i])
            print (dataset.shape, q_batches)
            # Pass the single step size itself, matching the scalar passed
            # in the varying-step-size branch.
            step = stepsize[0]
        q_sgd_res, q_sgd_proc = generate_q_sgds(dataset, step, q_lst, g_test, s_test)
        draw_charts(q_batches, q_sgd_res, q_sgd_proc, g_test, s_test, charts)
        draw_tables(q_batches, q_sgd_res, q_sgd_proc, g_test, s_test, tables)
    # TODO: collect the charts and tables of all rounds together?
    
    return
    

In [29]:
# Step-size comparison: one distribution and one data size are held fixed
# while three step sizes are compared; the fourth argument sets g_test=True.
quantile_sgd_compare(['gau 1'], [100], [2, 3, 4], True)

get_compared_setting() ['gau 1'] [100] [2, 3, 4]
step size
3
generate_reusable_data ('gau 1', 100, [0.1, 0.3, 0.5, 0.9, 0.99], True, False)
(12, 100) [[-23.62333892  -9.19081682  -0.79794357  18.91473365  37.97894605]
 [-24.13639435  -6.97242004  -0.85398578  18.20687468  41.45425806]
 [-15.90165152  -6.05168069   2.08176721  26.82313176  42.84113451]
 [-19.10568246  -7.16856801   2.80144182  23.41060872  35.9572    ]
 [-19.72066394  -8.11831581   3.88529659  29.36675737  46.28148199]
 [-19.86753775  -9.86769591   0.65146582  22.76833742  29.78445232]
 [-16.94340811  -7.50098772  -0.12258783  22.88168449  41.72763182]
 [-23.39625436  -5.00641602   4.28856299  29.74279972  38.83326316]
 [-19.97911495  -8.13402831   1.54195416  24.8938768   32.72577811]
 [-20.07108511  -6.22554944  -0.55848903  28.97152843  43.4278944 ]
 [-18.99926039  -7.06994621  -0.24780702  21.04037702  39.2583893 ]
 [-20.28833445  -4.82197652   2.41053216  24.53318208  35.41429514]]
different q_sgd 0
different q_sgd

In [None]:
# plt: errorbar
# https://matplotlib.org/3.1.1/gallery/lines_bars_and_markers/errorbar_subsample.html#sphx-glr-gallery-lines-bars-and-markers-errorbar-subsample-py