### Show quantile estimation with SGD  "works"

1. Test datasets:
    - Mixsture of 5 Gaussians
    - Sample of 100k from no.1
    - Gausian $\mu = 2, \sigma = 18$
    - Gausian $\mu = 0, \sigma = 0.001$
    - Sample 100 from no.1
    - Sample 100 from no.3
    - Exponential $\lambda$
    - My weird distribution lol
    
4. Test times:
    - 10 $\times$ dataset generation
    - 10 $\times$ SGD ???
    - 10 $\times$ shuffle (does the order matter?)
    
2. Q value: 0.1, 0.3, 0.5, 0.9, 0.99 

3. Step size: 
    - $\alpha_k = 1$
    - $\alpha_k = \frac{2}{\sqrt{k}}$
    - $\alpha_k = \frac{0.002}{\sqrt{k}}$
    - ?
    - ?
    - Why's the Newton method helpful?
       
5. Calculate $E = |q_{batch} - q_{sgd}|$

6. What's a small value of $E$?

7. Investigate the effect of datasize N
    - Convergence rate with same $\alpha_k$?
    - Different Q values?
    - ?

In [104]:
%matplotlib inline

import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

import math
import numpy as np
import matplotlib.pyplot as plt
import import_ipynb
from plt_quantile_comparison import plot_quantile_shuffles, plot_quantile_generations

In [105]:
distro_list = ['mix_gau', 'gau 1', 'gau 2', 'exp', 'my_distro']
stepsize_list = ['const', '2/sqrt_k', '0.002/sqrt_k']

tau_vals = [0.1, 0.3, 0.5, 0.9, 0.99]
N_g = 12 # N_generation
N_s = 10 # N_shuffle

N_q = len(tau_vals)

In [106]:
# all of the inputs are in the form of list
# e.g. distro = [mix_gaussian], datasize = [100, 100000], stepsize = [2]
def get_n_comp(distro, datasize, stepsize):
    print ('get_compared_setting()', distro, datasize, stepsize)
    if len(distro) > 1:
        if len(datasize)!= 1 or len(stepsize)!=1:
            raise Exception('The number of comp_setting is incorrect')
        return len(distro)
    elif len(datasize) > 1:
        if len(stepsize)!=1:
            raise Exception('The number of comp_setting is incorrect')        
        return len(datasize)
    elif len(stepsize) > 1:
        print ('step size')
        return len(stepsize)
    raise Exception('The number of comp_setting is incorrect')

def generate_setting_lst(distro, datasize, q_lst, g_test, s_test):
    setting_lst = []
    if len(distro)>1:
        for dis in distro:
            setting_lst.append([dis, datasize[0], q_lst, g_test, s_test])
    else:
        for size in datasize:
            setting_lst.append([distro[0], size, q_lst, g_test, s_test])
    print (setting_lst)
    return setting_lst

In [107]:
generate_setting_lst([1, 2, 3, 4], [1], tau_vals, False, False)

[[1, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False], [2, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False], [3, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False], [4, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False]]


[[1, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False],
 [2, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False],
 [3, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False],
 [4, 1, [0.1, 0.3, 0.5, 0.9, 0.99], False, False]]

### Dataset and quantile generation

In [108]:
# distro_list = ['mix_gau', 'gau 1', 'gau 2', 'exp', 'my_distro']

def generate_single_dataset(distro, datasize):
    if distro == 'gau 1':
        return np.random.normal(2, 18, datasize)
    elif distro == 'gau 2':
        return np.random.normal(0, 0.001, datasize)
    elif distro == 'mix':
        sizes = np.array([0.3, 0.2, 0.1, 0.15, 0.25]) * datasize
        d1 = np.random.normal(2.0, 7.0, int(sizes[0]))
        d2 = np.random.normal(0, 3.7, int(sizes[1]))
        d3 = np.random.normal(-9, 7, int(sizes[2]))
        d4 = np.random.normal(5, 77, int(sizes[3]))
        d5 = np.random.normal(-7, 7, int(sizes[4]))
        mix_lst = np.append(d1, np.append(d2, np.append(d3, np.append(d4, d5))))
        np.random.shuffle(mix_lst)
        return mix_lst
    elif distro == 'exp':
        return np.random.exponential(scale=1, size=datasize)

In [109]:
# test
# data = generate_single_dataset('gau 1', 1000)
# data2 = generate_single_dataset('gau 2', 1000)
# data_mix = generate_single_dataset('mix', 100000)
# data_exp = generate_single_dataset('exp', 1000)

# num_bins = 100
# plt.hist([data, data2], num_bins, alpha = 0.5, label=['a', 'b'])
# plt.hist(data_mix, num_bins, alpha = 0.5, label='mix')
# plt.hist(data_exp, num_bins, alpha = 0.5, label='mix')
# plt.legend()
# plt.show()

In [110]:
def generate_dataset(distro, datasize, g_test, s_test):
#     print ('generate_dataset', distro, datasize, g_test, s_test)
#     shape of dataset: 1*Datasize,
    if not g_test and not s_test:
        return generate_single_dataset(distro, datasize)
            
    elif g_test and not s_test:
        generated_dt = np.zeros((N_g, datasize))
        for i in range(N_g):
            generated_dt[i] = generate_single_dataset(distro, datasize)
        return generated_dt
    
    elif s_test and not g_test:
        shuffled_dt = np.zeros((N_s, datasize))
        dt = generate_single_dataset(distro, datasize)
        for i in range(N_s): 
            np.random.shuffle(dt)
            shuffled_dt[i] = dt
        return shuffled_dt
        
    dataset = np.zeros((N_g, N_s, datasize))
    for gen_id in range(N_g):
        dt = generate_single_dataset(distro, datasize)
        for shu_id in range(N_s):
#             print (gen_id, shu_id)
            np.random.shuffle(dt)
            dataset[gen_id][shu_id] = dt
    return dataset

In [111]:
def test_generate_dataset():
    distro = 'mix'
    datasize = 10000

    data_shuffle = generate_dataset(distro, datasize, False, True)
    data_gen = generate_dataset(distro, datasize, True, False)

    fig, (ax1, ax2) = plt.subplots(2,1,sharex=True)
    fig.set_size_inches(17, 12)
    num_bins = 100
    ax1.hist([dt for dt in data_shuffle[:3]], num_bins, alpha = 0.5, label=[str(i) for i in range(20)])
    ax1.legend()
    ax2.hist([dt for dt in data_gen[:3]], num_bins, alpha = 0.5, label=[str(i) for i in range(20)])
    ax2.legend()
    plt.show()
# test_generate_dataset()

In [112]:
def generate_q_batches_single_dataset(dataset, q_lst):
    q_batches = np.zeros(len(q_lst))
    for i in range(len(q_lst)):
        q_batches[i] = np.percentile(dataset, q_lst[i]*100)
    return q_batches

# test
# generate_q_batches_single_dataset(dataset, [0])[0] == min(dataset) and generate_q_batches_single_dataset(dataset, [1])[0] == max(dataset)

def generate_q_batches(dataset, q_lst, g_test, s_test):
#     print ('generate_q_batches', dataset, g_test, s_test)
    if not g_test:
        if s_test:
            return generate_q_batches_single_dataset(dataset[0], q_lst)
        else: 
            return generate_q_batches_single_dataset(dataset, q_lst)
    else:
        N_g = dataset.shape[0]
        q_batches = np.zeros((N_g, len(q_lst)))
        for i in range(N_g):
            if not s_test:
                q_batches[i] = generate_q_batches_single_dataset(dataset[i], q_lst)
            else:
                q_batches[i] = generate_q_batches_single_dataset(dataset[i][0], q_lst)
        return q_batches

In [113]:
def set_stepsize(k, stepsize):
    if stepsize=='const':
        return 1
    elif stepsize=='2/sqrt_k':
        return 2/math.sqrt(k)
    elif stepsize=='0.002/sqrt_k':
        return 0.002/math.sqrt(k)
    raise Exception('stepsize parameter is wrong')

def generate_single_q_sgd_proc_single_dataset(dataset, stepsize, tau):
    q_sgd_proc = np.zeros(len(dataset))
    q = 0
    for k, x in enumerate(dataset):
        alpha = set_stepsize(k+1, stepsize)
        if x > q:
            q += alpha*tau
        else:
            q -= alpha*(1-tau)
        q_sgd_proc[k] = q
    return q_sgd_proc

def generate_q_sgd_proc_single_dataset(dataset, stepsize, tau_lst):
    q_sgd_procs = np.zeros((len(tau_lst), len(dataset)))
    for idx, tau in enumerate(tau_lst):
        q_sgd_procs[idx] = generate_single_q_sgd_proc_single_dataset(dataset, stepsize, tau)
    return q_sgd_procs

def generate_q_sgd_proc(dataset, stepsize, tau_lst):
    N_q = len(tau_lst)
    N_dim = len(dataset.shape)
    
    q_shape = list(dataset.shape)
    q_shape.insert(-1, N_q)
    filename = 'q_sgd_procs.dat'
    q_sgd_procs = np.memmap(filename, dtype=np.float32, mode='w+', shape=tuple(q_shape))
    
    if N_dim==1:
        q_sgd_procs[:] = generate_q_sgd_proc_single_dataset(dataset, stepsize, tau_lst)
    
    elif N_dim==2:
        for idx, dt in enumerate(dataset):
            q_sgd_procs[idx] = generate_q_sgd_proc_single_dataset(dt, stepsize, tau_lst)
    
    elif N_dim==3:
        for g_idx, dt_g in enumerate(dataset):
            for s_idx, dt in enumerate(dt_g):
                q_sgd_procs[g_idx][s_idx] = generate_q_sgd_proc_single_dataset(dt, stepsize, tau_lst)
                
    else:  raise Exception('dataset of wrong type! check the dataset!')
        
    return filename, tuple(q_shape)

In [137]:
def generate_q_sgd_res(filename, shape):
    N_dim =len(shape)-1
    print (N_dim)

    q_sgd_proc = np.memmap(filename, dtype=np.float32, mode='r', shape=shape)
#     print (q_sgd_proc)
    
    if N_dim==1:
        return np.array(q_sgd_proc[:, -1])
        
    elif N_dim==2:
        return np.array(q_sgd_proc[:,:,-1])
    
    elif N_dim==3:
        return np.array(q_sgd_proc[:,:,:,-1])
    
    else: raise Execption('q_sgd_proc file got wrong')

In [138]:
# dataset = generate_dataset(distro='gau 1', datasize=10, g_test=True, s_test=True) 
# filename, shape = generate_q_sgd_proc(dataset, '2/sqrt_k', tau_vals)
# # q_sgd_proc = np.memmap(filename, dtype=np.float32, mode='r', shape=shape)

# # print (q_sgd_proc)
# generate_q_sgd_res(filename, shape)

In [139]:
# # test
# dataset = generate_dataset(distro='gau 1', datasize=10000, g_test=False, s_test=False) 
# # # print (generate_single_q_sgd_proc_single_dataset(dataset, '2/sqrt_k', 0.1)[9])
# # # print (generate_single_q_sgd_proc_single_dataset(dataset, '2/sqrt_k', 0.3)[9])
# # # print (generate_single_q_sgd_proc_single_dataset(dataset, '2/sqrt_k', 0.5)[9])
# # # print (generate_single_q_sgd_proc_single_dataset(dataset, '2/sqrt_k', 0.7)[9])
# # # print (generate_single_q_sgd_proc_single_dataset(dataset, '2/sqrt_k', 0.9)[9])

# filename, shape = generate_q_sgd_proc(dataset, '2/sqrt_k', tau_vals)
# procs = np.memmap(filename, dtype=np.float32, mode='r', shape=shape)
# print (procs.shape)

In [140]:
def generate_reusable_data(distro, datasize, q_lst, g_test, s_test):
    print ('generate_reusable_data', (distro, datasize, q_lst, g_test, s_test))
    dataset = generate_dataset(distro, datasize, g_test, s_test)
    q_batches = generate_q_batches(dataset, q_lst, g_test, s_test)
    return dataset, q_batches

def generate_q_sgds(dataset, stepsize, q_lst):
    filename, shape = generate_q_sgd_proc(dataset, stepsize, q_lst)
    # q_sgd_procs are stored by np.memmap
    q_sgd_res  = generate_q_sgd_res(filename, shape)
    return q_sgd_res, filename, shape


In [148]:
# dt, batches = generate_reusable_data('exp', 100, tau_vals, True, True)
# print (batches)
# print ('-----------------------')
# generate_q_sgds(dt, '2/sqrt_k', tau_vals)

### Plots

In [16]:
def draw_charts(q_batches, q_sgd_res, q_sgd_proc, g_test, s_test, charts):
#     return plts
    return 4

def draw_tables(q_batches, q_sgd_res, q_sgd_proc, g_test, s_test, charts):
#     return tbls
    return 5

In [155]:
# Conduct experiments on quantile estimation with SGD with regards to different aspects
# Experiment results are shown in charts and tables
# 
# Return: None? or charts and figures
#
# Inputs: 
#     1. Parameters for the experiments
#         - Settings to change: one of {distro, datasize, stepsize} (all elements are lists/dict)
#         - Settings to control: the left two of {distro, datasize, stepsize}
#                                g_test, s_test
#     2. Values to show in the results
#         - q_values to be compared
#         - charts to be showen in {plt_res, plt_proc, plt_e}
#         - tables to be shown in {tbl_res, tbl_proc}

def quantile_sgd_compare(distro, datasize, stepsize, 
                            g_test=False, s_test=False, q_lst=tau_vals, 
                           charts={'plt_res', 'plt_proc', 'plt_e'}, tables={'tbl_res, tbl_proc'}):
    dataset, q_batches = 0, 0
    N_comp = get_n_comp(distro, datasize, stepsize)
    
    print (N_comp)
    
    if len(stepsize)==1:
        setting_lst = generate_setting_lst(distro, datasize, q_lst, g_test, s_test)
    else:
        dataset, q_batches = generate_reusable_data(distro[0], datasize[0], q_lst, g_test, s_test)
        print (dataset.shape, q_batches)

    # for each round, generate all data and comparison results
    for i in range(N_comp):
        if len(stepsize)==1:
            print ("generate the {}th dataset and q_batches".format(i))
            lst = setting_lst[i]
            distro, datasize, q_lst, g_test, s_test = lst[0], lst[1], lst[2], lst[3], lst[4]
            dataset, q_batches = generate_reusable_data(distro, datasize, q_lst, g_test, s_test)
            print (dataset.shape, q_batches)
            q_sgd_res, proc_filename, proc_shape = generate_q_sgds(dataset, stepsize[], q_lst)
        else:
            print ('different q_sgd', i , 'for same dataset and q_batches')
            q_sgd_res, q_sgd_proc = generate_q_sgds(dataset, stepsize[i], q_lst, g_test, s_test)
        plts = draw_charts(q_batches, q_sgd_res, q_sgd_proc, g_test, s_test, charts)
        tbls = draw_tables(q_batches, q_sgd_res, q_sgd_proc, g_test, s_test, tables)
    # have them together?
    
    return
    

In [156]:
# distro_list = ['mix_gau', 'gau 1', 'gau 2', 'exp', 'my_distro']
# stepsize_list = ['const', '2/sqrt_k', '0.002/sqrt_k']

quantile_sgd_compare(['gau 1', 'gau 2'], [100], ['const'], True)

get_compared_setting() ['gau 1', 'gau 2'] [100] ['const']
2
[['gau 1', 100, [0.1, 0.3, 0.5, 0.9, 0.99], True, False], ['gau 2', 100, [0.1, 0.3, 0.5, 0.9, 0.99], True, False]]
generate the 0th dataset and q_batches
generate_reusable_data ('gau 1', 100, [0.1, 0.3, 0.5, 0.9, 0.99], True, False)
(12, 100) [[-19.04196132  -8.97318663   2.3023195   23.70965925  58.9300356 ]
 [-17.90705828  -5.87288091   5.4292448   28.26014218  43.10672813]
 [-16.57121625  -8.3411478    1.27560859  23.32977088  36.3732865 ]
 [-20.44620671  -8.79976311   3.44848812  27.09742454  41.20886638]
 [-15.68317696  -8.65730367  -1.53636358  22.33990591  52.29035506]
 [-17.03450979  -7.58478974   1.84268746  25.29846944  41.93457159]
 [-19.13170844  -7.2776373    2.80417467  27.11109485  39.00261908]
 [-19.68444954  -8.59247924   1.97495834  23.98273377  30.55112714]
 [-17.33733236  -5.69510831   3.86644233  25.9648976   33.53305345]
 [-19.59790353  -7.81968844  -0.39004706  21.08512102  40.43893453]
 [-25.0674647   -

Exception: stepsize parameter is wrong

In [None]:
# plt: errorbar
# https://matplotlib.org/3.1.1/gallery/lines_bars_and_markers/errorbar_subsample.html#sphx-glr-gallery-lines-bars-and-markers-errorbar-subsample-py