### P2 quantile estimation experiments

In [1]:
import math
import numpy as np
import os
from Dataset_generation import get_dataset, get_q_batches, get_q_true, get_shuffled_dataset
from Output_data_generation import initialize_folders, get_settings, get_file_name, save_data
from Quantile_procs import get_procs

get_procs() using method p2
p2
stepsize is const


In [2]:
# Names of the available comparison experiments; each one is a key of
# `args_dict` inside `all_method_comparisons`.
sgd_lst = [
    'distro',
    'data_size',
    'step_size',
    'data_sequence',
]

# Quantile levels (tau) that `all_method_comparisons` forwards as `tau_lst`.
tau_vals = [
    0.1,
    0.3,
    0.5,
    0.9,
    0.99,
]

In [3]:
def get_res(procs):
    """Return the last column of a 2-d array of estimation trajectories.

    In this notebook `get_q_ests` passes an array with one row per quantile
    level and one column per processed data point, so the last column holds
    the final estimate for every quantile level.

    Parameters
    ----------
    procs : array-like exposing `.shape` and 2-d slicing

    Returns
    -------
    The slice `procs[:, -1]`.

    Raises
    ------
    Exception
        If `procs` is not 2-dimensional.
    """
    n_dims = len(procs.shape)
    if n_dims == 2:
        # Last column == state after the final update.
        return procs[:, -1]
    raise Exception('Procs of wrong shape:' + str(procs.shape)+ ', should be 2d array')

In [4]:
# Get quantile estimates
def get_q_ests(dataset, stepsize, tau_lst):
    """Run the 'p2' quantile procedure on one dataset or on a stack of datasets.

    Parameters
    ----------
    dataset : array with `.shape`
        Either a 1-d array (a single data stream) or a 2-d array whose rows
        are separate data streams.
    stepsize
        Forwarded unchanged to `get_procs` as `stepsize`.
    tau_lst : sequence
        Quantile levels, forwarded to `get_procs`.

    Returns
    -------
    (res, procs)
        For 1-d input: `procs` is whatever `get_procs` returns and `res` is
        `get_res(procs)`.
        For 2-d input: `res` has shape (n_streams, len(tau_lst)) and `procs`
        has shape (n_streams, len(tau_lst), stream_length).

    Raises
    ------
    Exception
        If `dataset` has more than two dimensions.
    """
    print("get_q_est stepsize is {}".format(stepsize))

    rank = len(dataset.shape)
    if rank > 2:
        raise Exception('Dataset for q_est calculation of wrong shape: {}, should be 1d or 2d array'
                        .format(str(dataset.shape)))

    # Single stream: one call, no pre-allocation needed.
    if rank == 1:
        procs = get_procs(dataset, tau_lst, method_name='p2', stepsize=stepsize)
        return get_res(procs), procs

    # Several streams: fill pre-allocated buffers row by row. The buffer shape
    # assumes get_procs returns a (len(tau_lst), stream_length) array.
    n_streams, n_taus = dataset.shape[0], len(tau_lst)
    res = np.zeros((n_streams, n_taus))
    procs = np.zeros((n_streams, n_taus, dataset.shape[1]))
    for row, stream in enumerate(dataset):
        procs[row] = get_procs(stream, tau_lst, method_name='p2', stepsize=stepsize)
        res[row] = get_res(procs[row])
    return res, procs

In [5]:
def get_normalized_e(true, batches, est):
    """Return the ratio (est - batches) / (true - batches).

    The caller in this notebook passes the true quantiles, the batch
    quantiles of the dataset and the estimated quantiles, in that order.
    The operands are combined with plain `-` and `/`, so NumPy arrays
    broadcast as usual. There is no guard for `true == batches`.
    """
    return (est - batches) / (true - batches)

### Main functions 

In [6]:
def quantile_method_compare(folder_name, distro_lst, datasize_lst, stepsize_lst,
                            g_test=False, s_test=False, tau_lst=[0.1, 0.3, 0.5, 0.9, 0.99],
                            ):
    """Run the quantile estimation experiment for every setting and save the results.

    Parameters
    ----------
    folder_name : str
        Output location, forwarded to `save_data`.
    distro_lst, datasize_lst, stepsize_lst : sequences
        Forwarded to `get_settings`, which produces the list of
        [distro, datasize, stepsize] settings to run.
    g_test : bool
        Forwarded to `get_dataset`.
    s_test : bool
        When true, the estimates are computed on the result of
        `get_shuffled_dataset` instead of on the generated dataset.
        Also forwarded to `get_file_name`.
    tau_lst : list of float
        Quantile levels. The default list is only read, never mutated.

    Raises
    ------
    Exception
        If both `g_test` and `s_test` are true.
    """
    if g_test and s_test: raise Exception("g_test and s_test can't both be true")

    # generate different settings
    setting_lst, changed_setting = get_settings(distro_lst, datasize_lst, stepsize_lst,)
    print (setting_lst)

    # If only the step size changes, generate the dataset and its batch
    # quantiles once and share them across all settings.
    share_dataset = len(distro_lst) == 1 and len(datasize_lst) == 1
    base_dataset, q_batches = None, None
    if share_dataset:
        base_dataset = get_dataset(distro_lst[0], datasize_lst[0], g_test)
        q_batches = get_q_batches(base_dataset, tau_lst)

    # for each setting = [distro, datasize, stepsize]
    for setting in setting_lst:

        # int(): the datasize is converted explicitly because the settings
        # are not guaranteed to carry it as an int.
        distro, datasize, stepsize = setting[0], int(setting[1]), setting[2]
        q_true = get_q_true(distro, tau_lst)
        if not share_dataset:
            base_dataset = get_dataset(distro, datasize, g_test)
            q_batches = get_q_batches(base_dataset, tau_lst)

        # Always shuffle from the generated dataset. Overwriting the shared
        # dataset here would feed an already-shuffled result back into
        # get_shuffled_dataset on the next setting.
        dataset = base_dataset
        if s_test:
            dataset = get_shuffled_dataset(base_dataset, datasize, s_test)

        q_est_res, q_est_proc = get_q_ests(dataset, stepsize, tau_lst)
        E = get_normalized_e(q_true, q_batches, q_est_res)

        data_dict = {
            'q_true': q_true,
            'q_batches': q_batches,
            'q_est_res': q_est_res,
            'q_est_proc': q_est_proc,
            'E': E
        }
        # TODO: generate charts and tables?
        file_name = get_file_name(changed_setting, distro, datasize, stepsize, s_test)
        save_data(folder_name, file_name, tau_lst, data_dict)

In [7]:
def all_method_comparisons(compare_lst, root_folder='SGD'):
    """Run `quantile_method_compare` once for each named comparison.

    Reads the module-level names `distros`, `stepsizes` and `tau_vals` at
    call time, so they must be defined before this function is called.

    Parameters
    ----------
    compare_lst : iterable of str
        Comparison names; each must be one of 'distro', 'data_size',
        'step_size' or 'data_sequence'.
    root_folder : str
        Results are written under "Experiment_results/<root_folder>/<name>/".

    Raises
    ------
    ValueError
        If a name in `compare_lst` is not a known comparison.
    """
    # Each entry: (distro_lst, datasize_lst, stepsize_lst, g_test[, s_test]).
    # s_test defaults to False when the fifth element is absent.
    args_dict = {
        'distro': (distros, [1000], ['const'], True),
        'data_size': (['gau_2'], [100, 1000, 10000], ['const'], True),
        'step_size': (['gau_1'], [1000], stepsizes, True),
        'data_sequence':(['gau_1', 'gau_1'], [1000], ['const'], False, True),
    }

    for t in compare_lst:
        print (t)
        folder_name = "Experiment_results/{}/{}/".format(root_folder, t)

        # Fail with a readable message instead of the opaque
        # "'NoneType' object is not subscriptable" that a missing key
        # would otherwise cause.
        if t not in args_dict:
            raise ValueError("Unknown comparison {!r}; expected one of {}"
                             .format(t, sorted(args_dict)))
        args = args_dict[t]

        distro_lst, datasize_lst, stepsize_lst = args[0], args[1], args[2]
        g_test = args[3]
        s_test = args[4] if len(args) > 4 else False
        tau_lst = tau_vals
        print (s_test)

        quantile_method_compare(folder_name, distro_lst, datasize_lst, stepsize_lst, g_test, s_test, tau_lst)
            

In [8]:
# Distribution identifiers swept by the 'distro' comparison
# (read by `all_method_comparisons`).
distros = [
    'mix',
    'gau_1',
    'gau_2',
    'exp',
]

# Step-size identifiers swept by the 'step_size' comparison
# (read by `all_method_comparisons`).
stepsizes = [
    'const',
    '2_div_sqrt_k',
    '0.002_div_sqrt_k',
]

# Not referenced by any code in the visible cells.
# NOTE(review): presumably mirror counts used inside Dataset_generation - confirm.
N_g = 12  # N_generation
N_s = 10  # N_shuffle

In [13]:
# Run the selected comparisons and store the results under
# Experiment_results/<main_folder>/<comparison>/.
# The folder lists are derived from `main_folder` so that the directories
# created here always match the ones `all_method_comparisons` writes into.

main_folder = 'P2'
compare_lst = ['data_sequence']
initialize_folders(main_fd='Experiment_results/', fd_lst=[main_folder],
                   sub_fd_lst=[main_folder + '/'], subsub_fd_lst=compare_lst)
all_method_comparisons(compare_lst, main_folder)

data_sequence
True
[['gau_1' '1000' 'const']
 ['gau_1' '1000' 'const']]
get_q_est stepsize is const
get_procs() using method p2
p2
stepsize is const
get_procs() using method p2
p2
stepsize is const
get_procs() using method p2
p2
stepsize is const
get_procs() using method p2
p2
stepsize is const
get_procs() using method p2
p2
stepsize is const
get_procs() using method p2
p2
stepsize is const
get_procs() using method p2
p2
stepsize is const
get_procs() using method p2
p2
stepsize is const
get_procs() using method p2
p2
stepsize is const
get_procs() using method p2
p2
stepsize is const
Experiment_results/P2/data_sequence/shuffle_q_true.txt
Experiment_results/P2/data_sequence/shuffle_q_batches.txt
Experiment_results/P2/data_sequence/shuffle_q_est_res.txt
Experiment_results/P2/data_sequence/shuffle_q_est_proc.txt
Experiment_results/P2/data_sequence/shuffle_E.txt
get_q_est stepsize is const
get_procs() using method p2
p2
stepsize is const
get_procs() using method p2
p2
stepsize is const
get_

In [10]:
# def sgd_frugal_compare(distro_lst, datasize, tau_lst=tau_vals):
#     #distro changes, use the biggest datasize, do not shuffle
#     for distro in distro_lst:
#         q_true = get_q_true(distro, tau_lst)
#         dataset = get_dataset(distro, datasize, False)
#         q_batches = get_q_batches(dataset, tau_lst)
        
#         sgd_res, sgd_proc = get_q_ests(dataset, 'const', tau_lst)
        
#         N_frugal = 20
#         frugal_res = np.zeros((N_frugal, len(tau_lst)))
#         frugal_proc = np.zeros((N_frugal, len(tau_lst), datasize))
#         for i in range(N_frugal):
#             frugal_res[i], frugal_proc[i] = get_q_ests(dataset, 'frugal', tau_lst)
        
#         ax_name = 'Tested on '+distro+' distribution with '+str(datasize)+' data points'
#         fig, lgd = plot_procs(ax_name, tau_vals, q_true, frugal_proc, sgd_proc)
#         title = fig.suptitle('Quantile Estimation: Frugal vs SGD')

#         fd = "Experiment_results/Frugal_SGD/"
#         plt.savefig(fd+distro+'.png', bbox_extra_artists=(lgd, title), bbox_inches='tight')
        
        
# sgd_frugal_compare(distros, 1000)

<!---
### The update always yields $q_{k+1} = x$ for each $x$ in the data stream

When $x - q_k > 0$, we have $l(q_k) = \tau(x-q_k)$:
\begin{align}
q_{k+1} & = q_k - \frac{l(q_k)}{l'(q_k)} \\
        & = q_k - \frac{\tau(x-q_k)}{-\tau} \\
        & = q_k - (- x + q_k) \\
        & = x
\end{align}

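The same happens when $x - q_k < 0$: there $l(q_k) = (1-\tau)(q_k - x)$ and $l'(q_k) = 1-\tau$, so again $q_{k+1} = q_k - (q_k - x) = x$.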
-->

In [14]:
# Project-local plotting helper; what it draws is defined in the Plot module,
# which is not shown in this notebook.
from Plot import plot_charts

# Point the plotter at the directory where the data_sequence run saved its
# shuffle_*.txt result files.
plot_charts("Experiment_results/P2/data_sequence")

Experiment_results/P2/data_sequence/shuffle_
