### New file

In [243]:
%matplotlib inline

import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
import os
import matplotlib.colors as mc
import colorsys

In [244]:
# Names of the sample distributions; each one is handled by a branch of get_one_dt.
distros = ['mix', 'gau 1', 'gau 2', 'exp']
# Labels of the step-size schedules to compare.
# NOTE(review): how a label is turned into an actual step size is not shown in
# the visible cells (get_q_ests ignores it so far) -- confirm with the estimator code.
stepsizes = ['const', '2/sqrt_k', '0.002/sqrt_k']

# Quantile levels tau in (0, 1); used as the default tau_lst of quantile_sgd_compare.
tau_vals = [0.1, 0.3, 0.5, 0.9, 0.99]
N_g = 12 # N_generation: number of independently generated datasets built by get_dataset when g_test is set
N_s = 10 # N_shuffle: presumably the number of shuffles for the s_test mode -- not used in the visible cells, TODO confirm

# Number of quantile levels being tracked.
N_q = len(tau_vals)

# Colour helpers: normalise values in [0, 1] and map them through the 'cool' colormap.
# NOTE(review): neither object is referenced in the visible cells; presumably used by later plotting code.
c_Norm = colors.Normalize(vmin=0, vmax=1)
scalarMap = cmx.ScalarMappable(norm=c_Norm, cmap=plt.get_cmap('cool'))

In [245]:
def get_settings(distro_lst, datasize_lst, stepsize_lst, est_lst):
    """Build the table of experiment settings for a one-factor sweep.

    Each argument is a sequence of candidate values for one factor
    (distribution name, dataset size, step-size label, estimator name).
    At most one factor may hold more than one value; every single-valued
    factor is repeated so that all factors line up with the varying one.

    Args:
        distro_lst: distribution names.
        datasize_lst: dataset sizes.
        stepsize_lst: step-size labels.
        est_lst: estimator names.

    Returns:
        A tuple ``(settings, is_stepsize)`` where ``settings`` is a 2-D numpy
        array with one row per setting and the columns
        ``[distro, datasize, stepsize, est]`` (numpy coerces mixed str/int
        input to strings, so callers must convert the data size back with
        ``int``), and ``is_stepsize`` is True when ``stepsize_lst`` holds
        more than one value.

    Raises:
        ValueError: if any factor is empty or more than one factor varies.
    """
    # Copy into plain lists so that `* n` below always means "repeat", even
    # when a caller passes a numpy array (where `*` multiplies element-wise).
    factors = [list(distro_lst), list(datasize_lst), list(stepsize_lst), list(est_lst)]
    lengths = [len(factor) for factor in factors]

    # A single setting (all lengths 1) is a valid degenerate sweep; an empty
    # factor or two simultaneously varying factors are not.
    n_varying = sum(1 for n in lengths if n != 1)
    if min(lengths) == 0 or n_varying > 1:
        raise ValueError("Setting inputs are wrong!")

    n_settings = max(lengths)
    columns = [factor * n_settings if len(factor) == 1 else factor
               for factor in factors]
    return np.asarray(columns).T, len(stepsize_lst) > 1


In [246]:
# Sanity check: sweeping only the dataset size should yield one row per size,
# with the fixed factors repeated on every row.
get_settings(
    distro_lst=['mix'],
    datasize_lst=[11, 100, 200],
    stepsize_lst=['const'],
    est_lst=['sgd'],
)[0]

array([['mix', '11', 'const', 'sgd'],
       ['mix', '100', 'const', 'sgd'],
       ['mix', '200', 'const', 'sgd']], dtype='<U5')

In [247]:
def get_one_dt(distro, datasize):
    """Draw ``datasize`` i.i.d. samples from one of the named test distributions.

    Supported names:
        'gau 1': normal with mean 2 and standard deviation 18.
        'gau 2': normal with mean 0 and standard deviation 0.001.
        'mix':   five-component Gaussian mixture (see below).
        'exp':   exponential with scale 1.

    All draws come from numpy's global random state (``np.random``).

    Args:
        distro: name of the distribution.
        datasize: number of samples to draw.

    Returns:
        1-D float array of length ``datasize``.

    Raises:
        ValueError: if ``distro`` is not one of the supported names.
    """
    if distro == 'gau 1':
        return np.random.normal(2, 18, datasize)
    if distro == 'gau 2':
        return np.random.normal(0, 0.001, datasize)
    if distro == 'exp':
        return np.random.exponential(scale=1, size=datasize)
    if distro == 'mix':
        # Mixture weights and the (mean, std) of each Gaussian component.
        # Analytic moments of this mixture:
        #   mean = sum(w * mu)                          = -14.3
        #   var  = sum(w * (mu**2 + sd**2)) - mean**2  ~= 2396.31
        #   std                                         ~= 48.95
        weights = np.array([0.3, 0.2, 0.1, 0.15, 0.25])
        means = np.array([2.0, 0.0, 36.0, 5.0, -77.0])
        stds = np.array([7.0, 0.7, 26.0, 77.0, 7.0])

        # Pick a component per sample, then draw every sample in one
        # broadcast call; this replaces a per-sample Python loop whose manual
        # cumulative-weight scan could step past the last component on
        # floating-point round-off.
        component = np.random.choice(len(weights), size=datasize, p=weights)
        return np.random.normal(means[component], stds[component])
    raise ValueError("distribution doesn't work!")

In [248]:
def get_dataset(distro, datasize, g_test):
    """Generate the sample(s) for one experiment setting.

    Args:
        distro: distribution name understood by get_one_dt.
        datasize: number of points per sample.
        g_test: when true, generate N_g independent samples instead of one.

    Returns:
        A 1-D array of length ``datasize`` when ``g_test`` is false, otherwise
        a 2-D array of shape ``(N_g, datasize)`` with one sample per row.
    """
    if not g_test:
        return get_one_dt(distro, datasize)

    samples = np.zeros((N_g, datasize))
    for row in samples:
        # `row` is a view into `samples`, so this fills the table in place.
        row[:] = get_one_dt(distro, datasize)
    return samples

In [249]:
def get_q_batch(dataset, tau_lst):
    """Compute the empirical ("batch") quantiles of a single 1-D sample.

    Args:
        dataset: 1-D numpy array of observations.
        tau_lst: quantile levels in [0, 1].

    Returns:
        1-D float array holding, for every tau, the ``tau*100``-th percentile
        of ``dataset`` as computed by ``np.percentile``.

    Raises:
        Exception: if ``dataset`` is not one-dimensional.
    """
    if dataset.ndim != 1:
        raise Exception('Dataset for q_batch calculation of wrong shape: ' + str(dataset.shape))

    return np.array([np.percentile(dataset, tau*100) for tau in tau_lst], dtype=float)

In [250]:
def get_q_batches(dataset, tau_lst):
    """Compute batch quantiles for one sample or for a stack of samples.

    Args:
        dataset: a 1-D sample, or a 2-D array with one sample per row.
        tau_lst: quantile levels in [0, 1].

    Returns:
        For 1-D input, the result of get_q_batch (one value per tau).
        Otherwise an array of shape ``(dataset.shape[0], len(tau_lst))`` whose
        i-th row holds the batch quantiles of ``dataset[i]``.
    """
    # Single sample (the g_test = False case): nothing to stack.
    if dataset.ndim == 1:
        return get_q_batch(dataset, tau_lst)

    n_samples = dataset.shape[0]
    stacked = np.zeros((n_samples, len(tau_lst)))
    for row_idx, sample in enumerate(dataset):
        stacked[row_idx] = get_q_batch(sample, tau_lst)
    return stacked

In [251]:
def get_q_ests(dataset, step_size, tau_lst, est_type):
    """Allocate the result containers for the streaming quantile estimates.

    In its current form this is a placeholder: it validates the input shape,
    prints the shapes of the two containers and returns them filled with
    zeros. ``step_size`` and ``est_type`` are accepted but not used yet.

    Args:
        dataset: 2-D array with one data stream per row.
        step_size: step-size label (unused here).
        tau_lst: quantile levels.
        est_type: estimator name (unused here).

    Returns:
        ``(res, proc)`` where ``res`` has shape ``(n_rows, n_tau)`` and
        ``proc`` has shape ``(n_rows, n_tau, n_cols)``.

    Raises:
        Exception: if ``dataset`` is not two-dimensional.
    """
    if dataset.ndim != 2:
        raise Exception('Dataset for q_est calculation of wrong shape:' + str(dataset.shape))

    n_rows, n_cols = dataset.shape
    n_tau = len(tau_lst)

    res = np.zeros((n_rows, n_tau))
    proc = np.zeros((n_rows, n_tau, n_cols))
    print('res ', res.shape)
    print('proc', proc.shape)
    return res, proc

In [252]:
def quantile_sgd_compare(distro_lst, datasize_lst, stepsize_lst, est_lst,
                         g_test=False, s_test=False, tau_lst=tau_vals):
    """Run the quantile-estimation comparison over a one-factor sweep.

    The four ``*_lst`` arguments are expanded by get_settings into one setting
    per row. For every setting a dataset is generated, its batch quantiles are
    computed and get_q_ests is called on it. When only the step size varies,
    the dataset and its batch quantiles are generated once and shared by all
    settings.

    Args:
        distro_lst: distribution names.
        datasize_lst: dataset sizes.
        stepsize_lst: step-size labels.
        est_lst: estimator names.
        g_test: generate N_g independent datasets per setting instead of one.
        s_test: only checked for exclusivity with ``g_test``; no shuffling is
            implemented in this function yet.
        tau_lst: quantile levels to estimate.

    Returns:
        None. The settings table is printed; results are not yet collected.

    Raises:
        ValueError: if both ``g_test`` and ``s_test`` are true.
    """
    if g_test and s_test:
        raise ValueError("g_test and s_test can't both be true")

    # generate different settings
    setting_lst, is_stepsize = get_settings(distro_lst, datasize_lst, stepsize_lst, est_lst)
    print (setting_lst)

    # if only stepsize changes, generate dataset and q_batches once up front
    dataset, q_batches = None, None
    if is_stepsize:
        dataset = get_dataset(distro_lst[0], int(datasize_lst[0]), g_test)
        q_batches = get_q_batches(dataset, tau_lst)

    # for each setting = [distro, datasize, stepsize, est_type]
    for setting in setting_lst:
        # settings come back as strings, so the data size must be re-parsed
        distro, datasize, stepsize, est_type = setting[0], int(setting[1]), setting[2], setting[3]
        if not is_stepsize:
            dataset = get_dataset(distro, datasize, g_test)
            q_batches = get_q_batches(dataset, tau_lst)

        # get_dataset returns a single 1-D sample when g_test is false, but
        # get_q_ests only accepts 2-D input (one stream per row); present the
        # single sample as a one-row table instead of letting the call raise.
        streams = dataset if dataset.ndim == 2 else dataset[np.newaxis, :]
        q_est_res, q_est_proc = get_q_ests(streams, stepsize, tau_lst, est_type)

        # TODO: generate charts and tables, e.g.
        #   name = get_name(distro, datasize, stepsize, tau_lst, g_test, s_test)
        #   charts = get_charts(tau_lst, q_batches, q_est_res, q_est_proc, name)
        #   tables = get_tables(tau_lst, q_batches, q_est_res, q_est_proc, name)

<!---
### Always have $q_{k+1} = x$ for each $x$ in the data stream

When $x - q_k > 0$, we have $l(q_k) = \tau(x-q_k)$:
\begin{align}
q_{k+1} & = q_k - \frac{l(q_k)}{l'(q_k)} \\
        & = q_k - \frac{\tau(x-q_k)}{-\tau} \\
        & = q_k - (- x + q_k) \\
        & = x
\end{align}

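The same happens when $x - q_k < 0$: there $l(q_k) = (1-\tau)(q_k - x)$ and $l'(q_k) = 1-\tau$, so again $q_{k+1} = q_k - (q_k - x) = x$.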
-->

In [253]:
# Smoke test: sweep the dataset size on the mixture distribution, generating
# several independent datasets per setting (g_test=True).
quantile_sgd_compare(
    distro_lst=['mix'],
    datasize_lst=[11, 100, 200],
    stepsize_lst=['const'],
    est_lst=['sgd'],
    g_test=True,
    s_test=False,
)

[['mix' '11' 'const' 'sgd']
 ['mix' '100' 'const' 'sgd']
 ['mix' '200' 'const' 'sgd']]
res  (12, 5)
proc (12, 5, 11)
res  (12, 5)
proc (12, 5, 100)
res  (12, 5)
proc (12, 5, 200)
