### New file

In [3]:
%matplotlib inline

import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
import os
import matplotlib.colors as mc
import colorsys

In [4]:
# Names of the data distributions and step-size schedules under study.
distros = ["mix", "gau 1", "gau 2", "exp"]
stepsizes = ["const", "2/sqrt_k", "0.002/sqrt_k"]

# Quantile levels (tau) to estimate, and how many of them there are.
tau_vals = [0.1, 0.3, 0.5, 0.9, 0.99]
N_q = len(tau_vals)

# Repetition counts.
N_g = 12  # N_generation: number of independently generated datasets
N_s = 10  # N_shuffle

# Colour lookup: values in [0, 1] are mapped onto the 'cool' colormap.
c_Norm = colors.Normalize(vmin=0, vmax=1)
scalarMap = cmx.ScalarMappable(
    norm=c_Norm,
    cmap=plt.get_cmap("cool"),
)

In [5]:
def get_settings(distro_lst, datasize_lst, stepsize_lst):
    """Expand the option lists into one setting per value of the varying factor.

    Exactly one of the three lists must contain more than one entry (the
    factor being compared); the other two must contain exactly one entry.

    Parameters
    ----------
    distro_lst : list
        Distribution names.
    datasize_lst : list
        Data sizes.
    stepsize_lst : list
        Step-size names.

    Returns
    -------
    setting_lst : list of [distro, datasize, stepsize]
        One entry per value of the varying list, in that list's order.
    is_stepsize : bool
        True when the step size is the factor that varies.

    Raises
    ------
    ValueError
        If the lists do not have exactly one varying factor (this includes
        empty lists and the case where every list has a single entry).
    """
    lengths = (len(distro_lst), len(datasize_lst), len(stepsize_lst))
    n_varying = sum(1 for n in lengths if n > 1)
    if 0 in lengths or n_varying != 1:
        raise ValueError(
            "Setting inputs are wrong! Exactly one of distro_lst, datasize_lst, "
            "stepsize_lst must have more than one entry and the other two "
            "exactly one; got lengths " + str(lengths)
        )

    # Only one list has several entries, so the full product enumerates
    # exactly that list's values with the two fixed ones filled in.
    setting_lst = [
        [distro, datasize, stepsize]
        for distro in distro_lst
        for datasize in datasize_lst
        for stepsize in stepsize_lst
    ]
    return setting_lst, len(stepsize_lst) > 1

In [6]:
def get_one_dt(distro, datasize):
    """Draw one synthetic sample of exactly ``datasize`` values.

    Parameters
    ----------
    distro : str
        Which distribution to sample from:
        'gau 1' -- normal with mean 2 and standard deviation 18;
        'gau 2' -- normal with mean 0 and standard deviation 0.001;
        'mix'   -- shuffled mixture of five normals (see below);
        'exp'   -- exponential with scale 1.
    datasize : int
        Number of values to draw.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``datasize``.

    Raises
    ------
    ValueError
        If ``distro`` is not one of the names above.
    """
    if distro == 'gau 1':
        return np.random.normal(2, 18, datasize)
    if distro == 'gau 2':
        return np.random.normal(0, 0.001, datasize)
    if distro == 'exp':
        return np.random.exponential(scale=1, size=datasize)
    if distro == 'mix':
        # Mixture weights and the (mean, std) of each normal component.
        # Theoretical moments of this mixture:
        #   mean = sum(w * mu)                      = -14.3
        #   var  = sum(w * (sd**2 + mu**2)) - mean**2 ~= 2396.31  (std ~= 48.95)
        weights = np.array([0.3, 0.2, 0.1, 0.15, 0.25])
        components = [(2, 7), (0, 0.7), (36, 26), (5, 77), (-77, 7)]

        # Round the cumulative boundaries rather than truncating each share,
        # and pin the last boundary to datasize: the component counts then
        # always add up to datasize (truncation could drop samples, e.g.
        # datasize=10 used to yield only 9 values).
        bounds = np.rint(np.cumsum(weights) * datasize).astype(int)
        bounds[-1] = datasize
        counts = np.diff(bounds, prepend=0)

        mix_lst = np.concatenate(
            [np.random.normal(mu, sd, n) for (mu, sd), n in zip(components, counts)]
        )
        # Shuffle so the components are interleaved in stream order.
        np.random.shuffle(mix_lst)
        return mix_lst
    raise ValueError("distribution doesn't work! Unknown distro: %r" % (distro,))

In [8]:
def get_dataset(distro, datasize, g_test):
    """Generate the data for one experiment.

    With ``g_test`` falsy a single 1-D sample from ``get_one_dt`` is
    returned.  With ``g_test`` truthy, ``N_g`` independent samples are drawn
    and stacked as the rows of an ``(N_g, datasize)`` array.
    """
    if not g_test:
        return get_one_dt(distro, datasize)

    generations = np.zeros((N_g, datasize))
    for row in range(N_g):
        # Each row is a fresh, independent draw.
        generations[row] = get_one_dt(distro, datasize)
    return generations

In [13]:
def get_q_batch(dataset, tau_lst):
    """Compute the empirical (batch) quantiles of a 1-D sample.

    Parameters
    ----------
    dataset : array_like
        One-dimensional data; lists and tuples are accepted as well as arrays.
    tau_lst : sequence of float
        Quantile levels in [0, 1].

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(tau_lst)``; entry ``i`` is the
        ``tau_lst[i]``-quantile of ``dataset``.

    Raises
    ------
    ValueError
        If ``dataset`` is not one-dimensional.
    """
    dataset = np.asarray(dataset)
    if dataset.ndim != 1:
        raise ValueError('Dataset for q_batch calculation of wrong shape: ' + str(dataset.shape))

    # np.percentile works in percent and accepts all levels in a single call.
    percents = np.asarray(tau_lst, dtype=float) * 100
    return np.asarray(np.percentile(dataset, percents), dtype=float)

# Sanity check: for Uniform(0, 1) data the tau-quantile is tau itself, so the
# result should be close to [0.5, 0.7, 0.9].
get_q_batch(np.random.uniform(0,1, 10000), [0.5, 0.7, 0.9])

array([0.50132702, 0.70339174, 0.89988036])

In [21]:
def get_q_batches(dataset, tau_lst):
    """Batch quantiles for a single dataset or for each row of a stack.

    A 1-D ``dataset`` (the ``g_test = False`` case) is handed straight to
    ``get_q_batch``.  Otherwise the quantiles of every ``dataset[i]`` are
    computed and returned as row ``i`` of a ``(dataset.shape[0], len(tau_lst))``
    array.
    """
    if len(dataset.shape) == 1:
        return get_q_batch(dataset, tau_lst)

    n_rows = dataset.shape[0]
    per_row = np.zeros((n_rows, len(tau_lst)))
    for row_idx in range(n_rows):
        per_row[row_idx] = get_q_batch(dataset[row_idx], tau_lst)
    return per_row

 

In [22]:
def quantile_sgd_compare(distro_lst, datasize_lst, stepsize_lst, 
                         g_test=False, s_test=False, tau_lst=tau_vals):
    """Set up a quantile-SGD comparison across one varying experimental factor.

    Work in progress: the live code only builds the settings and, when the
    step size is the varying factor, generates the shared dataset and its
    batch quantiles.  Nothing is returned yet (implicit ``None``); the
    remaining steps exist only as the commented-out sketch at the end.

    Parameters
    ----------
    distro_lst, datasize_lst, stepsize_lst : list
        Passed to ``get_settings``, which raises unless exactly one of them
        has more than one entry and the other two have exactly one.
    g_test : bool
        Forwarded to ``get_dataset``.
    s_test : bool
        Not used by the live code yet; only referenced in the sketch below.
    tau_lst : list of float
        Quantile levels forwarded to ``get_q_batches``; defaults to the
        module-level ``tau_vals``.
    """
    # generate different settings; is_stepsize is True when the step size is
    # the factor that varies
    setting_lst, is_stepsize = get_settings(distro_lst, datasize_lst, stepsize_lst)
    
    # if only stepsize changes, generate dataset and q_batches once, up front
    # (0 is a placeholder that survives when another factor varies)
    dataset, q_batches = 0, 0
    if is_stepsize:
        dataset = get_dataset(distro_lst[0], datasize_lst[0], g_test)
        q_batches = get_q_batches(dataset, tau_lst)
        
    # NOTE(review): the sketch below is out of sync with the current helpers:
    #   - `setting` is already one [distro, datasize, stepsize] triple, so
    #     `setting[idx][0]` should presumably be `setting[0]` (etc.);
    #   - get_dataset takes (distro, datasize, g_test) and get_q_batches takes
    #     (dataset, tau_lst), but the sketch passes one extra argument to each;
    #   - `tau_list` is not defined here (the parameter is `tau_lst`);
    #   - get_q_sgd, get_name, get_charts and get_tables are not defined in the
    #     part of the notebook shown here.
#     # for each setting = [distro, datasize, stepsize]
#     for idx, setting in enumerate(setting_lst):
#         # generate all the data
#         distro, datasize, stepsize = setting[idx][0], setting[idx][1], setting[idx][2]
#         if not is_stepsize:
#             dataset = get_dataset(distro, datasize, g_test, s_test)
#             q_batches = get_q_batches(dataset, tau_lst, g_test)
#         q_sgd_res, q_sgd_proc = get_q_sgd(dataset, stepsize, s_test, tau_lst)
        
#         # generate charts and tables?
#         name = get_name(distro, datasize, stepsize, tau_list, g_test, s_test)
#         charts = get_charts(tau_lst, q_batches, q_sgd_res, q_sgd_proc, name)
# #         tables = get_tables(tau_lst, q_batches, q_sgd_res, q_sgd_proc, name)

<!---
### The update $q_{k+1} = q_k - l(q_k)/l'(q_k)$ always gives $q_{k+1} = x$ for each $x$ in the data stream

When $x - q_k > 0$, we have $l(q_k) = \tau(x-q_k)$:
\begin{align}
q_{k+1} & = q_k - \frac{l(q_k)}{l'(q_k)} \\
        & = q_k - \frac{\tau(x-q_k)}{-\tau} \\
        & = q_k - (- x + q_k) \\
        & = x
\end{align}

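The same happens when $x - q_k < 0$: assuming the usual pinball loss, $l(q_k) = (1-\tau)(q_k - x)$ and $l'(q_k) = 1-\tau$, so again $q_{k+1} = q_k - (q_k - x) = x$.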
-->

In [24]:
# Smoke test: one distribution and one data size with two step sizes, so the
# step size is the varying factor and the dataset / batch quantiles are
# generated once up front (with g_test=True, one row per generation).
# NOTE(review): 'others' is not one of the names in `stepsizes`; presumably a
# placeholder -- confirm before the step size is actually used.
quantile_sgd_compare(distro_lst=['gau 1'], datasize_lst=[100], stepsize_lst=['const', 'others'], g_test=True)

g_test True
