### New file

In [1]:
%matplotlib inline

import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell


import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx

In [2]:
distros = ['mix', 'gau 1', 'gau 2', 'exp']
stepsizes = ['const', '2/sqrt_k', '0.002/sqrt_k']

tau_vals = [0.1, 0.3, 0.5, 0.9, 0.99]
N_g = 12 # N_generation
N_s = 10 # N_shuffle

N_q = len(tau_vals)

c_Norm = colors.Normalize(vmin=0, vmax=1)
scalarMap = cmx.ScalarMappable(norm=c_Norm, cmap=plt.get_cmap('cool'))

In [3]:
def get_settings(distro_lst, datasize_lst, stepsize_lst):
    len_lst = [len(distro_lst), len(datasize_lst), len(stepsize_lst)]
    if len_lst.count(1) != len(len_lst)-1: raise Exception("Setting inputs are wrong!")
    
    N_settings = max((len_lst))
    setting_lst = []
    for lst in [distro_lst, datasize_lst, stepsize_lst]:
        if len(lst)==1: 
            lst = lst*N_settings
        setting_lst.append(lst)
    return np.asarray(setting_lst).T, len(stepsize_lst)>1


In [4]:
def get_one_dt(distro, datasize):
#     return np.ones(size)
    if distro == 'gau 1':
        return np.random.normal(2, 18, datasize)
    elif distro == 'gau 2':
        return np.random.normal(0, 0.001, datasize)
    elif distro == 'mix':
        # mean: -1.3
        # std: 30.779035604224564
        # var: 947.3490327261234
        mix_lst = np.zeros(datasize)
        sizes = np.array([0.3, 0.2, 0.1, 0.15, 0.25])
        mixtures = [(2,7), (0,0.7), (36, 26), (5,77), (-77,7)]
        acc_sizes = [sum(sizes[:i+1]) for i in range(len(sizes))]

        for d_idx in range(datasize):
            rdn = np.random.uniform(0,1)
            mix_id = 0
            for m_id in acc_sizes:
                if rdn > m_id:
                    mix_id += 1
                else:break
            data_point = np.random.normal(mixtures[mix_id][0], mixtures[mix_id][1])
            mix_lst[d_idx] = data_point
        return mix_lst
    elif distro == 'exp':
        return np.random.exponential(scale=1, size=datasize)
    else: raise Exception("distribution doesn't work!")

In [5]:
def get_dataset(distro, datasize, g_test):
    if g_test:
        dataset = np.zeros((N_g, datasize))
        for i in range(N_g):
            dataset[i] = get_one_dt(distro, datasize)
    else:
        dataset = get_one_dt(distro, datasize)
    return dataset

In [6]:
def get_q_true(distro, tau_lst):
    if tau_lst == tau_vals:
        if distro=='gau 1':
            return np.asarray([-21.06792817980280840537, 
                              -7.43920922874473411269,
                              2,
                              25.06792817980280840537,
                              43.87426173273513981594])
        elif distro=='gau 2':
            return np.asarray([-0.001281551565544600466965,
                              -5.244005127080407840383E-4,
                              0,
                              0.001281551565544600466965,
                              0.002326347874040841100886])
        elif distro=='mix':
            # sampled from 100000000 datapoints
            return np.asarray([-80.28496182,
                               -29.02324254,
                               -0.36011575,
                               36.69268923,
                               120.7676231])
        elif distro=='exp':
            return np.asarray([0.1053605156578263012275,
                              0.3566749439387323789126,
                              0.6931471805599453094172,
                              2.302585092994045684018,
                              4.605170185988091368036])
    raise Exception('tau_lst should be tau_vals')
    

In [7]:
def get_q_batch(dataset, tau_lst):
    if len(dataset.shape) != 1:
        raise Exception('Dataset for q_batch calculation of wrong shape: ' + str(dataset.shape))
        
    q_batch = np.zeros(len(tau_lst))
    for i, tau in enumerate(tau_lst):
        q_batch[i] = np.percentile(dataset, tau*100)
    return q_batch

In [8]:
def get_q_batches(dataset, tau_lst):
    # g_test = False
    if len(dataset.shape) == 1: 
        return get_q_batch(dataset, tau_lst)
    else:
        q_batches = np.zeros((dataset.shape[0], len(tau_lst)))
        for i in range(q_batches.shape[0]):
            q_batches[i] = get_q_batch(dataset[i], tau_lst)
    return q_batches

In [9]:
def set_stepsize(k, stepsize):
    if stepsize=='const':
        return 1
    elif stepsize=='2/sqrt_k':
        return 2/math.sqrt(k)
    elif stepsize=='0.002/sqrt_k':
        return 0.002/math.sqrt(k)
    raise Exception('stepsize parameter is wrong')

In [10]:
def get_procs(dataset, step_size, tau_lst):
    if len(dataset.shape)!= 1: 
        raise Exception('Dataset for get_procs() of wrong shape:' + str(dataset.shape)+ ', should be 1d array')
        
    procs = np.zeros((len(tau_lst), dataset.shape[0]))
    q = 0
    for idx, tau in enumerate(tau_lst):
        q_sgd_proc = procs[idx]
        # change stepsize
        if step_size != 'frugal':
            for k, x in enumerate(dataset):
                alpha = set_stepsize(k+1, step_size)
                if x > q:
                    q += alpha*tau
                else:
                    q -= alpha*(1-tau)
                q_sgd_proc[k] = q
        
        # frugal
        else:
            rdn_lst = np.random.uniform(0,1, dataset.shape[0])
            for k, x in enumerate(dataset):
                rdn = rdn_lst[k]
                if x > q and rdn > 1-tau:
                    q += 1
                elif x < q and rdn > tau:
                    q -= 1
                q_sgd_proc[k] = q
    return procs

In [11]:
def get_res(procs):
    if len(procs.shape)!=2:raise Exception('Procs of wrong shape:' + str(procs.shape)+ ', should be 2d array')
    return procs[:, -1]

In [12]:
def get_q_ests(dataset, step_size, tau_lst):

    if len(dataset.shape)>2:
        raise Exception('Dataset for q_est calculation of wrong shape:' + str(dataset.shape)+ ', should be 1d or 2d array')
    if len(dataset.shape)==1:
        procs = get_procs(dataset, step_size, tau_lst)
        res = get_res(procs)
    else:
        res = np.zeros((dataset.shape[0], len(tau_lst)))
        procs = np.zeros((dataset.shape[0], len(tau_lst), dataset.shape[1]))
        for idx, dt in enumerate(dataset):
            procs[idx] = get_procs(dt, step_size, tau_lst)
            res[idx] = get_res(procs[idx])
    return res, procs

In [13]:
# dataset = get_dataset('gau 1', 1000, g_test=True) * 50 - 200
# print(dataset.shape)

# # proc1 = get_procs(dataset, 'const', tau_vals)
# ## res1 = get_res(proc1)
# # proc2 = get_procs(dataset, 'frugal', tau_vals)
# # plt.plot(proc1.T)ad
# # plt.plot(proc2.T)
# # plt.show()


# res, proc = get_q_ests(dataset, 'const', tau_vals)
# print(res)
# print(proc)

In [14]:
def get_normalized_e(true, batches, est):
    upper = est - batches
    bottom = true - batches
    return abs(upper/bottom)

### Plots

In [20]:
def quantile_sgd_compare(distro_lst, datasize_lst, stepsize_lst, 
                         g_test=False, s_test=False, tau_lst=tau_vals, 
                        chart_lst = []):
    
    if g_test and s_test: raise Exception("g_test and s_test can't both be true")
    
    # generate different settings
    setting_lst, is_stepsize = get_settings(distro_lst, datasize_lst, stepsize_lst,)
    print (setting_lst)
    # if only stepsize changes, generate dataset and q_batches
    dataset, q_batches = 0, 0
    if is_stepsize:
        dataset = get_dataset(distro_lst[0], datasize_lst[0], g_test)
        q_batches = get_q_batches(dataset, tau_lst)
        
    # for each setting = [distro, datasize, stepsize]
    for idx, setting in enumerate(setting_lst):
        # generate all the data
        distro, datasize, stepsize = setting[0], int(setting[1]), setting[2]
        q_true = get_q_true(distro, tau_lst)
#         print (q_true)
        if not is_stepsize:
            dataset = get_dataset(distro, datasize, g_test)
            q_batches = get_q_batches(dataset, tau_lst)
        if s_test:
            shuffled_dt = np.zeros((N_s, datasize))
            for i in range(N_s):
                np.random.shuffle(dataset)
                shuffled_dt[i] = dataset
            dataset = shuffled_dt
        q_est_res, q_est_proc = get_q_ests(dataset, stepsize, tau_lst)
        print (q_est_proc)
        E = get_normalized_e(q_true, q_batches, q_est_res)
        print(E)
        
#         # generate charts and tables?
#         name = naming(distro, datasize, stepsize, tau_list, g_test, s_test)
#         charts = get_charts(tau_lst, q_batches, q_sgd_res, q_sgd_proc, name)
# #         tables = get_tables(tau_lst, q_batches, q_sgd_res, q_sgd_proc, name)

In [21]:
def sgd_frugal_compare(distro_lst, datasize, tau_lst=tau_vals):
    #distro changes, use the biggest datasize, do not shuffle
#     fig, (ax1,ax2) = plt.subplots(1,2)
#     fig.set_size_inches(12, 7)
    for distro in distro_lst:
        fig, (ax1,ax2) = plt.subplots(1,2)
        fig.set_size_inches(16, 4)
        
        dataset = get_dataset(distro, datasize, False)
        q_batches = get_q_batches(dataset, tau_lst)
        
        sgd_res, sgd_proc = get_q_ests(dataset, 'const', tau_lst)
        ax1.plot(sgd_proc.T, label='sgd')
        
        N_frugal = 10
        frugal_res = np.zeros((N_frugal, len(tau_lst)))
        frugal_proc = np.zeros((N_frugal, len(tau_lst), datasize))
        for i in range(N_frugal):
            frugal_res[i], frugal_proc[i] = get_q_ests(dataset, 'frugal', tau_lst)
            for proc in frugal_proc[i]:
                ax2.plot(proc, label = (str(tau_lst[i])+'quantile') if i==0 else '')
        ax2.legend()
        ax1.set_title('sgd for '+distro)
        ax2.set_title('frugal for '+distro)
        #print chart for everyone
        
# sgd_frugal_compare(distros, 1000)

<!---
### Always have $q_k = x$ for each x in the data stream

When $x - q_k > 0$, we have $l(q_k) = \tau(x-q_k)$:
\begin{align}
q_{k+1} & = q_k - \frac{l(q_k)}{l'(q_k)} \\
        & = q_k - \frac{\tau(x-q_k)}{-\tau} \\
        & = q_k - (- x + q_k) \\
        & = x
\end{align}

Same happens when $x - q_k < 0$
-->

In [22]:
quantile_sgd_compare(distro_lst=['mix'], 
                     datasize_lst=[10, 100, 1000], 
                     stepsize_lst=['const'], 
                     g_test=False,
                     s_test=False)

[['mix' '10' 'const']
 ['mix' '100' 'const']
 ['mix' '1000' 'const']]
[[-9.0000000e-01 -8.0000000e-01 -7.0000000e-01 -6.0000000e-01
  -5.0000000e-01 -1.4000000e+00 -1.3000000e+00 -2.2000000e+00
  -2.1000000e+00 -2.0000000e+00]
 [-2.7000000e+00 -2.4000000e+00 -2.1000000e+00 -1.8000000e+00
  -1.5000000e+00 -2.2000000e+00 -1.9000000e+00 -2.6000000e+00
  -2.3000000e+00 -2.0000000e+00]
 [-2.5000000e+00 -2.0000000e+00 -1.5000000e+00 -1.0000000e+00
  -5.0000000e-01 -1.0000000e+00 -5.0000000e-01 -1.0000000e+00
  -5.0000000e-01 -4.4408921e-16]
 [-1.0000000e-01  8.0000000e-01  1.7000000e+00  2.6000000e+00
   2.5000000e+00  2.4000000e+00  2.3000000e+00  2.2000000e+00
   3.1000000e+00  3.0000000e+00]
 [ 2.9900000e+00  3.9800000e+00  4.9700000e+00  4.9600000e+00
   4.9500000e+00  4.9400000e+00  4.9300000e+00  4.9200000e+00
   5.9100000e+00  5.9000000e+00]]
[7.80547244 1.80448886 0.76455743 0.54355112 0.10394113]
[[ -0.9   -0.8   -0.7   -1.6   -2.5   -2.4   -2.3   -2.2   -2.1   -2.
   -2.9   -2.8   