# Ch04 4.5.3 Numerical Example

In [278]:
import numpy as np 
import pandas as pd 

import matplotlib as mpl
import matplotlib.pyplot as plt

import datetime as dt
from adv_finance import bars, labeling, utils, sampling

from matplotlib.patches import Rectangle
import  matplotlib.dates as mdates

from tqdm import tqdm
from scipy.sparse import coo_matrix, csr_matrix

In [279]:
def get_ind_matrix(bar_ix, t1):
    """Build the sparse bar-by-label indicator matrix.

    Entry ``[r, i]`` is 1 when bar position ``r`` lies inside the span of the
    ``i``-th label, i.e. ``t0 <= r <= t1`` with both ends inclusive, and 0
    otherwise.

    Parameters
    ----------
    bar_ix : sized
        Bar index. Only its length is used (it fixes the number of rows).
    t1 : pandas.Series
        One entry per label: the index holds the first bar position (``t0``)
        and the value holds the last bar position. Both are applied as integer
        row positions with ordinary slice semantics, so an end beyond the last
        bar is clipped.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(len(bar_ix), len(t1))`` and dtype int8.

    Notes
    -----
    Exceptions raised while reading ``t1`` (for example non-integer
    positions) propagate to the caller, so a half-filled matrix is never
    returned.
    """
    n_row = len(bar_ix)
    n_col = len(t1)
    bar_positions = np.arange(n_row)

    row_chunks = []
    col_chunks = []
    # ``Series.items()`` is used because ``iteritems()`` was removed in pandas 2.0.
    for col, (start, end) in enumerate(t1.items()):
        # Slicing the position vector gives exactly the rows that
        # ``mat[start:end + 1, col]`` would address.
        covered = bar_positions[start:end + 1]
        row_chunks.append(covered)
        col_chunks.append(np.full(len(covered), col, dtype=np.intp))

    if row_chunks:
        rows = np.concatenate(row_chunks)
        cols = np.concatenate(col_chunks)
    else:
        rows = np.empty(0, dtype=np.intp)
        cols = np.empty(0, dtype=np.intp)

    # Assemble from coordinates in one step; assigning into a CSR matrix
    # element by element forces a costly structure change on every write.
    data = np.ones(len(rows), dtype=np.int8)
    return csr_matrix((data, (rows, cols)), shape=(n_row, n_col), dtype=np.int8)


# Original source
def seq_bootstrap_(ind_m, s_length=None):
    """Draw label columns by sequential bootstrap (pandas implementation).

    At every step each candidate column is weighted by the average uniqueness
    it would have if it were added to the columns drawn so far, and one
    column is sampled with probability proportional to that weight.

    Parameters
    ----------
    ind_m : pandas.DataFrame
        Indicator matrix, rows = bars, columns = labels, entries 0/1.
    s_length : int, optional
        Number of draws. Defaults to the number of columns of ``ind_m``.

    Returns
    -------
    list
        Drawn column labels, in draw order (repeats are possible).
    """
    if s_length is None:
        s_length = ind_m.shape[1]

    phi = []
    while len(phi) < s_length:
        # Concurrency every bar would have with one more label on it. Adding
        # 1 to all rows is safe because rows outside a candidate's span are
        # masked out below.
        c = ind_m[phi].sum(axis=1) + 1
        # Divide row-wise explicitly: a plain ``ind_m / c`` would align the
        # Series index with the DataFrame *columns* and give a wrong-sized
        # result that ``np.random.choice`` rejects.
        u = ind_m.div(c, axis=0)
        # Average only over the bars each label actually spans.
        avg_u = u[u > 0].mean().fillna(0)
        prob = (avg_u / avg_u.sum()).values
        phi.append(np.random.choice(ind_m.columns, p=prob))
    return phi


def get_avg_uniqueness_(ind_m, c=None):
    """Average uniqueness of every label column (pandas implementation).

    Parameters
    ----------
    ind_m : pandas.DataFrame
        Indicator matrix, rows = bars, columns = labels.
    c : pandas.Series, optional
        Per-bar concurrency. Defaults to the row sums of ``ind_m``.

    Returns
    -------
    pandas.Series
        One value per column: the mean of ``indicator / concurrency`` over
        the strictly positive entries; columns with no such entry give 0.
    """
    concurrency = ind_m.sum(axis=1) if c is None else c

    # Bars with zero concurrency are discarded before dividing.
    active = concurrency > 0
    uniqueness = ind_m.loc[active].div(concurrency.loc[active], axis=0)

    # Non-positive cells become NaN so the column mean skips them.
    masked = uniqueness[uniqueness > 0]
    return masked.mean().fillna(0)


# @jit(nopython=True)
def get_avg_uniqueness(ind_m, c=None):
    """Average uniqueness of every label column (numpy implementation).

    Parameters
    ----------
    ind_m : numpy.matrix or numpy.ndarray
        Dense 2-D indicator matrix, rows = bars, columns = labels.
    c : array-like, optional
        Per-bar concurrency, one value per row of ``ind_m`` (1-D or a single
        column). Defaults to the row sums of ``ind_m``.

    Returns
    -------
    numpy.matrix or numpy.ndarray
        Per column, the sum of ``indicator / concurrency`` divided by the
        column sum of ``ind_m``. A ``numpy.matrix`` input gives a 1 x n_labels
        matrix; a plain array gives a 1-D array. A column that sums to zero
        yields NaN.
    """
    if c is None:
        c = ind_m.sum(axis=1)

    # Force a column vector so the division is always row-wise, whether the
    # concurrency arrived as 1-D data or as an (n, 1) matrix.
    c = np.asarray(c).reshape(-1, 1)
    # Rows with zero concurrency have all-zero numerators; dividing by 1
    # there keeps them at 0 instead of producing 0/0 = NaN.
    safe_c = np.where(c > 0, c, 1)

    u = ind_m / safe_c
    avg_u = u.sum(axis=0) / ind_m.sum(axis=0)
    return avg_u



In [273]:
# Dense copy of the sparse indicator matrix. NOTE(review): `ind_m` is built by
# a cell that appears further down in this notebook, so that cell must run first.
m = ind_m.todense()
# Keep only column 1 (presumably the state after a single draw, phi = [1]).
m_ = m[:, [1]]
# ind_m_ = ind_m[:, [1]]
# Row sums of the selected column plus one, i.e. the per-bar count if one more
# label were added on every bar.
c = m_.sum(axis=1) + 1


In [281]:
%%timeit
# Time the numpy implementation on the dense matrix `m` with concurrency `c`.
# NOTE: names assigned inside a %%timeit cell are not kept in the notebook
# namespace, so `avg_u` is not (re)bound by running this cell.
avg_u = get_avg_uniqueness(m, c)

22.1 µs ± 53.3 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [282]:
# Display the average uniqueness per column.
# NOTE(review): the %%timeit cell above does not bind `avg_u`; this value must
# come from a separate, plain call to get_avg_uniqueness(m, c) - confirm.
avg_u

matrix([[0.83333333, 0.5       , 1.        ]])

21.8 µs ± 45.7 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [277]:
# Normalise the average uniqueness values so they sum to 1 (draw probabilities).
prob = avg_u / avg_u.sum()
# Convert to a plain array and drop the length-1 axis for display.
np.squeeze(np.asarray(prob))

array([0.35714286, 0.21428571, 0.42857143])

In [150]:
# Three labels encoded as a Series: index = first bar (t0), value = last bar (t1).
t1 = pd.Series([2,3,5], index=[0,2,4])
barIx = range(t1.max()+1) # index of bars
# Sparse indicator matrix with one row per bar and one column per label.
ind_m = get_ind_matrix(barIx, t1)
ind_m

  
3it [00:00, 1341.89it/s]


<6x3 sparse matrix of type '<class 'numpy.int8'>'
	with 7 stored elements in Compressed Sparse Row format>

In [155]:
# Select columns 0 and 1 of the pandas indicator matrix `indM`.
indM_ = indM[[0, 1]]

In [159]:
# Keep the rows of the reduced matrix where `c` is positive.
indM_.loc[c>0]

Unnamed: 0,0,1
0,1,0
1,1,0
2,1,1
3,0,1
4,0,0
5,0,0


In [157]:
# Row sums of the reduced matrix plus one: the per-bar count if one more label
# were added on every bar.
c = indM_.sum(axis=1)+1
c

0    2
1    2
2    3
3    2
4    1
5    1
dtype: int64

In [101]:
# Selecting an empty column list gives a (6, 0) array (see the output below).
# NOTE(review): `[:, []]` together with `.A` suggests `indM` was a scipy sparse
# matrix when this ran, not the DataFrame built elsewhere in this notebook - confirm.
indM[:, []].A

array([], shape=(6, 0), dtype=int8)

In [103]:
# Repeating a column index duplicates that column in the result.
indM[:, [0, 0]].A

array([[1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0]], dtype=int8)

In [72]:
# Dense version of `indM` (presumably a scipy sparse matrix here - confirm).
mat = indM.todense()

In [99]:
# An empty column selection on the dense matrix also gives shape (6, 0).
mat[:, []]

array([], shape=(6, 0), dtype=int8)

## phi = [1]

In [138]:
# Draw probabilities for the next sample, given that column 1 was already drawn.
phi = [1]        # phi = [1] corresponds to column 1, feature 2

# Following statements are taken from the SNIPPET 4.5 - function seqBootstrap(indM,sLength=None)
avg_u = pd.Series()

# NOTE(review): `indM[:, cols]` is matrix-style indexing, while another cell in
# this notebook binds `indM` to a pandas DataFrame - confirm which object is meant.
# NOTE(review): get_avg_uniqueness is defined in this notebook with a required
# second argument `c`, but is called here with one argument - confirm which
# definition was live when this cell produced the output below.
for i in np.arange(indM.shape[1]): 
    indM_ = indM[:, phi+[i]] # reduce indM
    avg_u.loc[i] = get_avg_uniqueness(indM_)

print('Average Uniqueness: \n',avg_u)

# Normalise so the values sum to 1.
prob2 = avg_u/avg_u.sum()
print('Feature draw probabilities: \n', prob2)


Average Uniqueness: 
 0    0.833333
1    0.500000
2    1.000000
dtype: float64
Feature draw probabilities: 
 0    0.357143
1    0.214286
2    0.428571
dtype: float64


## phi = [1, 2]

In [65]:
# Draw probabilities for the next sample, given that columns 1 and 2 were already drawn.
phi = [1,2]        # phi = [1, 2] corresponds to columns 1 and 2, i.e. features 2 and 3

# Following statements are taken from the SNIPPET 4.5 - function seqBootstrap(indM,sLength=None)
avg_u = pd.Series()

# NOTE(review): `indM[:, cols]` is matrix-style indexing, while another cell in
# this notebook binds `indM` to a pandas DataFrame - confirm which object is meant.
# NOTE(review): get_avg_uniqueness is defined in this notebook with a required
# second argument `c`, but is called here with one argument - confirm which
# definition was live when this cell produced the output below.
for i in np.arange(indM.shape[1]): 
    indM_ = indM[:, phi+[i]] # reduce indM
    avg_u.loc[i] = get_avg_uniqueness(indM_)

print('Average Uniqueness: \n',avg_u)

# Normalise so the values sum to 1.
prob2 = avg_u/avg_u.sum()
print('Feature draw probabilities: \n', prob2)


Average Uniqueness: 
 0    0.833333
1    0.500000
2    0.500000
dtype: float64
Feature draw probabilities: 
 0    0.454545
1    0.272727
2    0.272727
dtype: float64


# APPENDIX

In [115]:
def get_ind_matrix_(bar_idx, t1):
    """Build the dense (pandas) bar-by-label indicator matrix.

    Parameters
    ----------
    bar_idx : iterable
        Bar labels; they become the row index of the result.
    t1 : pandas.Series
        One entry per label: the index holds the first bar (``t0``) and the
        value holds the last bar of the label's span.

    Returns
    -------
    pandas.DataFrame
        Rows = bars, columns = ``0 .. len(t1) - 1``. A cell is 1 when the bar
        lies within the label's span (label-based ``.loc`` slice, both ends
        inclusive) and 0 otherwise.
    """
    ind_m = pd.DataFrame(0, index=bar_idx,
                         columns=range(t1.shape[0]))
    # ``Series.items()`` is used because ``iteritems()`` was removed in pandas 2.0.
    for i, (t0_, t1_) in enumerate(t1.items()):
        ind_m.loc[t0_:t1_, i] = 1
    return ind_m


def get_avg_uniqueness_(ind_m, c=None):
    """Average uniqueness of every label column (pandas implementation).

    Parameters
    ----------
    ind_m : pandas.DataFrame
        Indicator matrix, rows = bars, columns = labels.
    c : pandas.Series, optional
        Per-bar concurrency. Defaults to the row sums of ``ind_m``.

    Returns
    -------
    pandas.Series
        One value per column: the mean of ``indicator / concurrency`` over
        the strictly positive entries; columns with no such entry give 0.
    """
    concurrency = ind_m.sum(axis=1) if c is None else c

    # Bars with zero concurrency are discarded before dividing.
    active = concurrency > 0
    uniqueness = ind_m.loc[active].div(concurrency.loc[active], axis=0)

    # Non-positive cells become NaN so the column mean skips them.
    masked = uniqueness[uniqueness > 0]
    return masked.mean().fillna(0)



In [152]:
# Same three-label example, built with the pandas implementation.
t1 = pd.Series([2,3,5], index=[0,2,4]) # t0,t1 for each feature obs
barIx = range(t1.max()+1) # index of bars
# DataFrame indicator matrix: rows = bars, columns = labels.
indM = get_ind_matrix_(barIx, t1)



In [122]:
# Concurrency: number of labels covering each bar (row sums of indM).
c = indM.sum(axis=1)

In [128]:
# Display the per-bar concurrency.
c

0    1
1    1
2    2
3    1
4    1
5    1
dtype: int64

In [129]:
# Keep only bars covered by at least one label (all six bars in this example).
indM.loc[c > 0]

Unnamed: 0,0,1,2
0,1,0,0
1,1,0,0
2,1,1,0
3,0,1,0
4,0,0,1
5,0,0,1


In [126]:
# Uniqueness of each label at each bar: indicator divided row-wise by concurrency.
u = indM.div(c, axis=0)

In [127]:
# Display the uniqueness matrix.
u

Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,0.5,0.5,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,0.0,0.0,1.0


In [131]:
# Mask non-positive entries as NaN so bars outside a label's span are skipped by the mean.
u[u > 0]

Unnamed: 0,0,1,2
0,1.0,,
1,1.0,,
2,0.5,0.5,
3,,1.0,
4,,,1.0
5,,,1.0


In [132]:
# Average uniqueness per label: column mean over the non-masked entries.
u[u > 0].mean()

0    0.833333
1    0.750000
2    1.000000
dtype: float64

In [120]:
# With no columns selected the row sums are all 0, so this gives a vector of
# ones: the value seq_bootstrap_ computes for `c` when phi is still empty.
indM[[]].sum(axis=1) + 1

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
5    1.0
dtype: float64

In [89]:
# Scratch check of the empty-draw case (phi = []).
phi = []
avg_u = pd.Series()

i = 0

# Column sums over an empty column selection give an empty Series (see output).
indM[[]].sum()

# for i in np.arange(indM.shape[1]): 
#     indM_ = indM[:, phi+[i]] # reduce indM
#     avg_u.loc[i] = get_avg_uniqueness(indM_)


Series([], dtype: float64)

In [91]:
# A Series of five ones (scratch check).
pd.Series([1] * 5)

0    1
1    1
2    1
3    1
4    1
dtype: int64

In [None]:
# Dense copy of the sparse indicator matrix `ind_m`.
m = ind_m.todense()
# Keep only column 1 (presumably the state after a single draw, phi = [1]).
m_ = m[:, [1]]
# ind_m_ = ind_m[:, [1]]
# Row sums of the selected column plus one, i.e. the per-bar count if one more
# label were added on every bar.
c = m_.sum(axis=1) + 1


In [None]:
%%timeit
avg_u = get_avg_uniqueness(m0, c)