In [53]:
import polars as pl
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt

In [49]:

import polars as pl 
import numpy as np 


def calc_change_since_pivot(current,last_pivot):
    if(last_pivot == 0): last_pivot = 1 ** (-100) # avoid division by 0
    perc_change_since_pivot = (current - last_pivot) / abs(last_pivot)
    return perc_change_since_pivot

def get_zigzag(idx, row, taip=None):
    return {
        "datetime": row[0],
        "value": row[1],
        "type": taip,
        "idx":idx,
    }

def create_label(
    df,
    threshold = 0.01,
    stop_loss = None,
):
    zigzags = []
    for idx,item in enumerate(df.select(["datetime","close"]).iterrows()):
        is_starting = (idx == 0)
        if is_starting:
            zigzags.append(get_zigzag(idx,item))
            continue  

        is_first_line = (len(zigzags) == 1) 
        if is_first_line:
            perc_change_since_pivot = calc_change_since_pivot(item[-1],zigzags[-1]["value"])
            if abs(perc_change_since_pivot) >= threshold:
                if perc_change_since_pivot > 0:
                    zigzags.append(get_zigzag(idx, item,"Peak"))
                    zigzags[0]["type"] = "Through"
                else:
                    zigzags.append(get_zigzag(idx, item, "Trough"))
                    zigzags[0]["type"] = "Peak" 
            continue 
        is_through = zigzags[-2]["value"] > zigzags[-1]["value"]
        is_ending = (idx == df.shape[0] - 1)
        last_pivot = float(zigzags[-1]["value"])
        # based on last pivot type, look for reversal or continuation
        if(is_through):
            perc_change_since_pivot = calc_change_since_pivot(item[1],zigzags[-1]["value"])
            is_reversing = (perc_change_since_pivot >= threshold) or is_ending
            is_continuing = item[-1] <= last_pivot
            if (is_continuing): 
                zigzags[-1] = get_zigzag(idx,item, "Trough")
            elif (is_reversing): 
                zigzags.append(get_zigzag(idx,item, "Peak"))
        else:
            perc_change_since_pivot = calc_change_since_pivot(item[-1],zigzags[-1]["value"])
            is_reversing = (perc_change_since_pivot <= -threshold) or is_ending
            is_continuing = item[-1] >= last_pivot
            if(is_continuing): 
                zigzags[-1] = get_zigzag(idx,item, "Peak")
            elif (is_reversing): 
                zigzags.append(get_zigzag(idx,item, "Trough"))

    zigzags = pl.DataFrame(zigzags)
    zigzags = zigzags.select([
        pl.all(),
        pl.col("datetime").shift(-1).alias("event_ends"),
        pl.col("value").shift(-1).alias("prevext")
    ])
    assert zigzags.shape[0] >=2 
    df = df.join(zigzags, on = "datetime", how = "left")
    df = df.select(
        [pl.col(item).fill_null(strategy = "forward") if item in ["prevext","event_ends"] else pl.col(item) for item in df.columns]
    )
    df = df.select(
        [pl.all(), (pl.col("prevext")/pl.col("close") - 1.0).alias("label")]
    )
    correct_label = [] 
    if stop_loss:
        total_returns = df.select("label").to_numpy().flatten() 
        return_list = df.select(
            ((pl.col("close").shift(-1) - pl.col("close"))/pl.col("close")).alias("return")
        )
        return_list = return_list.filter(pl.col("return").is_not_null()).to_numpy().flatten() 

        close_array = df.select("close").to_numpy().flatten() 

        for i in range(zigzags.shape[0]-1):
            start_idx = zigzags[i,"idx"]
            end_idx = zigzags[i+1,"idx"]
            for j in range(start_idx,end_idx):
                local_returns = return_list[j:min(end_idx + 1,len(return_list))]
                min_acc = 0 
                if total_returns[j] > 0:
                    min_acc = min((min(close_array[j+1:end_idx+1])-close_array[j])/close_array[j],0)
                else:
                    min_acc = max((max(close_array[j+1:end_idx+1])-close_array[j])/close_array[j],0)
                if total_returns[j] > 0:
                    if min_acc > -stop_loss:
                        correct_label.append(total_returns[j])
                    else:
                        correct_label.append(min_acc)
                else:
                    if min_acc < stop_loss:
                        correct_label.append(total_returns[j])
                    else:
                        correct_label.append(min_acc)
        #replace label of df 
        df = df[:len(correct_label),:]
        df.replace("label",pl.Series(correct_label))

    ## drop the front and the last trend data because zigzag is meanless on these data 
    df = df.filter((pl.col("datetime")>=zigzags[1,"datetime"])&(pl.col("datetime")<=(zigzags[-2,"datetime"])))

    df = df.select(
        [
            pl.col("datetime"),
            pl.col("close"),
            pl.col("datetime").alias("event_starts"),
            pl.col("event_ends"),
            pl.col("label")
        ]
    )

    return df 



In [14]:
filenames = glob.glob("../../catalog/data/genericdata_extended_bar.parquet/*.parquet")

In [45]:
import pyarrow as pa
ExtendedBar_SCHEMA  = pa.schema(
        {
            "bar_type": pa.dictionary(pa.int8(), pa.string()),
            #"instrument_id": pa.dictionary(pa.int64(), pa.string()),
            "open": pa.float64(),
            "high": pa.float64(),
            "low": pa.float64(),
            "close": pa.float64(),
            "volume": pa.float64(),
            "bids_value_level_0": pa.float64(),
            "bids_value_level_1": pa.float64(),
            "bids_value_level_2": pa.float64(),
            "bids_value_level_3": pa.float64(),
            "bids_value_level_4": pa.float64(),
            "asks_value_level_0": pa.float64(),
            "asks_value_level_1": pa.float64(),
            "asks_value_level_2": pa.float64(),
            "asks_value_level_3": pa.float64(),
            "asks_value_level_4": pa.float64(),
            "ts_event": pa.int64(),
            "ts_init": pa.int64(),
        },
    metadata={"type": "ExtendedBar"},
)

In [46]:
df = pl.read_parquet(filenames[0],use_pyarrow=True,pyarrow_options={"schema": ExtendedBar_SCHEMA})

In [47]:
df = df.select(
    [pl.all(),pl.col("ts_event").alias("datetime")]
)

In [51]:
labeled_df = create_label(
    df,
    threshold = 0.01,
    stop_loss = 0.005,
)

In [55]:
labeled_df

datetime,close,event_starts,event_ends,label
i64,f64,i64,i64,f64
1669901437418999808,17349.0,1669901437418999808,1669928551836000000,-0.028209
1669901437421999872,17314.0,1669901437421999872,1669928551836000000,-0.026245
1669901437908999936,17294.0,1669901437908999936,1669928551836000000,-0.025119
1669901438206000128,17291.0,1669901438206000128,1669928551836000000,-0.024949
1669901438608999936,17311.0,1669901438608999936,1669928551836000000,-0.026076
1669901438963000064,17295.5,1669901438963000064,1669928551836000000,-0.025203
1669901439115000064,17290.1,1669901439115000064,1669928551836000000,-0.024899
1669901439832999936,17273.1,1669901439832999936,1669928551836000000,-0.023939
1669901440464000000,17270.0,1669901440464000000,1669928551836000000,-0.023764
1669901440801000192,17259.7,1669901440801000192,1669928551836000000,-0.023181


In [66]:
import polars as pl
import numpy as np 

def drop_rare_labels(events, min_pct=0.05, min_classes=2):
    """
    Recursively drop labels with insufficient samples
    """
    while True:
        counts = events["label"].value_counts()
        counts = counts.with_column((pl.col("label")/pl.col("label").sum()).alias("rate"))
        if counts["rate"].min() > min_pct or counts.shape[0] <= min_classes:
            break
        events = events.filter(
            pl.col("label")!= counts[counts["rate"].arg_min(),"label"]
        )
    return events


def count_events_per_bar(bar_times, event_times):


    event_times = event_times.with_column(
        pl.col("event_ends").fill_null(event_times[-1,"event_ends"])
    )
    event_times_iloc1 = bar_times.select(
        pl.col("datetime").search_sorted(event_times[0,"event_starts"])
    ).to_numpy().flatten()[0]
    event_times_iloc2 = bar_times.select(
        pl.col("datetime").search_sorted(event_times["event_ends"].max())
    ).to_numpy().flatten()[0]
    res = pl.DataFrame(
        {
            "index":bar_times[event_times_iloc1:event_times_iloc2 + 1].to_numpy().flatten(),
            "values": np.zeros(event_times_iloc2-event_times_iloc1 + 1)
        }
    )
    for event_starts,event_ends in event_times.iterrows():
        res = res.with_column(
            pl.when((pl.col("index") >= event_starts) & (pl.col("index") <= event_ends))
            .then(pl.col("values")+1)
            .otherwise(pl.col("values"))
            .alias("values")
        )
    return res


def label_avg_uniqueness(bars, events):

    events_counts = count_events_per_bar(bars.select("datetime"), events)
    events_counts = events_counts.filter(~pl.col("index").is_duplicated())
    events_counts = events_counts.with_column(pl.col("values").fill_null(0))
    res = pl.DataFrame(
        {
            "index":events.select("event_starts").to_numpy().flatten(),
            "values": np.zeros(events.shape[0])
        }
    )
    for event_starts,event_ends in events.iterrows():
        res = res.with_column(
            pl.when((pl.col("index") == event_starts))
            .then((1.0 / events_counts.filter((pl.col("index")>= event_starts)&(pl.col("index")<=event_ends))["values"]).mean())
            .otherwise(pl.col("values"))
            .alias("values")
            )
    return res


def get_event_indicators(bar_times, event_times):
    dict1 = {str(i):np.zeros(bar_times.shape[0]) for i in range(event_times.shape[0])}
    dict1["index"] = bar_times["datetime"]
    res = pl.DataFrame(dict1)
    for i in range(event_times.shape[0]):
        res = res.with_column(
            pl.when((pl.col("index")>= event_times[i,"event_starts"])&(pl.col("index")<=event_times[i,"event_ends"]))
            .then(1)
            .otherwise(0)
            .alias(str(i))
            
        )
    return res

def _get_avg_uniqueness(event_indicators):
    event_indicators = event_indicators.select(pl.all().exclude("index"))
    concurrency = event_indicators.sum(axis=1)
    uniqueness =  event_indicators/concurrency
    uniqueness = uniqueness.fill_nan(0.0)
    avg_uniqueness = uniqueness.select(
        [
            pl.col(item).filter(pl.col(item)>0.0).mean()  for item in uniqueness.columns 
        ]
    )
    return avg_uniqueness


def sample_sequential_bootstrap(event_indicators, size=None):
    event_indicators = event_indicators.select(pl.all().exclude("index"))
    if size is None:
        size = event_indicators.shape[1]
    samples = []
    while len(samples) < size:
        trial_avg_uniq = dict() 
        for event_id in event_indicators.columns:
            new_samples = samples+[event_id]
            trial_event_indicators = event_indicators.select(
                [pl.col(new_samples[i]).alias(str(i)) for i in range(len(new_samples))]
            )
            trial_avg_uniq[event_id]  = _get_avg_uniqueness(trial_event_indicators)[:,-1].to_numpy()[-1]
        trial_avg_uniq_sum = sum(trial_avg_uniq.values())
        probs = [item / trial_avg_uniq_sum for item in trial_avg_uniq.values()]
        samples += [np.random.choice(event_indicators.columns, p=probs)]
    return samples

def _get_return_attributions(event_times, events_counts, bars):
    returns = bars.select([pl.col("Close").log().diff().alias("values"),pl.col("datetime").alias("index")])
    weights = pl.DataFrame(
        {
            "index":event_times.select("event_starts").to_numpy().flatten(),
            "values": np.zeros(event_times.shape[0])
        }
    )
    for event_starts,event_ends in event_times.iterrows():
        return_attributed = returns.filter(
            (pl.col("index")>=event_starts) & (pl.col("index")<=event_ends) 
        )["values"] / events_counts.filter(
            (pl.col("index")>=event_starts) & (pl.col("index")<=event_ends) 
        )["values"]
        weights = weights.with_column(
            pl.when(pl.col("index") == event_starts)
            .then(return_attributed.sum()).otherwise(pl.col("values")).alias("values")
        )
    weights = weights.select(
        [pl.col("index"),pl.col("values").abs()]
    )
    return weights

def compute_weights_by_returns(event_times, events_counts, bars):
    raw_weights = _get_return_attributions(event_times, events_counts, bars)
    norm_weights = raw_weights.select(
        [pl.col("index"),pl.col("values")/pl.col("values").sum()*raw_weights.shape[0]]
    )
    return norm_weights


def apply_time_decay_to_weights(avg_uniqueness, oldest_weight=1.0):
    cum_uniqueness = avg_uniqueness.select(
        [pl.col("index").sort(),pl.col("values").sort_by("index").cumsum()]
    )
    cum_uniqueness_last =  cum_uniqueness[-1,"values"]
    print(cum_uniqueness_last)
    if oldest_weight >= 0:
        slope = (1. - oldest_weight) / cum_uniqueness_last
    else:
        slope = 1. / ((oldest_weight + 1) * cum_uniqueness_last)
    const = 1. - slope * cum_uniqueness_last
    weights = cum_uniqueness.select([pl.col("index"),pl.col("values")*slope + const])
    weights = weights.select([pl.col("index"),pl.when(pl.col("values")>0).then(pl.col("values")).otherwise(0.0)])
    return weights




In [85]:
from joblib import Parallel, delayed
from tqdm import tqdm 
def get_event_indicators(bar_times, event_times, njobs = 4):
    res = bar_times.select(pl.col("datetime").alias("index"))
    params = [{"res":res,
               "event_starts":event_times[i,"event_starts"],
               "event_ends":event_times[i,"event_ends"],
              "i":i} for i in range(event_times.shape[0])]
    indicators = Parallel(n_jobs=njobs)(delayed(_get_event_indicator)(param) for param in tqdm(params))
    return pl.concat(indicators, how="horizontal")
    
def _get_event_indicator(params):
    return  params["res"].select(
            pl.when((pl.col("index")>= params["event_starts"])&(pl.col("index")<= params["event_ends"]))
            .then(1)
            .otherwise(0)
            .alias(str(params["i"]))
        )

In [345]:
%%time
np.random.seed(42)
events = labeled_df.select([pl.col("event_starts"),pl.col("event_ends")])
bars = labeled_df.select([pl.col("datetime"),pl.col("close")])

events_indicators = get_event_indicators(bars,events)
#sample = sample_sequential_bootstrap(events_indicators, 100)

100%|██████████████████████████████████████████████████| 13168/13168 [00:13<00:00, 942.12it/s]


CPU times: user 6.12 s, sys: 5.41 s, total: 11.5 s
Wall time: 17.2 s


In [295]:
events_indicators

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,...,13131,13132,13133,13134,13135,13136,13137,13138,13139,13140,13141,13142,13143,13144,13145,13146,13147,13148,13149,13150,13151,13152,13153,13154,13155,13156,13157,13158,13159,13160,13161,13162,13163,13164,13165,13166,13167
i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,...,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [398]:
def _get_avg_uniqueness(event_indicators):
    print(event_indicators)
    uniqueness = event_indicators.select(
        (pl.col("new") / (pl.col("sum")+pl.col("new"))).alias("value")
    )
    print(uniqueness)
    avg_uniqueness = uniqueness.filter(pl.col("value")>0.0).mean()[0,0]
    print("avg_uniquess")
    print(avg_uniqueness)
    return avg_uniqueness


def sample_sequential_bootstrap(event_indicators, size=None):
    if size is None:
        size = event_indicators.shape[1]
    if size > event_indicators.shape[1]:
        size = event_indicators.shape[1]
        
    samples_sum = pl.DataFrame({"sum":np.zeros(event_indicators.shape[0])})
    sample_columns = [] 
    
    while len(sample_columns) < size:
        trial_uniqueness = pl.concat([event_indicators,samples_sum], how="horizontal")
        trial_uniqueness = trial_uniqueness.select([
            (pl.col(item)/(pl.col(item)+pl.col("sum"))).alias(item) for item in event_indicators.columns
        ])
        avg_uniqueness = trial_uniqueness.select(
            [
                pl.col(item).filter(pl.col(item)>0.0).mean()  for item in event_indicators.columns
            ]
        )
        ##不放回采样的独立性更高
        avg_uniqueness = avg_uniqueness.select([
            pl.when(item in sample_columns).then(0.0).otherwise(pl.col(item)).alias(item) for item in event_indicators.columns
        ])
        probs = (avg_uniqueness/avg_uniqueness.sum(axis=1)).row(0)
        idxs = [np.random.choice(event_indicators.columns, p=probs)]
        sample_columns += idxs
        samples_sum = pl.concat([samples_sum,event_indicators.select(idxs)], how="horizontal")
        samples_sum = samples_sum.sum(axis=1).to_frame(name="sum")
    return sorted([int(item) for item in sample_columns])

In [399]:
!export POLARS_MAX_THREADS=6

In [400]:
%%time
sample_sequential_bootstrap(events_indicators, size=100)

CPU times: user 5min 38s, sys: 8min 47s, total: 14min 26s
Wall time: 6min 50s


[139,
 203,
 300,
 317,
 351,
 389,
 739,
 753,
 797,
 839,
 1011,
 1208,
 1223,
 1226,
 1310,
 1407,
 1463,
 1482,
 1494,
 1576,
 1679,
 1908,
 1921,
 2029,
 2103,
 2148,
 2177,
 2234,
 2506,
 2800,
 2852,
 2935,
 2974,
 3120,
 3153,
 3438,
 3773,
 3896,
 4052,
 4164,
 4380,
 4391,
 4711,
 4712,
 4757,
 4882,
 5273,
 5652,
 5953,
 6139,
 6189,
 6381,
 6503,
 6627,
 6908,
 6934,
 6999,
 7005,
 7058,
 7208,
 7287,
 7347,
 7464,
 7606,
 7684,
 7685,
 7790,
 7838,
 7857,
 7866,
 7891,
 8070,
 8087,
 8157,
 8161,
 8360,
 8415,
 8638,
 8671,
 8748,
 9044,
 9216,
 9319,
 9519,
 9526,
 9836,
 10081,
 10167,
 10420,
 10748,
 10789,
 10880,
 11022,
 11512,
 12450,
 12537,
 12608,
 12631,
 12943,
 12969]

In [None]:
!export POLARS_MAX_THREADS=8

In [397]:
%%time
sample_sequential_bootstrap(events_indicators, size=100)

CPU times: user 5min 35s, sys: 8min 27s, total: 14min 3s
Wall time: 6min 42s


[77,
 86,
 203,
 294,
 415,
 717,
 1179,
 1183,
 1299,
 1405,
 1572,
 1584,
 1784,
 1846,
 2012,
 2089,
 2383,
 2730,
 2736,
 2899,
 2946,
 3006,
 3010,
 3028,
 3175,
 3179,
 3212,
 3453,
 3545,
 3897,
 3939,
 3957,
 4143,
 4462,
 4517,
 4872,
 5005,
 5023,
 5588,
 5901,
 5957,
 6147,
 6285,
 6308,
 6618,
 6710,
 6718,
 6857,
 6974,
 6989,
 7017,
 7023,
 7067,
 7279,
 7316,
 7348,
 7401,
 7432,
 7533,
 7679,
 7763,
 7809,
 7817,
 7994,
 7999,
 8055,
 8299,
 8454,
 8480,
 8516,
 8649,
 8758,
 8930,
 8941,
 9162,
 9200,
 9235,
 9239,
 9647,
 9839,
 9855,
 10019,
 10191,
 10254,
 10375,
 10498,
 10817,
 10949,
 10974,
 11277,
 11290,
 11624,
 11783,
 11871,
 12548,
 12715,
 12831,
 12897,
 13045,
 13085]

In [324]:
select_indicators = pl.DataFrame(np.array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 1., 0., 0.],
       [0., 0., 0., 0., 1., 1., 1., 0.],
       [0., 0., 0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1.]]))

In [325]:
select_indicators = select_indicators.select([
    pl.col(select_indicators.columns[i]).alias(str(i)) for i in range(len(select_indicators.columns))])

In [354]:
w = pl.DataFrame([np.random.choice([0.0,1.0]) for i in range(22)])

In [380]:
d1 = {item:np.random.choice([0.0,1.0]) for item in select_indicators.columns}

In [381]:
d1.values()

dict_values([0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0])

In [386]:
w = pl.Series([0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0])

0.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0


In [378]:
w=pl.DataFrame({item: np.random.choice([0.0,1.0]) for select_indicators.columns})

SyntaxError: invalid syntax (1527010263.py, line 1)

In [392]:
select_indicators.mean().to_numpy()[0]

array([0.13636364, 0.13636364, 0.13636364, 0.13636364, 0.13636364,
       0.13636364, 0.13636364, 0.13636364])

In [344]:
%%time
sample_sequential_bootstrap(select_indicators, size=4)

(0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125)
(0.06976744186046512, 0.09302325581395349, 0.13953488372093023, 0.13953488372093023, 0.13953488372093023, 0.13953488372093023, 0.13953488372093023, 0.13953488372093023)
(0.04838709677419354, 0.08064516129032256, 0.14516129032258063, 0.14516129032258063, 0.14516129032258063, 0.14516129032258063, 0.14516129032258063, 0.14516129032258063)
(0.037037037037037035, 0.07407407407407407, 0.14814814814814814, 0.14814814814814814, 0.14814814814814814, 0.14814814814814814, 0.14814814814814814, 0.14814814814814814)
CPU times: user 0 ns, sys: 15.6 ms, total: 15.6 ms
Wall time: 10.2 ms


[0, 0, 0, 0]

In [339]:
%%time
sample_sequential_bootstrap(select_indicators, size=4)

(0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125)
(0.06976744186046512, 0.09302325581395349, 0.13953488372093023, 0.13953488372093023, 0.13953488372093023, 0.13953488372093023, 0.13953488372093023, 0.13953488372093023)
(0.04838709677419354, 0.08064516129032256, 0.14516129032258063, 0.14516129032258063, 0.14516129032258063, 0.14516129032258063, 0.14516129032258063, 0.14516129032258063)
(0.037037037037037035, 0.07407407407407407, 0.14814814814814814, 0.14814814814814814, 0.14814814814814814, 0.14814814814814814, 0.14814814814814814, 0.14814814814814814)
CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 10.9 ms


[0, 0, 0, 0]

In [165]:
pl.concat([x,w],how="horizontal")

AttributeError: 'Series' object has no attribute '_df'

In [169]:
w.to_frame(name="sum")

sum
i64
2
6


In [123]:
    event_indicators = pd.DataFrame([[2,-1],[3,-4]])
    concurrency = event_indicators.sum(axis=1)
    uniqueness = event_indicators.div(concurrency, axis=0)

In [124]:
 event_indicators

Unnamed: 0,0,1
0,2,-1
1,3,-4


In [125]:
  concurrency 

0    1
1   -1
dtype: int64

In [126]:
  uniqueness

Unnamed: 0,0,1
0,2.0,-1.0
1,-3.0,4.0
