In [None]:
import polars as pl
import os
import rtsvg
# RACETrack instance used by the rendering cells further down
rt = rtsvg.RACETrack()

# The netflow data is stored as three CSV chunks; read each one and stack them into a single frame
df = pl.concat([
    pl.read_csv(_path_)
    for _path_ in ('../../data/2013_vast_challenge/mc3_netflow/nf/nf-chunk1.csv',
                   '../../data/2013_vast_challenge/mc3_netflow/nf/nf-chunk2.csv',
                   '../../data/2013_vast_challenge/mc3_netflow/nf/nf-chunk3.csv')
])

# Report the combined row count, then show a few random rows as the cell output
print(f'{len(df)=:_}')
df.sample(3)

In [None]:
# Baseline: rtsvg's own xy rendering of the two payload-byte columns.
# Timings: 6.2s | 4.3s | 4.6s | 4.8s | 5.3s on m1 pro 16gb for 46_138_310
rt.xy(
    df,
    x_field='firstSeenSrcPayloadBytes',
    y_field='firstSeenDestPayloadBytes',
)

In [None]:
# Basic version of the rt.xy() call above, written directly in polars.
# Timings: 4.9s | 4.7s
#
# Each field is rescaled from [min, max] onto [0, w] (resp. [0, h]) and cast to an integer
# bin, then rows are counted per (bin_x, bin_y) pair.  Note the maximum value lands in bin
# w (resp. h), so bins run 0..w inclusive.
w, h = 256, 256
x_field = 'firstSeenSrcPayloadBytes'
y_field = 'firstSeenDestPayloadBytes'
df_p = (
    df.with_columns(
        (w * (pl.col(x_field) - pl.col(x_field).min())
           / (pl.col(x_field).max() - pl.col(x_field).min())).cast(pl.Int64).alias('__sx__'),
        (h * (pl.col(y_field) - pl.col(y_field).min())
           / (pl.col(y_field).max() - pl.col(y_field).min())).cast(pl.Int64).alias('__sy__'),
    )
    .group_by(['__sx__', '__sy__'])
    .len()
)

In [None]:
# Same binning, but the column extrema are pulled out as plain Python scalars first.
# Timings: 4.1s | 4.2s | 4.2s ... a little faster than the pure-expression version,
# at the cost of mixing Series operations with expression operations.
# Relies on w, h, x_field and y_field from the earlier cell.
xf_min, yf_min = df[x_field].min(), df[y_field].min()
xf_max, yf_max = df[x_field].max(), df[y_field].max()
xf_diff = xf_max - xf_min
yf_diff = yf_max - yf_min
df_p = (
    df.with_columns(
        (w * (pl.col(x_field) - xf_min) / xf_diff).cast(pl.Int64).alias('__sx__'),
        (h * (pl.col(y_field) - yf_min) / yf_diff).cast(pl.Int64).alias('__sy__'),
    )
    .group_by(['__sx__', '__sy__'])
    .len()
)

In [None]:
# Same pure-expression binning, but run through the lazy API and collected at the end.
# Timings: 1.7s | 1.3s ... much faster than the eager versions.
# Relies on w, h, x_field and y_field from the earlier cell.
df_p = (
    df.lazy()
    .with_columns(
        (w * (pl.col(x_field) - pl.col(x_field).min())
           / (pl.col(x_field).max() - pl.col(x_field).min())).cast(pl.Int64).alias('__sx__'),
        (h * (pl.col(y_field) - pl.col(y_field).min())
           / (pl.col(y_field).max() - pl.col(y_field).min())).cast(pl.Int64).alias('__sy__'),
    )
    .group_by(['__sx__', '__sy__'])
    .len()
    .collect()
)

In [None]:
import re
def makePolarsStringConcat(s):
    """
    Split a template string into a list of polars expressions.

    A placeholder is ``{name}`` where ``name`` is one or more characters drawn from
    letters, digits, ``_``, ``-``, ``.``, space and ``:``.  Each placeholder becomes
    ``pl.col(name)`` and every stretch of text around the placeholders becomes
    ``pl.lit(text)``.

    The returned list always starts and ends with a literal (possibly the empty string)
    and alternates literal / column in between, in left-to-right template order.
    Presumably the list is meant to be handed to a polars string-concatenation call --
    confirm against the caller.

    Parameters
    ----------
    s : str
        Template text, e.g. ``'<rect x="{__sx__}" y="{__sy__}"/>'``.

    Returns
    -------
    list
        Alternating ``pl.lit`` / ``pl.col`` expressions.
    """
    _params_ = []
    i = 0  # index in s just past the previously consumed placeholder
    # finditer yields each placeholder occurrence exactly once, with its position, so a
    # field that appears several times in the template is emitted once per occurrence
    # (searching for the matched text again would record repeated fields multiple times).
    for _found_ in re.finditer(r'{[a-zA-Z0-9_\-. :]+}', s):
        _params_.append(pl.lit(s[i:_found_.start()]))
        _params_.append(pl.col(_found_.group()[1:-1]))  # strip the surrounding braces
        i = _found_.end()
    _params_.append(pl.lit(s[i:]))  # trailing text after the last placeholder
    return _params_


In [None]:
# Most compact way of representing scatterplot elements:
# the 1px x 1px size comes from the shared CSS rule and the fill from the enclosing <g>,
# so each point only needs <rect x=".." y=".."/>.
# The stroke keyword is spelled lowercase ("none"): SVG presentation-attribute keywords
# are case-sensitive, so "None" is not a valid paint value.
_svg_ = '''
<svg x="0" y="0" width="256" height="256" viewBox="0 0 10 10" xmlns="http://www.w3.org/2000/svg">
  <style> .rect-group rect { width:  1px; height: 1px; } </style>
  <rect x="0" y="0" width="256" height="256" fill="#ffffff" stroke="#ffffff" />
  <g class="rect-group" stroke="none" fill="#0000ff">
    <rect x="2" y="2"/>
    <rect x="4" y="3"/>
  </g>
</svg>
'''
rt.tile([_svg_])