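## Vaex example

A short tour of vaex, using the example dataset that ships with the library. Project repository: https://github.com/vaexio/vaex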

In [1]:
import vaex
# Open the example dataset provided with vaex; `ds` is the dataset used by the cells below.
ds = vaex.example()

In [2]:
ds  # preview the dataset: the first and last rows are rendered as a table

#,E,FeH,L,Lz,random_index,vx,vy,vz,x,y,z
0,-121238.171875,-2.309227609164518,831.0799560546875,-336.426513671875,1511648,53.276722,288.386047,-95.2649078,-0.777470767,2.10626292,1.93743467
1,-100819.9140625,-1.788735491591229,1435.1839599609375,-828.7567749023438,2728665,252.810791,-69.9498444,-56.3121033,3.77427316,2.23387194,3.76209331
2,-100559.9609375,-0.7618109022478798,1039.2989501953125,920.802490234375,1202632,96.276474,226.440201,-34.7527161,1.3757627,-6.3283844,2.63250017
3,-70174.8515625,-1.5208778422936413,2441.724853515625,1183.5899658203125,1020502,204.968842,-205.679016,-58.9777031,-7.06737804,1.31737781,-6.10543537
4,-144138.75,-2.655341358427361,374.8164367675781,-314.5353088378906,3154816,-311.742371,-238.41217,186.824127,0.243441463,-0.822781682,-0.206593871
...,...,...,...,...,...,...,...,...,...,...,...
329995,-119687.3203125,-1.6499842518381402,746.8833618164062,-508.96484375,1919483,107.432999,-2.13771296,17.5130272,3.76883793,4.66251659,-4.42904139
329996,-68933.8046875,-1.4336036247720836,2395.633056640625,1275.490234375,1064141,32.0,108.089264,179.060638,9.17409325,-8.87091351,-8.61707687
329997,-112580.359375,-1.9306227597361942,1182.436279296875,115.58557891845703,374845,8.46711349,-38.2765236,-127.541473,-1.14041007,-8.4957695,2.25749826
329998,-74862.90625,-1.225019818838568,1324.5926513671875,1057.017333984375,425745,110.221558,-31.3925591,86.2726822,-14.2985935,-5.51750422,-8.65472317


In [3]:
# Filtering is done with a boolean expression; the dataset is not copied.
negative_x_mask = ds.x < 0
ds_negative = ds[negative_x_mask]
# First five rows of the filtered dataset, restricted to the 'x' and 'y' columns (no memory copy!).
ds_negative[:5][['x', 'y']]

#,x,y
0,-0.777471,2.10626
1,-7.06738,1.31738
2,-5.17174,7.82915
3,-15.9539,5.77126
4,-12.3995,13.9182


In [4]:
import numpy as np
# Build an expression for the distance from the origin (nothing is computed yet).
r_squared = ds.x**2 + ds.y**2 + ds.z**2
r = np.sqrt(r_squared)
r  # displaying the expression prints some of its values for convenience

Expression = sqrt((((x ** 2) + (y ** 2)) + (z ** 2)))
Length: 330,000 dtype: float64 (expression)
-------------------------------------------
     0   2.96555
     1   5.77829
     2    6.9908
     3   9.43184
     4  0.882561
      ...       
329995   7.45383
329996   15.3984
329997   8.86425
329998    17.601
329999   14.5402

In [5]:
# Register the expression as a (virtual) column that will be computed on the fly.
ds['r'] = r
# Statistics are calculated the same way on normal and virtual columns.
mean_x = ds.mean(ds.x)
mean_r = ds.mean(ds.r)
mean_x, mean_r

(array(-0.06713149), array(9.40708234))

In [6]:
# Statistics on a regular 1d grid: mean of r binned by x, with shape=32 and limits from -10 to 10.
ds.mean(
    ds.r,
    binby=ds.x,
    shape=32,
    limits=[-10, 10],
)

array([15.01058183, 14.43693006, 13.72923338, 12.90294499, 11.86615103,
       11.03563695, 10.12162553,  9.2969267 ,  8.58250973,  7.86602644,
        7.19568442,  6.55738773,  6.01942499,  5.51462457,  5.15798991,
        4.8274218 ,  4.7346551 ,  5.1343761 ,  5.46017944,  6.02199777,
        6.54132124,  7.27025256,  7.99780777,  8.55188217,  9.30286584,
        9.97067561, 10.81633293, 11.60615795, 12.33813552, 13.10488982,
       13.86868565, 14.60577266])

In [7]:
# Statistics on a regular 2d grid, binned by x and y (shape=32, limits from -10 to 10).
# Only the last expression of a cell is displayed, so each result is bound to a name
# rather than computing the 2d mean and silently throwing it away.
mean_r_xy = ds.mean(ds.r, binby=[ds.x, ds.y], shape=32, limits=[-10, 10])  # 2d mean of r
count_r_xy = ds.count(ds.r, binby=[ds.x, ds.y], shape=32, limits=[-10, 10])  # 2d counts/histogram
count_r_xy  # shown as the cell output; evaluate mean_r_xy in its own cell to inspect the means

array([[22, 33, 37, ..., 58, 38, 45],
       [37, 36, 47, ..., 52, 36, 53],
       [34, 42, 47, ..., 59, 44, 56],
       ...,
       [73, 73, 84, ..., 41, 40, 37],
       [53, 58, 63, ..., 34, 35, 28],
       [51, 32, 46, ..., 47, 33, 36]])