In [4]:
import numpy as np
import pandas as pd

import pyarrow
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq

## Using pyarrow to load, and store parquet files

#### `Table.from_pydict`: construct `Table` from python objects

In [109]:
# Number of rows in the example table.
n_samples = 10_000_000

# Seed NumPy's global RNG so the two random columns below are reproducible.
np.random.seed(123)
# Build a two-column Table from NumPy arrays: 'x' comes from np.random.random
# and 'y' from np.random.randint(0, 100). 'x' is drawn before 'y', so the order
# of the dict entries determines the generated values.
table = pa.Table.from_pydict({'x': np.random.random(n_samples),
                              'y': np.random.randint(0,100,n_samples)})

In [110]:
table

pyarrow.Table
x: double
y: int64
----
x: [[0.6964691855978616,0.28613933495037946,0.2268514535642031,0.5513147690828912,0.7194689697855631,...,0.2227801784298663,0.35029537651321796,0.4517732802081661,0.13689247607521016,0.444361037023138]]
y: [[99,61,69,94,44,...,22,83,48,57,61]]

#### `pq.write_table`: store a `Table` to disk

In [111]:
# Persist the table as a parquet file. No row_group_size is passed here, so the
# writer's default row-group size applies.
pq.write_table(table, '/tmp/foo.parquet')

#### `pq.ParquetFile`: open a parquet file for reading

In [227]:
pfile = pq.ParquetFile('/tmp/foo.parquet')
pfile

<pyarrow.parquet.core.ParquetFile at 0x7fe221e11ac0>

In [228]:
%timeit pfile = pq.ParquetFile('/tmp/foo.parquet')

84.2 µs ± 506 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


#### `ParquetFile.read`: Load a `Table` from a parquet file.

To read all row groups from a parquet file one can use `ParquetFile.read`

In [263]:
%%time
pfile = pq.ParquetFile('/tmp/foo.parquet')
pfile.read()

CPU times: user 66.3 ms, sys: 63.8 ms, total: 130 ms
Wall time: 89.3 ms


pyarrow.Table
x: double
y: int64
----
x: [[0.6964691855978616,0.28613933495037946,0.2268514535642031,0.5513147690828912,0.7194689697855631,...,0.2227801784298663,0.35029537651321796,0.4517732802081661,0.13689247607521016,0.444361037023138]]
y: [[99,61,69,94,44,...,22,83,48,57,61]]

In [266]:
%%time
df = pd.read_parquet('/tmp/foo.parquet')

CPU times: user 106 ms, sys: 53 ms, total: 159 ms
Wall time: 97.4 ms


pyarrow.Table
x: double
y: int64
----
x: [[0.6964691855978616,0.28613933495037946,0.2268514535642031,0.5513147690828912,0.7194689697855631,...,0.2227801784298663,0.35029537651321796,0.4517732802081661,0.13689247607521016,0.444361037023138]]
y: [[99,61,69,94,44,...,22,83,48,57,61]]

In [243]:
len(pfile.read())

10000000

In [None]:
pfile.read()

#### `ParquetFile.read_row_group`: load a single row group of a parquet file as a `Table`

Because we did not set a `row_group_size` when we stored the `.parquet` file, it took the default value of `1024 * 1024` rows per group.

In [175]:
%timeit row_group = pfile.read_row_group(0)

3.08 ms ± 26.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [171]:
row_group

pyarrow.Table
x: double
y: int64
----
x: [[0.6964691855978616,0.28613933495037946,0.2268514535642031,0.5513147690828912,0.7194689697855631,...,0.2227801784298663,0.35029537651321796,0.4517732802081661,0.13689247607521016,0.444361037023138]]
y: [[99,61,69,94,44,...,22,83,48,57,61]]

In [179]:
row_group = pfile.read_row_group(0)

In [180]:
len(row_group)

1048576

Note that the length of a row group might not be the same as the length of the stored file

In [181]:
len(row_group)

1048576

#### `ParquetFile.read_row_groups([0,1,2,..])`: read a list of groups from a parquet

You can read more than a single group as follows:

In [247]:
# Re-open the file and fetch row groups 0 through 9 in a single call,
# then report how many rows the result holds.
pfile = pq.ParquetFile('/tmp/foo.parquet')
row_groups = pfile.read_row_groups(list(range(10)))
print(row_groups.num_rows)

10000000


To know how many groups a `ParquetFile` has one can use `.num_row_groups`

In [230]:
pfile.num_row_groups

10

In [231]:
len(pfile.read())

10000000

#### `Table.take` method: select row positions of a table

In [141]:
# Assuming you want indices 0,2.  This is just an example
row_group.take([0, 2])

pyarrow.Table
x: double
y: int64
----
x: [[0.6964691855978616,0.2268514535642031]]
y: [[99,69]]

### Comparing slicing time in `pd.DataFrame` vs `Table`

Selecting rows by position is faster on a `Table` than on a `pd.DataFrame`

In [142]:
# 100 random row positions taken from a seeded permutation of all row numbers.
np.random.seed(123)
indices = np.random.permutation(n_samples)[:100]
# pandas copy of the full table, used as the baseline in the timing comparison.
df = table.to_pandas()

In [143]:
%timeit df.iloc[indices]

48.3 µs ± 545 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [144]:
%timeit row_group.take(indices)

14.6 µs ± 51.2 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


Therefore, if you want to load a parquet file, get a slice, and then store the slice to disk, you can do so without the need to build a `pd.DataFrame`

In [234]:
row_group

pyarrow.Table
x: double
y: int64
----
x: [[0.6964691855978616,0.28613933495037946,0.2268514535642031,0.5513147690828912,0.7194689697855631,...,0.4138893037099889,0.431698841312794,0.29363193411649346,0.29617268524932616,0.9376098458389982]]
y: [[99,61,69,94,44,...,22,7,50,88,81]]

⚠ Note that if we add the time needed to get a pandas dataframe, it is more costly to use `.take(indices).to_pandas()` than to slice a pandas dataframe directly

In [235]:
%timeit row_groups.take(indices).to_pandas()

346 µs ± 13.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [238]:
row_groups.take(indices)['x'][0:10].to_numpy()

array([0.15493811, 0.44782737, 0.01141741, 0.27452967, 0.73234615,
       0.96431421, 0.42475137, 0.30092766, 0.39167553, 0.52504695])

In [239]:
row_groups.take(indices)['x'][0:10]

<pyarrow.lib.ChunkedArray object at 0x7fe262feeb30>
[
  [
    0.1549381105527371,
    0.4478273673116778,
    0.011417409860818184,
    0.2745296669043428,
    0.7323461513842601,
    0.9643142106942402,
    0.4247513663397099,
    0.30092765766236307,
    0.39167552623282553,
    0.5250469536801169
  ]
]

### Selecting a set of rows of a DataFrame, store the retrieved rows to disk
Suppose you want to select some rows of a dataframe based on the values of some of its columns.

Consider that doing this sort of  selection only depends on a ver

In [246]:
def filter_with_pandas(parquet_path_input, parquet_path_out, threshold=10):
    """Filter a parquet file by its 'x' column using pandas and store the result.

    Reads ``parquet_path_input`` into a DataFrame, keeps the rows where
    ``x >= threshold``, writes those rows to ``parquet_path_out`` and returns them.

    Parameters
    ----------
    parquet_path_input : str
        Path of the parquet file to read.
    parquet_path_out : str
        Path where the selected rows are written as parquet.
    threshold : float, default 10
        Lower bound (inclusive) applied to column 'x'. The default keeps the
        constant the function was originally written with; pass a smaller value
        if the 'x' values in the file never reach it.

    Returns
    -------
    pd.DataFrame
        The selected rows.
    """
    # Read from the path that was actually passed in (the previous body
    # referenced an undefined name and raised NameError).
    df = pd.read_parquet(parquet_path_input)
    selected = df[df['x'] >= threshold]
    # Store the retrieved rows; the output path was previously never used.
    selected.to_parquet(parquet_path_out)
    return selected

def filter_with_pyarrow(parquet_path, parquet_path_out=None, threshold=0):
    """Filter a parquet file by its 'x' column using pyarrow only.

    Reads ``parquet_path`` as a ``pyarrow.Table``, keeps the rows where
    ``x >= threshold`` and returns them, without building a pandas DataFrame.

    Parameters
    ----------
    parquet_path : str
        Path of the parquet file to read.
    parquet_path_out : str, optional
        When given, the selected rows are also written to this path as parquet.
    threshold : float, default 0
        Lower bound (inclusive) applied to column 'x'. The default keeps the
        constant the function was originally written with.

    Returns
    -------
    pyarrow.Table
        The selected rows.
    """
    # Open the file named by the argument instead of relying on a global `pfile`.
    data = pq.ParquetFile(parquet_path).read()
    # Build the boolean mask with a compute kernel and apply it to the table;
    # the previous body computed a comparison and discarded it.
    keep = pc.greater_equal(data['x'], threshold)
    selected = data.filter(keep)
    if parquet_path_out is not None:
        pq.write_table(selected, parquet_path_out)
    return selected

# Path of the example parquet file, and the path intended for the filtered output.
parquet_file = '/tmp/foo.parquet'
parquet_file_out = '/tmp/foo_out.parquet'

In [None]:
pfile = pq.ParquetFile('/tmp/foo.parquet')
