# Parquet

## Simulate 100M rows of temperature monitoring from three sensors



In [1]:
import glob
import os

import numpy as np
import pandas as pd

import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Fixed seed so every "Restart & Run All" simulates exactly the same readings.
SEED = 42
N_ROWS = 100000000                        # 100M simulated readings per sensor
SENSOR_COLUMNS = ['one', 'two', 'three']  # one column per sensor

# A private RandomState keeps the seeding local instead of touching NumPy's global generator.
rng = np.random.RandomState(SEED)
df = pd.DataFrame(
    data=rng.randint(0, 100, size=(N_ROWS, len(SENSOR_COLUMNS)), dtype=np.uint8),  # integers in [0, 100)
    columns=SENSOR_COLUMNS,
)
df.shape

(100000000, 3)

In [3]:
# Convert the DataFrame into an Arrow table; preserve_index=False leaves the
# pandas index out of the table (the index column can be skipped).
table = pa.Table.from_pandas(
    df,
    preserve_index=False,
)
table

pyarrow.Table
one: uint8
two: uint8
three: uint8
metadata
--------
OrderedDict([(b'pandas',
              b'{"index_columns": [], "column_indexes": [], "columns": [{"na'
              b'me": "one", "field_name": "one", "pandas_type": "uint8", "nu'
              b'mpy_type": "uint8", "metadata": null}, {"name": "two", "fiel'
              b'd_name": "two", "pandas_type": "uint8", "numpy_type": "uint8'
              b'", "metadata": null}, {"name": "three", "field_name": "three'
              b'", "pandas_type": "uint8", "numpy_type": "uint8", "metadata"'
              b': null}], "creator": {"library": "pyarrow", "version": "0.13'
              b'.0"}, "pandas_version": null}')])

In [4]:
%%time
#pq.write_table(table,'example.parquet')
pq.write_to_dataset(table,root_path='dataset.parquet',partition_cols=['one'],flavor='spark')

Wall time: 16 s


In [5]:
# Loading only the columns you need is faster than reading the whole dataset.
wanted_columns = ['two']
table2 = pq.read_table('dataset.parquet/', columns=wanted_columns)
df = table2.to_pandas()
len(df)

100000000

In [8]:
# Inspect one physical file from the one=0 partition.
# The data file names are generated at write time and differ from run to run, so
# look the file up instead of hard-coding its name.
partition_files = sorted(glob.glob('dataset.parquet/one=0/*.parquet'))
if not partition_files:
    raise FileNotFoundError("no parquet files found under 'dataset.parquet/one=0/'")

parquet_file = pq.ParquetFile(partition_files[0])
print(parquet_file.metadata)  # file-level metadata
print(parquet_file.schema)    # Parquet schema of the stored columns

<pyarrow._parquet.FileMetaData object at 0x000002681F9077C8>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 2
  num_rows: 999688
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 582
<pyarrow._parquet.ParquetSchema object at 0x000002681F97AEB8>
two: INT32 UINT_8
three: INT32 UINT_8
 


In [9]:
# Read the row group at index 0 from the opened file and display it as a DataFrame.
first_row_group = parquet_file.read_row_group(0)
first_row_group.to_pandas()

Unnamed: 0,two,three
0,54,40
1,50,55
2,39,41
3,63,32
4,96,7
5,65,91
6,58,75
7,56,70
8,5,20
9,27,77
