# Parquet - Read / Write / Metadata

## Installation

You need `pyarrow` (in addition to `pandas`) to run this notebook.

```bash
pip install pyarrow
```

## Reading and Writing Parquet Files

More information:
- [Reading](https://pandas.pydata.org/pandas-docs/version/1.1/reference/api/pandas.read_parquet.html#pandas.read_parquet)
- [Writing](https://pandas.pydata.org/pandas-docs/version/1.1/reference/api/pandas.DataFrame.to_parquet.html)

In [None]:
import pandas as pd

# Tiny two-column example frame that the read/write cells below operate on.
df = pd.DataFrame(
    {
        'col1': [1, 2],
        'col2': [3, 4],
    }
)
df  # last expression of the cell -> shown as its output

In [None]:
# Writing: store the frame as a Parquet file, with compression disabled.
df.to_parquet(
    'df.parquet',
    compression=None,
)

In [None]:
# Reading: load the Parquet file back into a DataFrame (shown as cell output).
pd.read_parquet(
    './df.parquet',
)

In [None]:
# Reading a single column: ask pyarrow for 'col2' only, then convert the
# result to pandas for display.
from pyarrow import parquet

col2_table = parquet.read_table('./df.parquet', columns=['col2'])
col2_table.to_pandas()

## Reading the metadata

In [None]:
from pyarrow import parquet

# Fetch the metadata of the Parquet file written earlier; the following cells
# inspect this object.
meta_data = parquet.read_metadata(
    './df.parquet',
)

In [None]:
# Show the object returned by parquet.read_metadata for df.parquet.
meta_data

In [None]:
# Show the schema attribute of the metadata read from df.parquet.
meta_data.schema

In [None]:
# Show the metadata of df.parquet converted via to_dict().
meta_data.to_dict()

# Encodings

## Run Length Encoding

In [None]:
# Underscores in numeric literals are only visual separators (1_000_000 == 1000000).
# range(1, 1_000_000) has 999_999 elements, so the constant run is that long.
my_data = [1337] * (1_000_000 - 1)

# Two trailing values that differ from the long run.
my_data.extend([42, 42])

df = pd.DataFrame({'numbers': my_data})

In [None]:
# Write the same frame as CSV and as Parquet, both without compression, so the
# file sizes can be compared.
# `index` is documented as a bool; pass False explicitly instead of relying on
# None being falsy to leave the row index out of the CSV.
df.to_csv('run_length_encoding.csv', compression=None, index=False)
df.to_parquet('run_length_encoding.parquet', compression=None)

In [None]:
# Shell out to list the run_length_encoding.* files written above
# (the CSV and the Parquet file) so their sizes can be compared.
!ls -lisah run_length_encoding.*

## Dictionary Encoding

In [None]:
# One string repeated 999_999 times (the length of range(1, 1_000_000)) ...
my_data = ["hello world"] * (1_000_000 - 1)

# ... followed by two occurrences of a second distinct value.
my_data.extend(["foo", "foo"])

df = pd.DataFrame({'strings': my_data})

In [None]:
# Write the same frame as CSV and as Parquet, both without compression, so the
# file sizes can be compared.
# `index` is documented as a bool; pass False explicitly instead of relying on
# None being falsy to leave the row index out of the CSV.
df.to_csv('dictionary_encoding.csv', compression=None, index=False)
df.to_parquet('dictionary_encoding.parquet', compression=None)

In [None]:
# Shell out to list the dictionary_encoding.* files written above
# (the CSV and the Parquet file) so their sizes can be compared.
!ls -lisah dictionary_encoding.*

## Delta Encoding (not supported with pyarrow)

In [None]:
import time

# Current Unix timestamp, truncated to whole seconds.
now_seconds = time.time()
epoch_time = int(now_seconds)

In [None]:
# Show the integer timestamp captured in the previous cell.
epoch_time

In [None]:
# Consecutive integers epoch_time+1 .. epoch_time+999_999, i.e. a column whose
# neighbouring values always differ by exactly 1.
my_data = list(range(epoch_time + 1, epoch_time + 1_000_000))

df = pd.DataFrame({'ts': my_data})

In [None]:
# Write the same frame as CSV and as Parquet, both without compression, so the
# file sizes can be compared.
# `index` is documented as a bool; pass False explicitly instead of relying on
# None being falsy to leave the row index out of the CSV.
df.to_csv('delta_encoding.csv', compression=None, index=False)
df.to_parquet('delta_encoding.parquet', compression=None)

In [None]:
# Shell out to list the delta_encoding.* files written above
# (the CSV and the Parquet file) so their sizes can be compared.
!ls -lisah delta_encoding.*

In [None]:
# Inspect the metadata of the Parquet file written above as a dict.
delta_meta = parquet.read_metadata('./delta_encoding.parquet')
delta_meta.to_dict()

## Compression

- [supported compressions](https://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.DataFrame.to_parquet.html)
- [parquet-format compressions](https://github.com/apache/parquet-format/blob/master/Compression.md)

In [None]:
import random

# Dedicated, seeded generator: the random column (and therefore the file sizes
# compared further down) is identical on every run and on every re-execution
# of this cell, without touching the global `random` state.
SEED = 42
rng = random.Random(SEED)

# range(1, 1_000_000) yields 999_999 values, so every column has that many rows.
N_ROWS = 1_000_000 - 1

col_1 = [1337] * N_ROWS                                 # one constant value
col_2 = list(range(2, N_ROWS + 2))                      # 2 .. 1_000_000, step 1
col_3 = ["test" + str(x) for x in range(1, N_ROWS + 1)] # "test1" .. "test999999"
col_4 = [rng.uniform(-1_000_000, 1_000_000) for _ in range(N_ROWS)]  # random floats

df = pd.DataFrame(data={
    'col_1': col_1,
    'col_2': col_2,
    'col_3': col_3,
    'col_4': col_4,
    })

In [None]:
# Display the four-column frame built in the previous cell.
df

In [None]:
# Write the same frame twice: once uncompressed and once with the SNAPPY codec.
for target, codec in (
    ('no_compression.parquet', None),
    ('snappy_compression.parquet', 'SNAPPY'),
):
    df.to_parquet(target, compression=codec)

In [None]:
# Shell out to list the *_compression.parquet files written above
# (uncompressed and SNAPPY) so their sizes can be compared.
!ls -lisah *_compression.parquet